xref: /titanic_44/usr/src/uts/common/disp/fx.c (revision 1db2880b3a411e3c56e50c7dc42d3b137fcc4e48)
1  /*
2   * CDDL HEADER START
3   *
4   * The contents of this file are subject to the terms of the
5   * Common Development and Distribution License (the "License").
6   * You may not use this file except in compliance with the License.
7   *
8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9   * or http://www.opensolaris.org/os/licensing.
10   * See the License for the specific language governing permissions
11   * and limitations under the License.
12   *
13   * When distributing Covered Code, include this CDDL HEADER in each
14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15   * If applicable, add the following below this CDDL HEADER, with the
16   * fields enclosed by brackets "[]" replaced with your own identifying
17   * information: Portions Copyright [yyyy] [name of copyright owner]
18   *
19   * CDDL HEADER END
20   */
21  
22  /*
23   * Copyright (c) 1994, 2010, Oracle and/or its affiliates. All rights reserved.
24   * Copyright 2013, Joyent, Inc. All rights reserved.
25   */
26  
27  #include <sys/types.h>
28  #include <sys/param.h>
29  #include <sys/sysmacros.h>
30  #include <sys/cred.h>
31  #include <sys/proc.h>
32  #include <sys/session.h>
33  #include <sys/strsubr.h>
34  #include <sys/user.h>
35  #include <sys/priocntl.h>
36  #include <sys/class.h>
37  #include <sys/disp.h>
38  #include <sys/procset.h>
39  #include <sys/debug.h>
40  #include <sys/kmem.h>
41  #include <sys/errno.h>
42  #include <sys/fx.h>
43  #include <sys/fxpriocntl.h>
44  #include <sys/cpuvar.h>
45  #include <sys/systm.h>
46  #include <sys/vtrace.h>
47  #include <sys/schedctl.h>
48  #include <sys/tnf_probe.h>
49  #include <sys/sunddi.h>
50  #include <sys/spl.h>
51  #include <sys/modctl.h>
52  #include <sys/policy.h>
53  #include <sys/sdt.h>
54  #include <sys/cpupart.h>
55  #include <sys/cpucaps.h>
56  
/* Class initialization routine; defined later in this file. */
static pri_t fx_init(id_t, int, classfuncs_t **);

/*
 * Scheduling class switch entry for this module: the class name "FX",
 * the class initialization routine, and a trailing 0.
 * NOTE(review): the meaning of each positional field comes from
 * struct sclass, which is declared outside this file -- confirm there.
 */
static struct sclass csw = {
	"FX",
	fx_init,
	0
};

/* Scheduling-class linkage element; refers to csw above. */
static struct modlsched modlsched = {
	&mod_schedops, "Fixed priority sched class", &csw
};

/* Module linkage handed to mod_install() and mod_info() by _init()/_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, (void *)&modlsched, NULL
};
72  
73  
/*
 * Control flags (kparms->fx_cflags).  Each bit says which member of the
 * kernel parameter buffer the caller actually wants applied.
 */
#define	FX_DOUPRILIM	0x01    /* change user priority limit */
#define	FX_DOUPRI	0x02    /* change user priority */
#define	FX_DOTQ		0x04    /* change FX time quantum */


#define	FXMAXUPRI 60		/* maximum user priority setting */

#define	FX_MAX_UNPRIV_PRI	0	/* maximum unprivileged priority */
85  
/*
 * The fxproc_t structures that have a registered callback vector
 * are also kept in an array of circular doubly linked lists. A hash on
 * the thread id (from ddi_get_kt_did()) is used to determine the list
 * in which each such fxproc structure is placed. Each list has a dummy
 * "head" which is never removed, so the list is never empty.
 */
93  
#define	FX_CB_LISTS 16		/* number of lists, must be power of 2 */
/*
 * Map a thread id to a callback list index in [0, FX_CB_LISTS).  The
 * argument is parenthesized so that an expression argument is cast and
 * masked as a whole rather than being split by operator precedence.
 */
#define	FX_CB_LIST_HASH(ktid)	((uint_t)(ktid) & (FX_CB_LISTS - 1))
96  
/*
 * Insert fxproc into callback list.
 *
 * The entry is linked in directly after the dummy head of the list chosen
 * by hashing fxpp->fx_ktid, while holding that list's lock.  The macro
 * expands to a brace-enclosed block, so it must not be used as the
 * unbraced body of an "if" that has an "else".
 */
#define	FX_CB_LIST_INSERT(fxpp)						\
{									\
	int index = FX_CB_LIST_HASH((fxpp)->fx_ktid);			\
	kmutex_t *lockp = &fx_cb_list_lock[index];			\
	fxproc_t *headp = &fx_cb_plisthead[index];			\
	mutex_enter(lockp);						\
	(fxpp)->fx_cb_next = headp->fx_cb_next;				\
	(fxpp)->fx_cb_prev = headp;					\
	headp->fx_cb_next->fx_cb_prev = (fxpp);				\
	headp->fx_cb_next = (fxpp);					\
	mutex_exit(lockp);						\
}
110  
/*
 * Remove thread from callback list.
 *
 * The entry is unlinked from its neighbours while holding the lock of the
 * list chosen by hashing fxpp->fx_ktid.  The entry's own link fields are
 * left untouched.  The macro expands to a brace-enclosed block, so it must
 * not be used as the unbraced body of an "if" that has an "else".
 */
#define	FX_CB_LIST_DELETE(fxpp)						\
{									\
	int index = FX_CB_LIST_HASH((fxpp)->fx_ktid);			\
	kmutex_t *lockp = &fx_cb_list_lock[index];			\
	mutex_enter(lockp);						\
	(fxpp)->fx_cb_prev->fx_cb_next = (fxpp)->fx_cb_next;		\
	(fxpp)->fx_cb_next->fx_cb_prev = (fxpp)->fx_cb_prev;		\
	mutex_exit(lockp);						\
}
123  
/* True when the fxproc has a callback vector registered. */
#define	FX_HAS_CB(fxpp)	((fxpp)->fx_callback != NULL)
125  
/*
 * Clamp pri to the range [0, fx_maxumdpri].  pri must be a modifiable
 * lvalue.  The macro expands to a brace-enclosed block, so it must not be
 * used as the unbraced body of an "if" that has an "else".
 */

#define	FX_ADJUST_PRI(pri)						\
{									\
	if ((pri) < 0)							\
		(pri) = 0;						\
	else if ((pri) > fx_maxumdpri)					\
		(pri) = fx_maxumdpri;					\
}
135  
/*
 * Normalize a time quantum in place: values above INT_MAX are capped at
 * INT_MAX, and zero or negative values become FX_TQINF.  q must be a
 * modifiable lvalue.  The macro expands to a brace-enclosed block, so it
 * must not be used as the unbraced body of an "if" that has an "else".
 */
#define	FX_ADJUST_QUANTUM(q)						\
{									\
	if ((q) > INT_MAX)						\
		(q) = INT_MAX;						\
	else if ((q) <= 0)						\
		(q) = FX_TQINF;						\
}
143  
/*
 * Non-zero when a (priority, quantum) pair is acceptable: the priority
 * must be non-negative or FX_CB_NOCHANGE, and the quantum must be
 * non-negative or one of FX_NOCHANGE, FX_TQDEF, FX_TQINF.  Arguments are
 * parenthesized so that expression arguments are evaluated as a unit.
 */
#define	FX_ISVALID(pri, quantum) \
	((((pri) >= 0) || ((pri) == FX_CB_NOCHANGE)) &&			\
	    (((quantum) >= 0) || ((quantum) == FX_NOCHANGE) ||		\
		((quantum) == FX_TQDEF) || ((quantum) == FX_TQINF)))
148  
149  
/* Both of these are filled in by fx_init(). */
static id_t	fx_cid;		/* fixed priority class ID */
static fxdpent_t *fx_dptbl;	/* fixed priority disp parameter table */

/* Maximum user priority; reported by fx_getclinfo() and fx_getclpri(). */
static pri_t	fx_maxupri = FXMAXUPRI;
static pri_t	fx_maxumdpri;	/* max user mode fixed priority */

static pri_t	fx_maxglobpri;	/* maximum global priority used by fx class */
static kmutex_t	fx_dptblock;	/* protects fixed priority dispatch table */


/* One lock and one dummy list head per callback hash list. */
static kmutex_t	fx_cb_list_lock[FX_CB_LISTS];	/* protects list of fxprocs */
						/* that have callbacks */
static fxproc_t	fx_cb_plisthead[FX_CB_LISTS];	/* dummy fxproc at head of */
						/* list of fxprocs with */
						/* callbacks */
165  
/*
 * Forward declarations for the routines installed in fx_classfuncs.
 */
static int	fx_admin(caddr_t, cred_t *);
static int	fx_getclinfo(void *);
static int	fx_parmsin(void *);
static int	fx_parmsout(void *, pc_vaparms_t *);
static int	fx_vaparmsin(void *, pc_vaparms_t *);
static int	fx_vaparmsout(void *, pc_vaparms_t *);
static int	fx_getclpri(pcpri_t *);
static int	fx_alloc(void **, int);
static void	fx_free(void *);
static int	fx_enterclass(kthread_t *, id_t, void *, cred_t *, void *);
static void	fx_exitclass(void *);
static int	fx_canexit(kthread_t *, cred_t *);
static int	fx_fork(kthread_t *, kthread_t *, void *);
static void	fx_forkret(kthread_t *, kthread_t *);
static void	fx_parmsget(kthread_t *, void *);
static int	fx_parmsset(kthread_t *, void *, id_t, cred_t *);
static void	fx_stop(kthread_t *, int, int);
static void	fx_exit(kthread_t *);
static pri_t	fx_swapin(kthread_t *, int);
static pri_t	fx_swapout(kthread_t *, int);
static void	fx_trapret(kthread_t *);
static void	fx_preempt(kthread_t *);
static void	fx_setrun(kthread_t *);
static void	fx_sleep(kthread_t *);
static void	fx_tick(kthread_t *);
static void	fx_wakeup(kthread_t *);
static int	fx_donice(kthread_t *, cred_t *, int, int *);
static int	fx_doprio(kthread_t *, cred_t *, int, int *);
static pri_t	fx_globpri(kthread_t *);
static void	fx_yield(kthread_t *);
/* Deliberately declared without a parameter list; see fx_classfuncs. */
static void	fx_nullsys();

/* Provided outside this file; fx_init() uses it to obtain fx_dptbl. */
extern fxdpent_t *fx_getdptbl(void);

/* Internal helpers that are not part of the class function table. */
static void	fx_change_priority(kthread_t *, fxproc_t *);
static fxproc_t *fx_list_lookup(kt_did_t);
static void fx_list_release(fxproc_t *);
203  
204  
/*
 * Class function table handed back to the caller of fx_init().
 * The initializer is positional, so the order of the entries must match
 * the member order of struct classfuncs.
 * NOTE(review): struct classfuncs is declared outside this file -- confirm
 * the slot order there before adding or moving entries.
 */
static struct classfuncs fx_classfuncs = {
	/* class functions */
	fx_admin,
	fx_getclinfo,
	fx_parmsin,
	fx_parmsout,
	fx_vaparmsin,
	fx_vaparmsout,
	fx_getclpri,
	fx_alloc,
	fx_free,

	/* thread functions */
	fx_enterclass,
	fx_exitclass,
	fx_canexit,
	fx_fork,
	fx_forkret,
	fx_parmsget,
	fx_parmsset,
	fx_stop,
	fx_exit,
	fx_nullsys,	/* active */
	fx_nullsys,	/* inactive */
	fx_swapin,
	fx_swapout,
	fx_trapret,
	fx_preempt,
	fx_setrun,
	fx_sleep,
	fx_tick,
	fx_wakeup,
	fx_donice,
	fx_globpri,
	fx_nullsys,	/* set_process_group */
	fx_yield,
	fx_doprio,
};
243  
244  
245  int
246  _init()
247  {
248  	return (mod_install(&modlinkage));
249  }
250  
/*
 * Module unload entry point.  Always returns EBUSY, i.e. the request to
 * unload is always refused.
 */
int
_fini(void)
{
	return (EBUSY);
}
256  
257  int
258  _info(struct modinfo *modinfop)
259  {
260  	return (mod_info(&modlinkage, modinfop));
261  }
262  
263  /*
264   * Fixed priority class initialization. Called by dispinit() at boot time.
265   * We can ignore the clparmsz argument since we know that the smallest
266   * possible parameter buffer is big enough for us.
267   */
268  /* ARGSUSED */
269  static pri_t
270  fx_init(id_t cid, int clparmsz, classfuncs_t **clfuncspp)
271  {
272  	int i;
273  	extern pri_t fx_getmaxumdpri(void);
274  
275  	fx_dptbl = fx_getdptbl();
276  	fx_maxumdpri = fx_getmaxumdpri();
277  	fx_maxglobpri = fx_dptbl[fx_maxumdpri].fx_globpri;
278  
279  	fx_cid = cid;		/* Record our class ID */
280  
281  	/*
282  	 * Initialize the hash table for fxprocs with callbacks
283  	 */
284  	for (i = 0; i < FX_CB_LISTS; i++) {
285  		fx_cb_plisthead[i].fx_cb_next = fx_cb_plisthead[i].fx_cb_prev =
286  		    &fx_cb_plisthead[i];
287  	}
288  
289  	/*
290  	 * We're required to return a pointer to our classfuncs
291  	 * structure and the highest global priority value we use.
292  	 */
293  	*clfuncspp = &fx_classfuncs;
294  	return (fx_maxglobpri);
295  }
296  
297  /*
298   * Get or reset the fx_dptbl values per the user's request.
299   */
300  static int
301  fx_admin(caddr_t uaddr, cred_t *reqpcredp)
302  {
303  	fxadmin_t	fxadmin;
304  	fxdpent_t	*tmpdpp;
305  	int		userdpsz;
306  	int		i;
307  	size_t		fxdpsz;
308  
309  	if (get_udatamodel() == DATAMODEL_NATIVE) {
310  		if (copyin(uaddr, &fxadmin, sizeof (fxadmin_t)))
311  			return (EFAULT);
312  	}
313  #ifdef _SYSCALL32_IMPL
314  	else {
315  		/* get fxadmin struct from ILP32 caller */
316  		fxadmin32_t fxadmin32;
317  		if (copyin(uaddr, &fxadmin32, sizeof (fxadmin32_t)))
318  			return (EFAULT);
319  		fxadmin.fx_dpents =
320  		    (struct fxdpent *)(uintptr_t)fxadmin32.fx_dpents;
321  		fxadmin.fx_ndpents = fxadmin32.fx_ndpents;
322  		fxadmin.fx_cmd = fxadmin32.fx_cmd;
323  	}
324  #endif /* _SYSCALL32_IMPL */
325  
326  	fxdpsz = (fx_maxumdpri + 1) * sizeof (fxdpent_t);
327  
328  	switch (fxadmin.fx_cmd) {
329  	case FX_GETDPSIZE:
330  		fxadmin.fx_ndpents = fx_maxumdpri + 1;
331  
332  		if (get_udatamodel() == DATAMODEL_NATIVE) {
333  			if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
334  				return (EFAULT);
335  		}
336  #ifdef _SYSCALL32_IMPL
337  		else {
338  			/* return fxadmin struct to ILP32 caller */
339  			fxadmin32_t fxadmin32;
340  			fxadmin32.fx_dpents =
341  			    (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
342  			fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
343  			fxadmin32.fx_cmd = fxadmin.fx_cmd;
344  			if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
345  				return (EFAULT);
346  		}
347  #endif /* _SYSCALL32_IMPL */
348  		break;
349  
350  	case FX_GETDPTBL:
351  		userdpsz = MIN(fxadmin.fx_ndpents * sizeof (fxdpent_t),
352  		    fxdpsz);
353  		if (copyout(fx_dptbl, fxadmin.fx_dpents, userdpsz))
354  			return (EFAULT);
355  
356  		fxadmin.fx_ndpents = userdpsz / sizeof (fxdpent_t);
357  
358  		if (get_udatamodel() == DATAMODEL_NATIVE) {
359  			if (copyout(&fxadmin, uaddr, sizeof (fxadmin_t)))
360  				return (EFAULT);
361  		}
362  #ifdef _SYSCALL32_IMPL
363  		else {
364  			/* return fxadmin struct to ILP32 callers */
365  			fxadmin32_t fxadmin32;
366  			fxadmin32.fx_dpents =
367  			    (caddr32_t)(uintptr_t)fxadmin.fx_dpents;
368  			fxadmin32.fx_ndpents = fxadmin.fx_ndpents;
369  			fxadmin32.fx_cmd = fxadmin.fx_cmd;
370  			if (copyout(&fxadmin32, uaddr, sizeof (fxadmin32_t)))
371  				return (EFAULT);
372  		}
373  #endif /* _SYSCALL32_IMPL */
374  		break;
375  
376  	case FX_SETDPTBL:
377  		/*
378  		 * We require that the requesting process has sufficient
379  		 * privileges. We also require that the table supplied by
380  		 * the user exactly match the current fx_dptbl in size.
381  		 */
382  		if (secpolicy_dispadm(reqpcredp) != 0) {
383  			return (EPERM);
384  		}
385  		if (fxadmin.fx_ndpents * sizeof (fxdpent_t) != fxdpsz) {
386  			return (EINVAL);
387  		}
388  
389  		/*
390  		 * We read the user supplied table into a temporary buffer
391  		 * where it is validated before being copied over the
392  		 * fx_dptbl.
393  		 */
394  		tmpdpp = kmem_alloc(fxdpsz, KM_SLEEP);
395  		if (copyin(fxadmin.fx_dpents, tmpdpp, fxdpsz)) {
396  			kmem_free(tmpdpp, fxdpsz);
397  			return (EFAULT);
398  		}
399  		for (i = 0; i < fxadmin.fx_ndpents; i++) {
400  
401  			/*
402  			 * Validate the user supplied values. All we are doing
403  			 * here is verifying that the values are within their
404  			 * allowable ranges and will not panic the system. We
405  			 * make no attempt to ensure that the resulting
406  			 * configuration makes sense or results in reasonable
407  			 * performance.
408  			 */
409  			if (tmpdpp[i].fx_quantum <= 0 &&
410  			    tmpdpp[i].fx_quantum != FX_TQINF) {
411  				kmem_free(tmpdpp, fxdpsz);
412  				return (EINVAL);
413  			}
414  		}
415  
416  		/*
417  		 * Copy the user supplied values over the current fx_dptbl
418  		 * values. The fx_globpri member is read-only so we don't
419  		 * overwrite it.
420  		 */
421  		mutex_enter(&fx_dptblock);
422  		for (i = 0; i < fxadmin.fx_ndpents; i++) {
423  			fx_dptbl[i].fx_quantum = tmpdpp[i].fx_quantum;
424  		}
425  		mutex_exit(&fx_dptblock);
426  		kmem_free(tmpdpp, fxdpsz);
427  		break;
428  
429  	default:
430  		return (EINVAL);
431  	}
432  	return (0);
433  }
434  
435  /*
436   * Allocate a fixed priority class specific thread structure and
437   * initialize it with the parameters supplied. Also move the thread
438   * to specified priority.
439   */
440  static int
441  fx_enterclass(kthread_t *t, id_t cid, void *parmsp, cred_t *reqpcredp,
442      void *bufp)
443  {
444  	fxkparms_t	*fxkparmsp = (fxkparms_t *)parmsp;
445  	fxproc_t	*fxpp;
446  	pri_t		reqfxupri;
447  	pri_t		reqfxuprilim;
448  
449  	fxpp = (fxproc_t *)bufp;
450  	ASSERT(fxpp != NULL);
451  
452  	/*
453  	 * Initialize the fxproc structure.
454  	 */
455  	fxpp->fx_flags = 0;
456  	fxpp->fx_callback = NULL;
457  	fxpp->fx_cookie = NULL;
458  
459  	if (fxkparmsp == NULL) {
460  		/*
461  		 * Use default values.
462  		 */
463  		fxpp->fx_pri = fxpp->fx_uprilim = 0;
464  		fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
465  		fxpp->fx_nice =  NZERO;
466  	} else {
467  		/*
468  		 * Use supplied values.
469  		 */
470  
471  		if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0) {
472  			reqfxuprilim = 0;
473  		} else {
474  			if (fxkparmsp->fx_uprilim > FX_MAX_UNPRIV_PRI &&
475  			    secpolicy_setpriority(reqpcredp) != 0)
476  				return (EPERM);
477  			reqfxuprilim = fxkparmsp->fx_uprilim;
478  			FX_ADJUST_PRI(reqfxuprilim);
479  		}
480  
481  		if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0) {
482  			reqfxupri = reqfxuprilim;
483  		} else {
484  			if (fxkparmsp->fx_upri > FX_MAX_UNPRIV_PRI &&
485  			    secpolicy_setpriority(reqpcredp) != 0)
486  				return (EPERM);
487  			/*
488  			 * Set the user priority to the requested value
489  			 * or the upri limit, whichever is lower.
490  			 */
491  			reqfxupri = fxkparmsp->fx_upri;
492  			FX_ADJUST_PRI(reqfxupri);
493  
494  			if (reqfxupri > reqfxuprilim)
495  				reqfxupri = reqfxuprilim;
496  		}
497  
498  
499  		fxpp->fx_uprilim = reqfxuprilim;
500  		fxpp->fx_pri = reqfxupri;
501  
502  		fxpp->fx_nice = NZERO - (NZERO * reqfxupri) / fx_maxupri;
503  
504  		if (((fxkparmsp->fx_cflags & FX_DOTQ) == 0) ||
505  		    (fxkparmsp->fx_tqntm == FX_TQDEF)) {
506  			fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
507  		} else {
508  			if (secpolicy_setpriority(reqpcredp) != 0)
509  				return (EPERM);
510  
511  			if (fxkparmsp->fx_tqntm == FX_TQINF)
512  				fxpp->fx_pquantum = FX_TQINF;
513  			else {
514  				fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
515  			}
516  		}
517  
518  	}
519  
520  	fxpp->fx_timeleft = fxpp->fx_pquantum;
521  	cpucaps_sc_init(&fxpp->fx_caps);
522  	fxpp->fx_tp = t;
523  
524  	thread_lock(t);			/* get dispatcher lock on thread */
525  	t->t_clfuncs = &(sclass[cid].cl_funcs->thread);
526  	t->t_cid = cid;
527  	t->t_cldata = (void *)fxpp;
528  	t->t_schedflag &= ~TS_RUNQMATCH;
529  	fx_change_priority(t, fxpp);
530  	thread_unlock(t);
531  
532  	return (0);
533  }
534  
535  /*
536   * The thread is exiting.
537   */
538  static void
539  fx_exit(kthread_t *t)
540  {
541  	fxproc_t *fxpp;
542  
543  	thread_lock(t);
544  	fxpp = (fxproc_t *)(t->t_cldata);
545  
546  	/*
547  	 * A thread could be exiting in between clock ticks, so we need to
548  	 * calculate how much CPU time it used since it was charged last time.
549  	 *
550  	 * CPU caps are not enforced on exiting processes - it is usually
551  	 * desirable to exit as soon as possible to free resources.
552  	 */
553  	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ONLY);
554  
555  	if (FX_HAS_CB(fxpp)) {
556  		FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
557  		fxpp->fx_callback = NULL;
558  		fxpp->fx_cookie = NULL;
559  		thread_unlock(t);
560  		FX_CB_LIST_DELETE(fxpp);
561  		return;
562  	}
563  
564  	thread_unlock(t);
565  }
566  
567  /*
568   * Exiting the class. Free fxproc structure of thread.
569   */
570  static void
571  fx_exitclass(void *procp)
572  {
573  	fxproc_t *fxpp = (fxproc_t *)procp;
574  
575  	thread_lock(fxpp->fx_tp);
576  	if (FX_HAS_CB(fxpp)) {
577  
578  		FX_CB_EXIT(FX_CALLB(fxpp), fxpp->fx_cookie);
579  
580  		fxpp->fx_callback = NULL;
581  		fxpp->fx_cookie = NULL;
582  		thread_unlock(fxpp->fx_tp);
583  		FX_CB_LIST_DELETE(fxpp);
584  	} else
585  		thread_unlock(fxpp->fx_tp);
586  
587  	kmem_free(fxpp, sizeof (fxproc_t));
588  }
589  
590  /* ARGSUSED */
591  static int
592  fx_canexit(kthread_t *t, cred_t *cred)
593  {
594  	/*
595  	 * A thread can always leave the FX class
596  	 */
597  	return (0);
598  }
599  
600  /*
601   * Initialize fixed-priority class specific proc structure for a child.
602   * callbacks are not inherited upon fork.
603   */
604  static int
605  fx_fork(kthread_t *t, kthread_t *ct, void *bufp)
606  {
607  	fxproc_t	*pfxpp;		/* ptr to parent's fxproc structure */
608  	fxproc_t	*cfxpp;		/* ptr to child's fxproc structure */
609  
610  	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
611  
612  	cfxpp = (fxproc_t *)bufp;
613  	ASSERT(cfxpp != NULL);
614  	thread_lock(t);
615  	pfxpp = (fxproc_t *)t->t_cldata;
616  	/*
617  	 * Initialize child's fxproc structure.
618  	 */
619  	cfxpp->fx_timeleft = cfxpp->fx_pquantum = pfxpp->fx_pquantum;
620  	cfxpp->fx_pri = pfxpp->fx_pri;
621  	cfxpp->fx_uprilim = pfxpp->fx_uprilim;
622  	cfxpp->fx_nice = pfxpp->fx_nice;
623  	cfxpp->fx_callback = NULL;
624  	cfxpp->fx_cookie = NULL;
625  	cfxpp->fx_flags = pfxpp->fx_flags & ~(FXBACKQ);
626  	cpucaps_sc_init(&cfxpp->fx_caps);
627  
628  	cfxpp->fx_tp = ct;
629  	ct->t_cldata = (void *)cfxpp;
630  	thread_unlock(t);
631  
632  	/*
633  	 * Link new structure into fxproc list.
634  	 */
635  	return (0);
636  }
637  
638  
/*
 * Child is placed at back of dispatcher queue and parent gives
 * up processor so that the child runs first after the fork.
 * This allows the child immediately execing to break the multiple
 * use of copy on write pages with no disk home. The parent will
 * get to steal them back rather than uselessly copying them.
 *
 * t is the parent and must be the current thread; ct is the child.
 * Called with pidlock held; pidlock is released here.
 */
static void
fx_forkret(kthread_t *t, kthread_t *ct)
{
	proc_t	*pp = ttoproc(t);
	proc_t	*cp = ttoproc(ct);
	fxproc_t *fxpp;

	ASSERT(t == curthread);
	ASSERT(MUTEX_HELD(&pidlock));

	/*
	 * Grab the child's p_lock before dropping pidlock to ensure
	 * the process does not disappear before we set it running.
	 */
	mutex_enter(&cp->p_lock);
	continuelwps(cp);
	mutex_exit(&cp->p_lock);

	/*
	 * Take the parent's p_lock before letting go of pidlock, then
	 * continue the parent's lwps as well.
	 */
	mutex_enter(&pp->p_lock);
	mutex_exit(&pidlock);
	continuelwps(pp);

	/*
	 * Reset the parent's priority from the dispatch table and hand it
	 * to fx_setrun() while holding its thread lock.
	 */
	thread_lock(t);
	fxpp = (fxproc_t *)(t->t_cldata);
	t->t_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
	ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
	THREAD_TRANSITION(t);
	fx_setrun(t);
	thread_unlock(t);
	/*
	 * Safe to drop p_lock now since it is safe to change
	 * the scheduling class after this point.
	 */
	mutex_exit(&pp->p_lock);

	/* Give up the processor. */
	swtch();
}
683  
684  
685  /*
686   * Get information about the fixed-priority class into the buffer
687   * pointed to by fxinfop. The maximum configured user priority
688   * is the only information we supply.
689   */
690  static int
691  fx_getclinfo(void *infop)
692  {
693  	fxinfo_t *fxinfop = (fxinfo_t *)infop;
694  	fxinfop->fx_maxupri = fx_maxupri;
695  	return (0);
696  }
697  
698  
699  
700  /*
701   * Return the user mode scheduling priority range.
702   */
703  static int
704  fx_getclpri(pcpri_t *pcprip)
705  {
706  	pcprip->pc_clpmax = fx_maxupri;
707  	pcprip->pc_clpmin = 0;
708  	return (0);
709  }
710  
711  
/*
 * Do-nothing placeholder.  fx_classfuncs uses it for the "active",
 * "inactive" and "set_process_group" slots.
 * NOTE(review): the empty parameter list is presumably deliberate so that
 * this one function can fill slots with differing signatures -- confirm
 * against struct classfuncs before turning it into a prototype.
 */
static void
fx_nullsys()
{}
715  
716  
717  /*
718   * Get the fixed-priority parameters of the thread pointed to by
719   * fxprocp into the buffer pointed to by fxparmsp.
720   */
721  static void
722  fx_parmsget(kthread_t *t, void *parmsp)
723  {
724  	fxproc_t *fxpp = (fxproc_t *)t->t_cldata;
725  	fxkparms_t *fxkparmsp = (fxkparms_t *)parmsp;
726  
727  	fxkparmsp->fx_upri = fxpp->fx_pri;
728  	fxkparmsp->fx_uprilim = fxpp->fx_uprilim;
729  	fxkparmsp->fx_tqntm = fxpp->fx_pquantum;
730  }
731  
732  
733  
/*
 * Check the validity of the fixed-priority parameters in the buffer
 * pointed to by parmsp and convert the buffer, in place, from the user
 * format (fxparms_t) to the kernel format (fxkparms_t).
 *
 * Returns 0 on success, EINVAL for an out-of-range priority, priority
 * limit or time quantum, and ERANGE when the quantum does not fit in an
 * int once converted to ticks.
 *
 * NOTE(review): the same buffer is read through fxparms_t and written
 * through fxkparms_t, so the two layouts presumably overlap; keep the
 * order of the reads and writes below unless both layouts are checked.
 */
static int
fx_parmsin(void *parmsp)
{
	fxparms_t	*fxparmsp = (fxparms_t *)parmsp;
	uint_t		cflags;
	longlong_t	ticks;
	/*
	 * Check validity of parameters.
	 */

	/* limit must be within [0, fx_maxupri] unless it is FX_NOCHANGE */
	if ((fxparmsp->fx_uprilim > fx_maxupri ||
	    fxparmsp->fx_uprilim < 0) &&
	    fxparmsp->fx_uprilim != FX_NOCHANGE)
		return (EINVAL);

	/* same rule for the priority itself */
	if ((fxparmsp->fx_upri > fx_maxupri ||
	    fxparmsp->fx_upri < 0) &&
	    fxparmsp->fx_upri != FX_NOCHANGE)
		return (EINVAL);

	/* a zero quantum, or a full second or more of nanoseconds, is bad */
	if ((fxparmsp->fx_tqsecs == 0 && fxparmsp->fx_tqnsecs == 0) ||
	    fxparmsp->fx_tqnsecs >= NANOSEC)
		return (EINVAL);

	/* every member that is not FX_NOCHANGE gets its control flag set */
	cflags = (fxparmsp->fx_upri != FX_NOCHANGE ? FX_DOUPRI : 0);

	if (fxparmsp->fx_uprilim != FX_NOCHANGE) {
		cflags |= FX_DOUPRILIM;
	}

	if (fxparmsp->fx_tqnsecs != FX_NOCHANGE)
		cflags |= FX_DOTQ;

	/*
	 * convert the buffer to kernel format.
	 */

	if (fxparmsp->fx_tqnsecs >= 0) {
		/* a real interval: express seconds + nanoseconds in ticks */
		if ((ticks = SEC_TO_TICK((longlong_t)fxparmsp->fx_tqsecs) +
		    NSEC_TO_TICK_ROUNDUP(fxparmsp->fx_tqnsecs)) > INT_MAX)
			return (ERANGE);

		((fxkparms_t *)fxparmsp)->fx_tqntm = (int)ticks;
	} else {
		/* negative values must be one of the special constants */
		if ((fxparmsp->fx_tqnsecs != FX_NOCHANGE) &&
		    (fxparmsp->fx_tqnsecs != FX_TQINF) &&
		    (fxparmsp->fx_tqnsecs != FX_TQDEF))
			return (EINVAL);
		((fxkparms_t *)fxparmsp)->fx_tqntm = fxparmsp->fx_tqnsecs;
	}

	/* written last, after every user-format member has been read */
	((fxkparms_t *)fxparmsp)->fx_cflags = cflags;

	return (0);
}
793  
794  
795  /*
796   * Check the validity of the fixed-priority parameters in the pc_vaparms_t
797   * structure vaparmsp and put them in the buffer pointed to by fxprmsp.
798   * pc_vaparms_t contains (key, value) pairs of parameter.
799   */
800  static int
801  fx_vaparmsin(void *prmsp, pc_vaparms_t *vaparmsp)
802  {
803  	uint_t		secs = 0;
804  	uint_t		cnt;
805  	int		nsecs = 0;
806  	int		priflag, secflag, nsecflag, limflag;
807  	longlong_t	ticks;
808  	fxkparms_t	*fxprmsp = (fxkparms_t *)prmsp;
809  	pc_vaparm_t	*vpp = &vaparmsp->pc_parms[0];
810  
811  
812  	/*
813  	 * First check the validity of parameters and convert them
814  	 * from the user supplied format to the internal format.
815  	 */
816  	priflag = secflag = nsecflag = limflag = 0;
817  
818  	fxprmsp->fx_cflags = 0;
819  
820  	if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
821  		return (EINVAL);
822  
823  	for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
824  
825  		switch (vpp->pc_key) {
826  		case FX_KY_UPRILIM:
827  			if (limflag++)
828  				return (EINVAL);
829  			fxprmsp->fx_cflags |= FX_DOUPRILIM;
830  			fxprmsp->fx_uprilim = (pri_t)vpp->pc_parm;
831  			if (fxprmsp->fx_uprilim > fx_maxupri ||
832  			    fxprmsp->fx_uprilim < 0)
833  				return (EINVAL);
834  			break;
835  
836  		case FX_KY_UPRI:
837  			if (priflag++)
838  				return (EINVAL);
839  			fxprmsp->fx_cflags |= FX_DOUPRI;
840  			fxprmsp->fx_upri = (pri_t)vpp->pc_parm;
841  			if (fxprmsp->fx_upri > fx_maxupri ||
842  			    fxprmsp->fx_upri < 0)
843  				return (EINVAL);
844  			break;
845  
846  		case FX_KY_TQSECS:
847  			if (secflag++)
848  				return (EINVAL);
849  			fxprmsp->fx_cflags |= FX_DOTQ;
850  			secs = (uint_t)vpp->pc_parm;
851  			break;
852  
853  		case FX_KY_TQNSECS:
854  			if (nsecflag++)
855  				return (EINVAL);
856  			fxprmsp->fx_cflags |= FX_DOTQ;
857  			nsecs = (int)vpp->pc_parm;
858  			break;
859  
860  		default:
861  			return (EINVAL);
862  		}
863  	}
864  
865  	if (vaparmsp->pc_vaparmscnt == 0) {
866  		/*
867  		 * Use default parameters.
868  		 */
869  		fxprmsp->fx_upri = 0;
870  		fxprmsp->fx_uprilim = 0;
871  		fxprmsp->fx_tqntm = FX_TQDEF;
872  		fxprmsp->fx_cflags = FX_DOUPRI | FX_DOUPRILIM | FX_DOTQ;
873  	} else if ((fxprmsp->fx_cflags & FX_DOTQ) != 0) {
874  		if ((secs == 0 && nsecs == 0) || nsecs >= NANOSEC)
875  			return (EINVAL);
876  
877  		if (nsecs >= 0) {
878  			if ((ticks = SEC_TO_TICK((longlong_t)secs) +
879  			    NSEC_TO_TICK_ROUNDUP(nsecs)) > INT_MAX)
880  				return (ERANGE);
881  
882  			fxprmsp->fx_tqntm = (int)ticks;
883  		} else {
884  			if (nsecs != FX_TQINF && nsecs != FX_TQDEF)
885  				return (EINVAL);
886  			fxprmsp->fx_tqntm = nsecs;
887  		}
888  	}
889  
890  	return (0);
891  }
892  
893  
/*
 * Convert the parameter buffer at parmsp, in place, from the kernel
 * format (fxkparms_t) back to the user format (fxparms_t).
 *
 * When vaparmsp is not NULL nothing is converted.  Otherwise a negative
 * (special) quantum is passed through in the nanoseconds member with
 * seconds set to 0, and a tick count is converted to seconds and
 * nanoseconds.  Always returns 0.
 *
 * NOTE(review): the same buffer is read through fxkparms_t and written
 * through fxparms_t, so the two layouts presumably overlap; keep the
 * order of the reads and writes below unless both layouts are checked.
 */
/* ARGSUSED */
static int
fx_parmsout(void *parmsp, pc_vaparms_t *vaparmsp)
{
	register fxkparms_t	*fxkprmsp = (fxkparms_t *)parmsp;

	/* nothing is converted here when a (key, value) list is supplied */
	if (vaparmsp != NULL)
		return (0);

	if (fxkprmsp->fx_tqntm < 0) {
		/*
		 * Quantum field set to special value (e.g. FX_TQINF)
		 */
		((fxparms_t *)fxkprmsp)->fx_tqnsecs = fxkprmsp->fx_tqntm;
		((fxparms_t *)fxkprmsp)->fx_tqsecs = 0;

	} else {
		/* Convert quantum from ticks to seconds-nanoseconds */

		timestruc_t ts;
		TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
		((fxparms_t *)fxkprmsp)->fx_tqsecs = ts.tv_sec;
		((fxparms_t *)fxkprmsp)->fx_tqnsecs = ts.tv_nsec;
	}

	return (0);
}
924  
925  
926  /*
927   * Copy all selected fixed-priority class parameters to the user.
928   * The parameters are specified by a key.
929   */
930  static int
931  fx_vaparmsout(void *prmsp, pc_vaparms_t *vaparmsp)
932  {
933  	fxkparms_t	*fxkprmsp = (fxkparms_t *)prmsp;
934  	timestruc_t	ts;
935  	uint_t		cnt;
936  	uint_t		secs;
937  	int		nsecs;
938  	int		priflag, secflag, nsecflag, limflag;
939  	pc_vaparm_t	*vpp = &vaparmsp->pc_parms[0];
940  
941  	ASSERT(MUTEX_NOT_HELD(&curproc->p_lock));
942  
943  	priflag = secflag = nsecflag = limflag = 0;
944  
945  	if (vaparmsp->pc_vaparmscnt > PC_VAPARMCNT)
946  		return (EINVAL);
947  
948  	if (fxkprmsp->fx_tqntm < 0) {
949  		/*
950  		 * Quantum field set to special value (e.g. FX_TQINF).
951  		 */
952  		secs = 0;
953  		nsecs = fxkprmsp->fx_tqntm;
954  	} else {
955  		/*
956  		 * Convert quantum from ticks to seconds-nanoseconds.
957  		 */
958  		TICK_TO_TIMESTRUC(fxkprmsp->fx_tqntm, &ts);
959  		secs = ts.tv_sec;
960  		nsecs = ts.tv_nsec;
961  	}
962  
963  
964  	for (cnt = 0; cnt < vaparmsp->pc_vaparmscnt; cnt++, vpp++) {
965  
966  		switch (vpp->pc_key) {
967  		case FX_KY_UPRILIM:
968  			if (limflag++)
969  				return (EINVAL);
970  			if (copyout(&fxkprmsp->fx_uprilim,
971  			    (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
972  				return (EFAULT);
973  			break;
974  
975  		case FX_KY_UPRI:
976  			if (priflag++)
977  				return (EINVAL);
978  			if (copyout(&fxkprmsp->fx_upri,
979  			    (void *)(uintptr_t)vpp->pc_parm, sizeof (pri_t)))
980  				return (EFAULT);
981  			break;
982  
983  		case FX_KY_TQSECS:
984  			if (secflag++)
985  				return (EINVAL);
986  			if (copyout(&secs,
987  			    (void *)(uintptr_t)vpp->pc_parm, sizeof (uint_t)))
988  				return (EFAULT);
989  			break;
990  
991  		case FX_KY_TQNSECS:
992  			if (nsecflag++)
993  				return (EINVAL);
994  			if (copyout(&nsecs,
995  			    (void *)(uintptr_t)vpp->pc_parm, sizeof (int)))
996  				return (EFAULT);
997  			break;
998  
999  		default:
1000  			return (EINVAL);
1001  		}
1002  	}
1003  
1004  	return (0);
1005  }
1006  
1007  /*
1008   * Set the scheduling parameters of the thread pointed to by fxprocp
1009   * to those specified in the buffer pointed to by fxparmsp.
1010   */
1011  /* ARGSUSED */
1012  static int
1013  fx_parmsset(kthread_t *tx, void *parmsp, id_t reqpcid, cred_t *reqpcredp)
1014  {
1015  	char		nice;
1016  	pri_t		reqfxuprilim;
1017  	pri_t		reqfxupri;
1018  	fxkparms_t	*fxkparmsp = (fxkparms_t *)parmsp;
1019  	fxproc_t	*fxpp;
1020  
1021  
1022  	ASSERT(MUTEX_HELD(&(ttoproc(tx))->p_lock));
1023  
1024  	thread_lock(tx);
1025  	fxpp = (fxproc_t *)tx->t_cldata;
1026  
1027  	if ((fxkparmsp->fx_cflags & FX_DOUPRILIM) == 0)
1028  		reqfxuprilim = fxpp->fx_uprilim;
1029  	else
1030  		reqfxuprilim = fxkparmsp->fx_uprilim;
1031  
1032  	/*
1033  	 * Basic permissions enforced by generic kernel code
1034  	 * for all classes require that a thread attempting
1035  	 * to change the scheduling parameters of a target
1036  	 * thread be privileged or have a real or effective
1037  	 * UID matching that of the target thread. We are not
1038  	 * called unless these basic permission checks have
1039  	 * already passed. The fixed priority class requires in
1040  	 * addition that the calling thread be privileged if it
1041  	 * is attempting to raise the pri above its current
1042  	 * value This may have been checked previously but if our
1043  	 * caller passed us a non-NULL credential pointer we assume
1044  	 * it hasn't and we check it here.
1045  	 */
1046  
1047  	if ((reqpcredp != NULL) &&
1048  	    (reqfxuprilim > fxpp->fx_uprilim ||
1049  	    ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)) &&
1050  	    secpolicy_raisepriority(reqpcredp) != 0) {
1051  		thread_unlock(tx);
1052  		return (EPERM);
1053  	}
1054  
1055  	FX_ADJUST_PRI(reqfxuprilim);
1056  
1057  	if ((fxkparmsp->fx_cflags & FX_DOUPRI) == 0)
1058  		reqfxupri = fxpp->fx_pri;
1059  	else
1060  		reqfxupri = fxkparmsp->fx_upri;
1061  
1062  
1063  	/*
1064  	 * Make sure the user priority doesn't exceed the upri limit.
1065  	 */
1066  	if (reqfxupri > reqfxuprilim)
1067  		reqfxupri = reqfxuprilim;
1068  
1069  	/*
1070  	 * Set fx_nice to the nice value corresponding to the user
1071  	 * priority we are setting.  Note that setting the nice field
1072  	 * of the parameter struct won't affect upri or nice.
1073  	 */
1074  
1075  	nice = NZERO - (reqfxupri * NZERO) / fx_maxupri;
1076  
1077  	if (nice > NZERO)
1078  		nice = NZERO;
1079  
1080  	fxpp->fx_uprilim = reqfxuprilim;
1081  	fxpp->fx_pri = reqfxupri;
1082  
1083  	if (fxkparmsp->fx_tqntm == FX_TQINF)
1084  		fxpp->fx_pquantum = FX_TQINF;
1085  	else if (fxkparmsp->fx_tqntm == FX_TQDEF)
1086  		fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1087  	else if ((fxkparmsp->fx_cflags & FX_DOTQ) != 0)
1088  		fxpp->fx_pquantum = fxkparmsp->fx_tqntm;
1089  
1090  	fxpp->fx_nice = nice;
1091  
1092  	fx_change_priority(tx, fxpp);
1093  	thread_unlock(tx);
1094  	return (0);
1095  }
1096  
1097  
1098  /*
1099   * Return the global scheduling priority that would be assigned
1100   * to a thread entering the fixed-priority class with the fx_upri.
1101   */
1102  static pri_t
1103  fx_globpri(kthread_t *t)
1104  {
1105  	fxproc_t *fxpp;
1106  
1107  	ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));
1108  
1109  	fxpp = (fxproc_t *)t->t_cldata;
1110  	return (fx_dptbl[fxpp->fx_pri].fx_globpri);
1111  
1112  }
1113  
1114  /*
1115   * Arrange for thread to be placed in appropriate location
1116   * on dispatcher queue.
1117   *
1118   * This is called with the current thread in TS_ONPROC and locked.
1119   */
1120  static void
1121  fx_preempt(kthread_t *t)
1122  {
1123  	fxproc_t	*fxpp = (fxproc_t *)(t->t_cldata);
1124  
1125  	ASSERT(t == curthread);
1126  	ASSERT(THREAD_LOCK_HELD(curthread));
1127  
1128  	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1129  
1130  	/*
1131  	 * Check to see if we're doing "preemption control" here.  If
1132  	 * we are, and if the user has requested that this thread not
1133  	 * be preempted, and if preemptions haven't been put off for
1134  	 * too long, let the preemption happen here but try to make
1135  	 * sure the thread is rescheduled as soon as possible.  We do
1136  	 * this by putting it on the front of the highest priority run
1137  	 * queue in the FX class.  If the preemption has been put off
1138  	 * for too long, clear the "nopreempt" bit and let the thread
1139  	 * be preempted.
1140  	 */
1141  	if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1142  		if (fxpp->fx_pquantum == FX_TQINF ||
1143  		    fxpp->fx_timeleft > -SC_MAX_TICKS) {
1144  			DTRACE_SCHED1(schedctl__nopreempt, kthread_t *, t);
1145  			schedctl_set_yield(t, 1);
1146  			setfrontdq(t);
1147  			return;
1148  		} else {
1149  			schedctl_set_nopreempt(t, 0);
1150  			DTRACE_SCHED1(schedctl__preempt, kthread_t *, t);
1151  			TNF_PROBE_2(schedctl_preempt, "schedctl FX fx_preempt",
1152  			    /* CSTYLED */, tnf_pid, pid, ttoproc(t)->p_pid,
1153  			    tnf_lwpid, lwpid, t->t_tid);
1154  			/*
1155  			 * Fall through and be preempted below.
1156  			 */
1157  		}
1158  	}
1159  
1160  	if (FX_HAS_CB(fxpp)) {
1161  		clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1162  		pri_t	newpri = fxpp->fx_pri;
1163  		FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1164  		    &new_quantum, &newpri);
1165  		FX_ADJUST_QUANTUM(new_quantum);
1166  		if ((int)new_quantum != fxpp->fx_pquantum) {
1167  			fxpp->fx_pquantum = (int)new_quantum;
1168  			fxpp->fx_timeleft = fxpp->fx_pquantum;
1169  		}
1170  		FX_ADJUST_PRI(newpri);
1171  		fxpp->fx_pri = newpri;
1172  		THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1173  	}
1174  
1175  	/*
1176  	 * This thread may be placed on wait queue by CPU Caps. In this case we
1177  	 * do not need to do anything until it is removed from the wait queue.
1178  	 */
1179  	if (CPUCAPS_ENFORCE(t)) {
1180  		return;
1181  	}
1182  
1183  	if ((fxpp->fx_flags & (FXBACKQ)) == FXBACKQ) {
1184  		fxpp->fx_timeleft = fxpp->fx_pquantum;
1185  		fxpp->fx_flags &= ~FXBACKQ;
1186  		setbackdq(t);
1187  	} else {
1188  		setfrontdq(t);
1189  	}
1190  }
1191  
1192  static void
1193  fx_setrun(kthread_t *t)
1194  {
1195  	fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1196  
1197  	ASSERT(THREAD_LOCK_HELD(t));	/* t should be in transition */
1198  	fxpp->fx_flags &= ~FXBACKQ;
1199  
1200  	if (t->t_disp_time != ddi_get_lbolt())
1201  		setbackdq(t);
1202  	else
1203  		setfrontdq(t);
1204  }
1205  
1206  
1207  /*
1208   * Prepare thread for sleep. We reset the thread priority so it will
1209   * run at the kernel priority level when it wakes up.
1210   */
1211  static void
1212  fx_sleep(kthread_t *t)
1213  {
1214  	fxproc_t	*fxpp = (fxproc_t *)(t->t_cldata);
1215  
1216  	ASSERT(t == curthread);
1217  	ASSERT(THREAD_LOCK_HELD(t));
1218  
1219  	/*
1220  	 * Account for time spent on CPU before going to sleep.
1221  	 */
1222  	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1223  
1224  	if (FX_HAS_CB(fxpp)) {
1225  		FX_CB_SLEEP(FX_CALLB(fxpp), fxpp->fx_cookie);
1226  	}
1227  	t->t_stime = ddi_get_lbolt();		/* time stamp for the swapper */
1228  }
1229  
1230  
1231  /*
1232   * Return Values:
1233   *
1234   *	-1 if the thread is loaded or is not eligible to be swapped in.
1235   *
1236   * FX and RT threads are designed so that they don't swapout; however,
1237   * it is possible that while the thread is swapped out and in another class, it
1238   * can be changed to FX or RT.  Since these threads should be swapped in
1239   * as soon as they're runnable, rt_swapin returns SHRT_MAX, and fx_swapin
1240   * returns SHRT_MAX - 1, so that it gives deference to any swapped out
1241   * RT threads.
1242   */
1243  /* ARGSUSED */
1244  static pri_t
1245  fx_swapin(kthread_t *t, int flags)
1246  {
1247  	pri_t	tpri = -1;
1248  
1249  	ASSERT(THREAD_LOCK_HELD(t));
1250  
1251  	if (t->t_state == TS_RUN && (t->t_schedflag & TS_LOAD) == 0) {
1252  		tpri = (pri_t)SHRT_MAX - 1;
1253  	}
1254  
1255  	return (tpri);
1256  }
1257  
1258  /*
1259   * Return Values
1260   *	-1 if the thread isn't loaded or is not eligible to be swapped out.
1261   */
1262  /* ARGSUSED */
1263  static pri_t
1264  fx_swapout(kthread_t *t, int flags)
1265  {
1266  	ASSERT(THREAD_LOCK_HELD(t));
1267  
1268  	return (-1);
1269  
1270  }
1271  
1272  /* ARGSUSED */
1273  static void
1274  fx_stop(kthread_t *t, int why, int what)
1275  {
1276  	fxproc_t *fxpp = (fxproc_t *)(t->t_cldata);
1277  
1278  	ASSERT(THREAD_LOCK_HELD(t));
1279  
1280  	if (FX_HAS_CB(fxpp)) {
1281  		FX_CB_STOP(FX_CALLB(fxpp), fxpp->fx_cookie);
1282  	}
1283  }
1284  
1285  /*
1286   * Check for time slice expiration.  If time slice has expired
1287   * set runrun to cause preemption.
1288   */
1289  static void
1290  fx_tick(kthread_t *t)
1291  {
1292  	boolean_t call_cpu_surrender = B_FALSE;
1293  	fxproc_t *fxpp;
1294  
1295  	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1296  
1297  	thread_lock(t);
1298  
1299  	fxpp = (fxproc_t *)(t->t_cldata);
1300  
1301  	if (FX_HAS_CB(fxpp)) {
1302  		clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1303  		pri_t	newpri = fxpp->fx_pri;
1304  		FX_CB_TICK(FX_CALLB(fxpp), fxpp->fx_cookie,
1305  		    &new_quantum, &newpri);
1306  		FX_ADJUST_QUANTUM(new_quantum);
1307  		if ((int)new_quantum != fxpp->fx_pquantum) {
1308  			fxpp->fx_pquantum = (int)new_quantum;
1309  			fxpp->fx_timeleft = fxpp->fx_pquantum;
1310  		}
1311  		FX_ADJUST_PRI(newpri);
1312  		if (newpri != fxpp->fx_pri) {
1313  			fxpp->fx_pri = newpri;
1314  			fx_change_priority(t, fxpp);
1315  		}
1316  	}
1317  
1318  	/*
1319  	 * Keep track of thread's project CPU usage.  Note that projects
1320  	 * get charged even when threads are running in the kernel.
1321  	 */
1322  	call_cpu_surrender =  CPUCAPS_CHARGE(t, &fxpp->fx_caps,
1323  	    CPUCAPS_CHARGE_ENFORCE);
1324  
1325  	if ((fxpp->fx_pquantum != FX_TQINF) &&
1326  	    (--fxpp->fx_timeleft <= 0)) {
1327  		pri_t	new_pri;
1328  
1329  		/*
1330  		 * If we're doing preemption control and trying to
1331  		 * avoid preempting this thread, just note that
1332  		 * the thread should yield soon and let it keep
1333  		 * running (unless it's been a while).
1334  		 */
1335  		if (t->t_schedctl && schedctl_get_nopreempt(t)) {
1336  			if (fxpp->fx_timeleft > -SC_MAX_TICKS) {
1337  				DTRACE_SCHED1(schedctl__nopreempt,
1338  				    kthread_t *, t);
1339  				schedctl_set_yield(t, 1);
1340  				thread_unlock_nopreempt(t);
1341  				return;
1342  			}
1343  			TNF_PROBE_2(schedctl_failsafe,
1344  			    "schedctl FX fx_tick", /* CSTYLED */,
1345  			    tnf_pid, pid, ttoproc(t)->p_pid,
1346  			    tnf_lwpid, lwpid, t->t_tid);
1347  		}
1348  		new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1349  		ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1350  		/*
1351  		 * When the priority of a thread is changed,
1352  		 * it may be necessary to adjust its position
1353  		 * on a sleep queue or dispatch queue. Even
1354  		 * when the priority is not changed, we need
1355  		 * to preserve round robin on dispatch queue.
1356  		 * The function thread_change_pri accomplishes
1357  		 * this.
1358  		 */
1359  		if (thread_change_pri(t, new_pri, 0)) {
1360  			fxpp->fx_timeleft = fxpp->fx_pquantum;
1361  		} else {
1362  			call_cpu_surrender = B_TRUE;
1363  		}
1364  	} else if (t->t_state == TS_ONPROC &&
1365  	    t->t_pri < t->t_disp_queue->disp_maxrunpri) {
1366  		call_cpu_surrender = B_TRUE;
1367  	}
1368  
1369  	if (call_cpu_surrender) {
1370  		fxpp->fx_flags |= FXBACKQ;
1371  		cpu_surrender(t);
1372  	}
1373  	thread_unlock_nopreempt(t);	/* clock thread can't be preempted */
1374  }
1375  
1376  
1377  static void
1378  fx_trapret(kthread_t *t)
1379  {
1380  	cpu_t		*cp = CPU;
1381  
1382  	ASSERT(THREAD_LOCK_HELD(t));
1383  	ASSERT(t == curthread);
1384  	ASSERT(cp->cpu_dispthread == t);
1385  	ASSERT(t->t_state == TS_ONPROC);
1386  }
1387  
1388  
1389  /*
1390   * Processes waking up go to the back of their queue.
1391   */
1392  static void
1393  fx_wakeup(kthread_t *t)
1394  {
1395  	fxproc_t	*fxpp = (fxproc_t *)(t->t_cldata);
1396  
1397  	ASSERT(THREAD_LOCK_HELD(t));
1398  
1399  	t->t_stime = ddi_get_lbolt();		/* time stamp for the swapper */
1400  	if (FX_HAS_CB(fxpp)) {
1401  		clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1402  		pri_t	newpri = fxpp->fx_pri;
1403  		FX_CB_WAKEUP(FX_CALLB(fxpp), fxpp->fx_cookie,
1404  		    &new_quantum, &newpri);
1405  		FX_ADJUST_QUANTUM(new_quantum);
1406  		if ((int)new_quantum != fxpp->fx_pquantum) {
1407  			fxpp->fx_pquantum = (int)new_quantum;
1408  			fxpp->fx_timeleft = fxpp->fx_pquantum;
1409  		}
1410  
1411  		FX_ADJUST_PRI(newpri);
1412  		if (newpri != fxpp->fx_pri) {
1413  			fxpp->fx_pri = newpri;
1414  			THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1415  		}
1416  	}
1417  
1418  	fxpp->fx_flags &= ~FXBACKQ;
1419  
1420  	if (t->t_disp_time != ddi_get_lbolt())
1421  		setbackdq(t);
1422  	else
1423  		setfrontdq(t);
1424  }
1425  
1426  
1427  /*
1428   * When a thread yields, put it on the back of the run queue.
1429   */
1430  static void
1431  fx_yield(kthread_t *t)
1432  {
1433  	fxproc_t	*fxpp = (fxproc_t *)(t->t_cldata);
1434  
1435  	ASSERT(t == curthread);
1436  	ASSERT(THREAD_LOCK_HELD(t));
1437  
1438  	/*
1439  	 * Collect CPU usage spent before yielding CPU.
1440  	 */
1441  	(void) CPUCAPS_CHARGE(t, &fxpp->fx_caps, CPUCAPS_CHARGE_ENFORCE);
1442  
1443  	if (FX_HAS_CB(fxpp))  {
1444  		clock_t new_quantum =  (clock_t)fxpp->fx_pquantum;
1445  		pri_t	newpri = fxpp->fx_pri;
1446  		FX_CB_PREEMPT(FX_CALLB(fxpp), fxpp->fx_cookie,
1447  		    &new_quantum, &newpri);
1448  		FX_ADJUST_QUANTUM(new_quantum);
1449  		if ((int)new_quantum != fxpp->fx_pquantum) {
1450  			fxpp->fx_pquantum = (int)new_quantum;
1451  			fxpp->fx_timeleft = fxpp->fx_pquantum;
1452  		}
1453  		FX_ADJUST_PRI(newpri);
1454  		fxpp->fx_pri = newpri;
1455  		THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1456  	}
1457  
1458  	/*
1459  	 * Clear the preemption control "yield" bit since the user is
1460  	 * doing a yield.
1461  	 */
1462  	if (t->t_schedctl)
1463  		schedctl_set_yield(t, 0);
1464  
1465  	if (fxpp->fx_timeleft <= 0) {
1466  		/*
1467  		 * Time slice was artificially extended to avoid
1468  		 * preemption, so pretend we're preempting it now.
1469  		 */
1470  		DTRACE_SCHED1(schedctl__yield, int, -fxpp->fx_timeleft);
1471  		fxpp->fx_timeleft = fxpp->fx_pquantum;
1472  		THREAD_CHANGE_PRI(t, fx_dptbl[fxpp->fx_pri].fx_globpri);
1473  		ASSERT(t->t_pri >= 0 && t->t_pri <= fx_maxglobpri);
1474  	}
1475  
1476  	fxpp->fx_flags &= ~FXBACKQ;
1477  	setbackdq(t);
1478  }
1479  
1480  /*
1481   * Increment the nice value of the specified thread by incr and
1482   * return the new value in *retvalp.
1483   */
1484  static int
1485  fx_donice(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1486  {
1487  	int		newnice;
1488  	fxproc_t	*fxpp = (fxproc_t *)(t->t_cldata);
1489  	fxkparms_t	fxkparms;
1490  
1491  	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1492  
1493  	/* If there's no change to priority, just return current setting */
1494  	if (incr == 0) {
1495  		if (retvalp) {
1496  			*retvalp = fxpp->fx_nice - NZERO;
1497  		}
1498  		return (0);
1499  	}
1500  
1501  	if ((incr < 0 || incr > 2 * NZERO) &&
1502  	    secpolicy_raisepriority(cr) != 0)
1503  		return (EPERM);
1504  
1505  	/*
1506  	 * Specifying a nice increment greater than the upper limit of
1507  	 * 2 * NZERO - 1 will result in the thread's nice value being
1508  	 * set to the upper limit.  We check for this before computing
1509  	 * the new value because otherwise we could get overflow
1510  	 * if a privileged user specified some ridiculous increment.
1511  	 */
1512  	if (incr > 2 * NZERO - 1)
1513  		incr = 2 * NZERO - 1;
1514  
1515  	newnice = fxpp->fx_nice + incr;
1516  	if (newnice > NZERO)
1517  		newnice = NZERO;
1518  	else if (newnice < 0)
1519  		newnice = 0;
1520  
1521  	fxkparms.fx_uprilim = fxkparms.fx_upri =
1522  	    -((newnice - NZERO) * fx_maxupri) / NZERO;
1523  
1524  	fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1525  
1526  	fxkparms.fx_tqntm = FX_TQDEF;
1527  
1528  	/*
1529  	 * Reset the uprilim and upri values of the thread. Adjust
1530  	 * time quantum accordingly.
1531  	 */
1532  
1533  	(void) fx_parmsset(t, (void *)&fxkparms, (id_t)0, (cred_t *)NULL);
1534  
1535  	/*
1536  	 * Although fx_parmsset already reset fx_nice it may
1537  	 * not have been set to precisely the value calculated above
1538  	 * because fx_parmsset determines the nice value from the
1539  	 * user priority and we may have truncated during the integer
1540  	 * conversion from nice value to user priority and back.
1541  	 * We reset fx_nice to the value we calculated above.
1542  	 */
1543  	fxpp->fx_nice = (char)newnice;
1544  
1545  	if (retvalp)
1546  		*retvalp = newnice - NZERO;
1547  
1548  	return (0);
1549  }
1550  
1551  /*
1552   * Increment the priority of the specified thread by incr and
1553   * return the new value in *retvalp.
1554   */
1555  static int
1556  fx_doprio(kthread_t *t, cred_t *cr, int incr, int *retvalp)
1557  {
1558  	int		newpri;
1559  	fxproc_t	*fxpp = (fxproc_t *)(t->t_cldata);
1560  	fxkparms_t	fxkparms;
1561  
1562  	ASSERT(MUTEX_HELD(&(ttoproc(t))->p_lock));
1563  
1564  	/* If there's no change to priority, just return current setting */
1565  	if (incr == 0) {
1566  		*retvalp = fxpp->fx_pri;
1567  		return (0);
1568  	}
1569  
1570  	newpri = fxpp->fx_pri + incr;
1571  	if (newpri > fx_maxupri || newpri < 0)
1572  		return (EINVAL);
1573  
1574  	*retvalp = newpri;
1575  	fxkparms.fx_uprilim = fxkparms.fx_upri = newpri;
1576  	fxkparms.fx_tqntm = FX_NOCHANGE;
1577  	fxkparms.fx_cflags = FX_DOUPRILIM | FX_DOUPRI;
1578  
1579  	/*
1580  	 * Reset the uprilim and upri values of the thread.
1581  	 */
1582  	return (fx_parmsset(t, (void *)&fxkparms, (id_t)0, cr));
1583  }
1584  
1585  static void
1586  fx_change_priority(kthread_t *t, fxproc_t *fxpp)
1587  {
1588  	pri_t	new_pri;
1589  
1590  	ASSERT(THREAD_LOCK_HELD(t));
1591  	new_pri = fx_dptbl[fxpp->fx_pri].fx_globpri;
1592  	ASSERT(new_pri >= 0 && new_pri <= fx_maxglobpri);
1593  	t->t_cpri = fxpp->fx_pri;
1594  	if (t == curthread || t->t_state == TS_ONPROC) {
1595  		/* curthread is always onproc */
1596  		cpu_t	*cp = t->t_disp_queue->disp_cpu;
1597  		THREAD_CHANGE_PRI(t, new_pri);
1598  		if (t == cp->cpu_dispthread)
1599  			cp->cpu_dispatch_pri = DISP_PRIO(t);
1600  		if (DISP_MUST_SURRENDER(t)) {
1601  			fxpp->fx_flags |= FXBACKQ;
1602  			cpu_surrender(t);
1603  		} else {
1604  			fxpp->fx_timeleft = fxpp->fx_pquantum;
1605  		}
1606  	} else {
1607  		/*
1608  		 * When the priority of a thread is changed,
1609  		 * it may be necessary to adjust its position
1610  		 * on a sleep queue or dispatch queue.
1611  		 * The function thread_change_pri accomplishes
1612  		 * this.
1613  		 */
1614  		if (thread_change_pri(t, new_pri, 0)) {
1615  			/*
1616  			 * The thread was on a run queue. Reset
1617  			 * its CPU timeleft from the quantum
1618  			 * associated with the new priority.
1619  			 */
1620  			fxpp->fx_timeleft = fxpp->fx_pquantum;
1621  		} else {
1622  			fxpp->fx_flags |= FXBACKQ;
1623  		}
1624  	}
1625  }
1626  
1627  static int
1628  fx_alloc(void **p, int flag)
1629  {
1630  	void *bufp;
1631  
1632  	bufp = kmem_alloc(sizeof (fxproc_t), flag);
1633  	if (bufp == NULL) {
1634  		return (ENOMEM);
1635  	} else {
1636  		*p = bufp;
1637  		return (0);
1638  	}
1639  }
1640  
1641  static void
1642  fx_free(void *bufp)
1643  {
1644  	if (bufp)
1645  		kmem_free(bufp, sizeof (fxproc_t));
1646  }
1647  
1648  /*
1649   * Release the callback list mutex after successful lookup
1650   */
1651  void
1652  fx_list_release(fxproc_t *fxpp)
1653  {
1654  	int index = FX_CB_LIST_HASH(fxpp->fx_ktid);
1655  	kmutex_t *lockp = &fx_cb_list_lock[index];
1656  	mutex_exit(lockp);
1657  }
1658  
1659  fxproc_t *
1660  fx_list_lookup(kt_did_t ktid)
1661  {
1662  	int index = FX_CB_LIST_HASH(ktid);
1663  	kmutex_t *lockp = &fx_cb_list_lock[index];
1664  	fxproc_t *fxpp;
1665  
1666  	mutex_enter(lockp);
1667  
1668  	for (fxpp = fx_cb_plisthead[index].fx_cb_next;
1669  	    fxpp != &fx_cb_plisthead[index]; fxpp = fxpp->fx_cb_next) {
1670  		if (fxpp->fx_tp->t_cid == fx_cid && fxpp->fx_ktid == ktid &&
1671  		    fxpp->fx_callback != NULL) {
1672  			/*
1673  			 * The caller is responsible for calling
1674  			 * fx_list_release to drop the lock upon
1675  			 * successful lookup
1676  			 */
1677  			return (fxpp);
1678  		}
1679  	}
1680  	mutex_exit(lockp);
1681  	return ((fxproc_t *)NULL);
1682  }
1683  
1684  
1685  /*
1686   * register a callback set of routines for current thread
1687   * thread should already be in FX class
1688   */
1689  int
1690  fx_register_callbacks(fx_callbacks_t *fx_callback, fx_cookie_t cookie,
1691  	pri_t pri, clock_t quantum)
1692  {
1693  
1694  	fxproc_t	*fxpp;
1695  
1696  	if (fx_callback == NULL)
1697  		return (EINVAL);
1698  
1699  	if (secpolicy_dispadm(CRED()) != 0)
1700  		return (EPERM);
1701  
1702  	if (FX_CB_VERSION(fx_callback) != FX_CALLB_REV)
1703  		return (EINVAL);
1704  
1705  	if (!FX_ISVALID(pri, quantum))
1706  		return (EINVAL);
1707  
1708  	thread_lock(curthread);		/* get dispatcher lock on thread */
1709  
1710  	if (curthread->t_cid != fx_cid) {
1711  		thread_unlock(curthread);
1712  		return (EINVAL);
1713  	}
1714  
1715  	fxpp = (fxproc_t *)(curthread->t_cldata);
1716  	ASSERT(fxpp != NULL);
1717  	if (FX_HAS_CB(fxpp)) {
1718  		thread_unlock(curthread);
1719  		return (EINVAL);
1720  	}
1721  
1722  	fxpp->fx_callback = fx_callback;
1723  	fxpp->fx_cookie = cookie;
1724  
1725  	if (pri != FX_CB_NOCHANGE) {
1726  		fxpp->fx_pri = pri;
1727  		FX_ADJUST_PRI(fxpp->fx_pri);
1728  		if (quantum == FX_TQDEF) {
1729  			fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1730  		} else if (quantum == FX_TQINF) {
1731  			fxpp->fx_pquantum = FX_TQINF;
1732  		} else if (quantum != FX_NOCHANGE) {
1733  			FX_ADJUST_QUANTUM(quantum);
1734  			fxpp->fx_pquantum = quantum;
1735  		}
1736  	} else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1737  		if (quantum == FX_TQINF)
1738  			fxpp->fx_pquantum = FX_TQINF;
1739  		else {
1740  			FX_ADJUST_QUANTUM(quantum);
1741  			fxpp->fx_pquantum = quantum;
1742  		}
1743  	}
1744  
1745  	fxpp->fx_ktid = ddi_get_kt_did();
1746  
1747  	fx_change_priority(curthread, fxpp);
1748  
1749  	thread_unlock(curthread);
1750  
1751  	/*
1752  	 * Link new structure into fxproc list.
1753  	 */
1754  	FX_CB_LIST_INSERT(fxpp);
1755  	return (0);
1756  }
1757  
1758  /* unregister a callback set of routines for current thread */
1759  int
1760  fx_unregister_callbacks()
1761  {
1762  	fxproc_t	*fxpp;
1763  
1764  	if ((fxpp = fx_list_lookup(ddi_get_kt_did())) == NULL) {
1765  		/*
1766  		 * did not have a registered callback;
1767  		 */
1768  		return (EINVAL);
1769  	}
1770  
1771  	thread_lock(fxpp->fx_tp);
1772  	fxpp->fx_callback = NULL;
1773  	fxpp->fx_cookie = NULL;
1774  	thread_unlock(fxpp->fx_tp);
1775  	fx_list_release(fxpp);
1776  
1777  	FX_CB_LIST_DELETE(fxpp);
1778  	return (0);
1779  }
1780  
1781  /*
1782   * modify priority and/or quantum value of a thread with callback
1783   */
1784  int
1785  fx_modify_priority(kt_did_t ktid, clock_t quantum, pri_t pri)
1786  {
1787  	fxproc_t	*fxpp;
1788  
1789  	if (!FX_ISVALID(pri, quantum))
1790  		return (EINVAL);
1791  
1792  	if ((fxpp = fx_list_lookup(ktid)) == NULL) {
1793  		/*
1794  		 * either thread had exited or did not have a registered
1795  		 * callback;
1796  		 */
1797  		return (ESRCH);
1798  	}
1799  
1800  	thread_lock(fxpp->fx_tp);
1801  
1802  	if (pri != FX_CB_NOCHANGE) {
1803  		fxpp->fx_pri = pri;
1804  		FX_ADJUST_PRI(fxpp->fx_pri);
1805  		if (quantum == FX_TQDEF) {
1806  			fxpp->fx_pquantum = fx_dptbl[fxpp->fx_pri].fx_quantum;
1807  		} else if (quantum == FX_TQINF) {
1808  			fxpp->fx_pquantum = FX_TQINF;
1809  		} else if (quantum != FX_NOCHANGE) {
1810  			FX_ADJUST_QUANTUM(quantum);
1811  			fxpp->fx_pquantum = quantum;
1812  		}
1813  	} else if (quantum != FX_NOCHANGE && quantum != FX_TQDEF) {
1814  		if (quantum == FX_TQINF) {
1815  			fxpp->fx_pquantum = FX_TQINF;
1816  		} else {
1817  			FX_ADJUST_QUANTUM(quantum);
1818  			fxpp->fx_pquantum = quantum;
1819  		}
1820  	}
1821  
1822  	fx_change_priority(fxpp->fx_tp, fxpp);
1823  
1824  	thread_unlock(fxpp->fx_tp);
1825  	fx_list_release(fxpp);
1826  	return (0);
1827  }
1828  
1829  
1830  /*
1831   * return an iblock cookie for mutex initialization to be used in callbacks
1832   */
1833  void *
1834  fx_get_mutex_cookie()
1835  {
1836  	return ((void *)(uintptr_t)__ipltospl(DISP_LEVEL));
1837  }
1838  
1839  /*
1840   * return maximum relative priority
1841   */
1842  pri_t
1843  fx_get_maxpri()
1844  {
1845  	return (fx_maxumdpri);
1846  }
1847