xref: /titanic_50/usr/src/uts/common/disp/cpupart.c (revision 3db86aab554edbb4244c8d1a1c90f152eee768af)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/systm.h>
31 #include <sys/cmn_err.h>
32 #include <sys/cpuvar.h>
33 #include <sys/thread.h>
34 #include <sys/disp.h>
35 #include <sys/kmem.h>
36 #include <sys/debug.h>
37 #include <sys/cpupart.h>
38 #include <sys/pset.h>
39 #include <sys/var.h>
40 #include <sys/cyclic.h>
41 #include <sys/lgrp.h>
42 #include <sys/chip.h>
43 #include <sys/loadavg.h>
44 #include <sys/class.h>
45 #include <sys/fss.h>
46 #include <sys/pool.h>
47 #include <sys/pool_pset.h>
48 #include <sys/policy.h>
49 
50 /*
51  * Calling pool_lock() protects the pools configuration, which includes
52  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
53  * partitions from being created or destroyed while the lock is held.
54  * The lock ordering with respect to related locks is:
55  *
56  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
57  *
58  * Blocking memory allocations may be made while holding "pool_lock"
59  * or cpu_lock.
60  */
61 
/*
 * The cp_default partition is allocated statically, but its lgroup load average
 * (lpl) list is allocated dynamically after the kmem subsystem is initialized.
 * This saves some memory since the space allocated reflects the actual number
 * of lgroups supported by the platform. The lgrp facility provides a temporary
 * space to hold lpl information during system bootstrap.
 */
69 
/* Head of the circular, doubly-linked (cp_next/cp_prev) list of partitions. */
cpupart_t		*cp_list_head;
/* The default partition; always present and never destroyed. */
cpupart_t		cp_default;
/* Starting point used when searching for a free partition ID. */
static cpupartid_t	cp_id_next;
/* Number of partitions on the list, including cp_default. */
uint_t			cp_numparts;
/* Number of partitions that currently have a non-NULL CPU list. */
uint_t			cp_numparts_nonempty;
75 
76 /*
77  * Need to limit total number of partitions to avoid slowing down the
78  * clock code too much.  The clock code traverses the list of
79  * partitions and needs to be able to execute in a reasonable amount
80  * of time (less than 1/hz seconds).  The maximum is sized based on
81  * max_ncpus so it shouldn't be a problem unless there are large
82  * numbers of empty partitions.
83  */
/*
 * Upper bound on cp_numparts.  If still zero when the default partition is
 * initialized, it is set to max_ncpus * 2 + 1; a non-zero value (e.g. from
 * /etc/system tuning) is left alone.
 */
static uint_t		cp_max_numparts;
85 
86 /*
87  * Processor sets and CPU partitions are different but related concepts.
88  * A processor set is a user-level abstraction allowing users to create
89  * sets of CPUs and bind threads exclusively to those sets.  A CPU
90  * partition is a kernel dispatcher object consisting of a set of CPUs
91  * and a global dispatch queue.  The processor set abstraction is
92  * implemented via a CPU partition, and currently there is a 1-1
93  * mapping between processor sets and partitions (excluding the default
94  * partition, which is not visible as a processor set).  Hence, the
95  * numbering for processor sets and CPU partitions is identical.  This
96  * may not always be true in the future, and these macros could become
97  * less trivial if we support e.g. a processor set containing multiple
98  * CPU partitions.
99  */
/* Processor set ID -> partition ID: PS_NONE maps to CP_DEFAULT. */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* Partition ID -> processor set ID: CP_DEFAULT maps to PS_NONE. */
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
102 
103 
104 /*
105  * Find a CPU partition given a processor set ID.
106  */
107 static cpupart_t *
108 cpupart_find_all(psetid_t psid)
109 {
110 	cpupart_t *cp;
111 	cpupartid_t cpid = PSTOCP(psid);
112 
113 	ASSERT(MUTEX_HELD(&cpu_lock));
114 
115 	/* default partition not visible as a processor set */
116 	if (psid == CP_DEFAULT)
117 		return (NULL);
118 
119 	if (psid == PS_MYID)
120 		return (curthread->t_cpupart);
121 
122 	cp = cp_list_head;
123 	do {
124 		if (cp->cp_id == cpid)
125 			return (cp);
126 		cp = cp->cp_next;
127 	} while (cp != cp_list_head);
128 	return (NULL);
129 }
130 
131 /*
132  * Find a CPU partition given a processor set ID if the processor set
133  * should be visible from the calling zone.
134  */
135 cpupart_t *
136 cpupart_find(psetid_t psid)
137 {
138 	cpupart_t *cp;
139 
140 	ASSERT(MUTEX_HELD(&cpu_lock));
141 	cp = cpupart_find_all(psid);
142 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
143 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
144 			return (NULL);
145 	return (cp);
146 }
147 
148 static int
149 cpupart_kstat_update(kstat_t *ksp, int rw)
150 {
151 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
152 	cpupart_kstat_t *cpksp = ksp->ks_data;
153 
154 	if (rw == KSTAT_WRITE)
155 		return (EACCES);
156 
157 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
158 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
159 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
160 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
161 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
162 	    (16 - FSHIFT);
163 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
164 	    (16 - FSHIFT);
165 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
166 	    (16 - FSHIFT);
167 	return (0);
168 }
169 
170 static void
171 cpupart_kstat_create(cpupart_t *cp)
172 {
173 	kstat_t *ksp;
174 	zoneid_t zoneid;
175 
176 	ASSERT(MUTEX_HELD(&cpu_lock));
177 
178 	/*
179 	 * We have a bit of a chicken-egg problem since this code will
180 	 * get called to create the kstats for CP_DEFAULT before the
181 	 * pools framework gets initialized.  We circumvent the problem
182 	 * by special-casing cp_default.
183 	 */
184 	if (cp != &cp_default && pool_pset_enabled())
185 		zoneid = GLOBAL_ZONEID;
186 	else
187 		zoneid = ALL_ZONES;
188 	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
189 	    KSTAT_TYPE_NAMED,
190 	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
191 	if (ksp != NULL) {
192 		cpupart_kstat_t *cpksp = ksp->ks_data;
193 
194 		kstat_named_init(&cpksp->cpk_updates, "updates",
195 		    KSTAT_DATA_UINT64);
196 		kstat_named_init(&cpksp->cpk_runnable, "runnable",
197 		    KSTAT_DATA_UINT64);
198 		kstat_named_init(&cpksp->cpk_waiting, "waiting",
199 		    KSTAT_DATA_UINT64);
200 		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
201 		    KSTAT_DATA_UINT32);
202 		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
203 		    KSTAT_DATA_UINT32);
204 		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
205 		    KSTAT_DATA_UINT32);
206 		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
207 		    KSTAT_DATA_UINT32);
208 
209 		ksp->ks_update = cpupart_kstat_update;
210 		ksp->ks_private = cp;
211 
212 		kstat_install(ksp);
213 	}
214 	cp->cp_kstat = ksp;
215 }
216 
217 /*
218  * Initialize the default partition and kpreempt disp queue.
219  */
220 void
221 cpupart_initialize_default(void)
222 {
223 	lgrp_id_t i;
224 
225 	cp_list_head = &cp_default;
226 	cp_default.cp_next = &cp_default;
227 	cp_default.cp_prev = &cp_default;
228 	cp_default.cp_id = CP_DEFAULT;
229 	cp_default.cp_kp_queue.disp_maxrunpri = -1;
230 	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
231 	cp_default.cp_kp_queue.disp_cpu = NULL;
232 	cp_default.cp_gen = 0;
233 	cp_default.cp_loadavg.lg_cur = 0;
234 	cp_default.cp_loadavg.lg_len = 0;
235 	cp_default.cp_loadavg.lg_total = 0;
236 	for (i = 0; i < S_LOADAVG_SZ; i++) {
237 		cp_default.cp_loadavg.lg_loads[i] = 0;
238 	}
239 	CPUSET_ZERO(cp_default.cp_haltset);
240 	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
241 	cp_id_next = CP_DEFAULT + 1;
242 	cpupart_kstat_create(&cp_default);
243 	cp_numparts = 1;
244 	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
245 		cp_max_numparts = max_ncpus * 2 + 1;
246 	/*
247 	 * Allocate space for cp_default list of lgrploads
248 	 */
249 	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
250 	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
251 	    cp_default.cp_nlgrploads, KM_SLEEP);
252 
253 	/*
254 	 * The initial lpl topology is created in a special lpl list
255 	 * lpl_bootstrap. It should be copied to cp_default.
256 	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
257 	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
258 	 */
259 	lpl_topo_bootstrap(cp_default.cp_lgrploads,
260 	    cp_default.cp_nlgrploads);
261 
262 	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
263 		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
264 	}
265 	cp_default.cp_attr = PSET_NOESCAPE;
266 	cp_numparts_nonempty = 1;
267 	/*
268 	 * Set t0's home
269 	 */
270 	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
271 }
272 
273 
274 static int
275 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
276 {
277 	cpupart_t *oldpp;
278 	cpu_t	*ncp, *newlist;
279 	kthread_t *t;
280 	int	move_threads = 1;
281 	lgrp_id_t lgrpid;
282 	proc_t 	*p;
283 	int lgrp_diff_lpl;
284 	lpl_t	*cpu_lpl;
285 	int	ret;
286 
287 	ASSERT(MUTEX_HELD(&cpu_lock));
288 	ASSERT(newpp != NULL);
289 
290 	oldpp = cp->cpu_part;
291 	ASSERT(oldpp != NULL);
292 	ASSERT(oldpp->cp_ncpus > 0);
293 
294 	if (newpp == oldpp) {
295 		/*
296 		 * Don't need to do anything.
297 		 */
298 		return (0);
299 	}
300 
301 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
302 
303 	if (!disp_bound_partition(cp, 0)) {
304 		/*
305 		 * Don't need to move threads if there are no threads in
306 		 * the partition.  Note that threads can't enter the
307 		 * partition while we're holding cpu_lock.
308 		 */
309 		move_threads = 0;
310 	} else if (oldpp->cp_ncpus == 1) {
311 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
312 		return (EBUSY);
313 	}
314 
315 	if (forced && (ret = cpu_unbind(cp->cpu_id)) != 0) {
316 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
317 		return (ret);
318 	}
319 
320 	/*
321 	 * Stop further threads weak binding to this cpu.
322 	 */
323 	cpu_inmotion = cp;
324 	membar_enter();
325 
326 again:
327 	if (move_threads) {
328 		int loop_count;
329 		/*
330 		 * Check for threads strong or weak bound to this CPU.
331 		 */
332 		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
333 			if (loop_count >= 5) {
334 				cpu_state_change_notify(cp->cpu_id,
335 				    CPU_CPUPART_IN);
336 				cpu_inmotion = NULL;
337 				return (EBUSY);	/* some threads still bound */
338 			}
339 			delay(1);
340 		}
341 	}
342 
343 	/*
344 	 * Before we actually start changing data structures, notify
345 	 * the cyclic subsystem that we want to move this CPU out of its
346 	 * partition.
347 	 */
348 	if (!cyclic_move_out(cp)) {
349 		/*
350 		 * This CPU must be the last CPU in a processor set with
351 		 * a bound cyclic.
352 		 */
353 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
354 		cpu_inmotion = NULL;
355 		return (EBUSY);
356 	}
357 
358 	pause_cpus(cp);
359 
360 	if (move_threads) {
361 		/*
362 		 * The thread on cpu before the pause thread may have read
363 		 * cpu_inmotion before we raised the barrier above.  Check
364 		 * again.
365 		 */
366 		if (disp_bound_threads(cp, 1)) {
367 			start_cpus();
368 			goto again;
369 		}
370 
371 	}
372 
373 	/*
374 	 * Update the set of chip's being spanned
375 	 */
376 	chip_cpu_move_part(cp, oldpp, newpp);
377 
378 	/* save this cpu's lgroup -- it'll be the same in the new partition */
379 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
380 
381 	cpu_lpl = cp->cpu_lpl;
382 	/*
383 	 * let the lgroup framework know cp has left the partition
384 	 */
385 	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
386 
387 	/* move out of old partition */
388 	oldpp->cp_ncpus--;
389 	if (oldpp->cp_ncpus > 0) {
390 
391 		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
392 		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
393 		if (oldpp->cp_cpulist == cp) {
394 			oldpp->cp_cpulist = ncp;
395 		}
396 	} else {
397 		ncp = oldpp->cp_cpulist = NULL;
398 		cp_numparts_nonempty--;
399 		ASSERT(cp_numparts_nonempty != 0);
400 	}
401 	oldpp->cp_gen++;
402 
403 	/* move into new partition */
404 	newlist = newpp->cp_cpulist;
405 	if (newlist == NULL) {
406 		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
407 		cp_numparts_nonempty++;
408 		ASSERT(cp_numparts_nonempty != 0);
409 	} else {
410 		cp->cpu_next_part = newlist;
411 		cp->cpu_prev_part = newlist->cpu_prev_part;
412 		newlist->cpu_prev_part->cpu_next_part = cp;
413 		newlist->cpu_prev_part = cp;
414 	}
415 	cp->cpu_part = newpp;
416 	newpp->cp_ncpus++;
417 	newpp->cp_gen++;
418 
419 	ASSERT(CPUSET_ISNULL(newpp->cp_haltset));
420 	ASSERT(CPUSET_ISNULL(oldpp->cp_haltset));
421 
422 	/*
423 	 * let the lgroup framework know cp has entered the partition
424 	 */
425 	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
426 
427 	/*
428 	 * If necessary, move threads off processor.
429 	 */
430 	if (move_threads) {
431 		ASSERT(ncp != NULL);
432 
433 		/*
434 		 * Walk thru the active process list to look for
435 		 * threads that need to have a new home lgroup,
436 		 * or the last CPU they run on is the same CPU
437 		 * being moved out of the partition.
438 		 */
439 
440 		for (p = practive; p != NULL; p = p->p_next) {
441 
442 			t = p->p_tlist;
443 
444 			if (t == NULL)
445 				continue;
446 
447 			lgrp_diff_lpl = 0;
448 
449 			do {
450 
451 				ASSERT(t->t_lpl != NULL);
452 
453 				/*
454 				 * Update the count of how many threads are
455 				 * in this CPU's lgroup but have a different lpl
456 				 */
457 
458 				if (t->t_lpl != cpu_lpl &&
459 				    t->t_lpl->lpl_lgrpid == lgrpid)
460 					lgrp_diff_lpl++;
461 				/*
462 				 * If the lgroup that t is assigned to no
463 				 * longer has any CPUs in t's partition,
464 				 * we'll have to choose a new lgroup for t.
465 				 */
466 
467 				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
468 				    t->t_cpupart)) {
469 					lgrp_move_thread(t,
470 					    lgrp_choose(t, t->t_cpupart), 0);
471 				}
472 
473 				/*
474 				 * make sure lpl points to our own partition
475 				 */
476 				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
477 				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
478 					t->t_cpupart->cp_nlgrploads));
479 
480 				ASSERT(t->t_lpl->lpl_ncpu > 0);
481 
482 				/* Update CPU last ran on if it was this CPU */
483 				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
484 				    t->t_bound_cpu != cp) {
485 					t->t_cpu = disp_lowpri_cpu(ncp,
486 					    t->t_lpl, t->t_pri, NULL);
487 				}
488 				t = t->t_forw;
489 			} while (t != p->p_tlist);
490 
491 			/*
492 			 * Didn't find any threads in the same lgroup as this
493 			 * CPU with a different lpl, so remove the lgroup from
494 			 * the process lgroup bitmask.
495 			 */
496 
497 			if (lgrp_diff_lpl)
498 				klgrpset_del(p->p_lgrpset, lgrpid);
499 		}
500 
501 		/*
502 		 * Walk thread list looking for threads that need to be
503 		 * rehomed, since there are some threads that are not in
504 		 * their process's p_tlist.
505 		 */
506 
507 		t = curthread;
508 
509 		do {
510 			ASSERT(t != NULL && t->t_lpl != NULL);
511 
512 			/*
513 			 * If the lgroup that t is assigned to no
514 			 * longer has any CPUs in t's partition,
515 			 * we'll have to choose a new lgroup for t.
516 			 * Also, choose best lgroup for home when
517 			 * thread has specified lgroup affinities,
518 			 * since there may be an lgroup with more
519 			 * affinity available after moving CPUs
520 			 * around.
521 			 */
522 			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
523 			    t->t_cpupart) || t->t_lgrp_affinity) {
524 				lgrp_move_thread(t,
525 				    lgrp_choose(t, t->t_cpupart), 1);
526 			}
527 
528 			/* make sure lpl points to our own partition */
529 			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
530 			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
531 				t->t_cpupart->cp_nlgrploads));
532 
533 			ASSERT(t->t_lpl->lpl_ncpu > 0);
534 
535 			/* Update CPU last ran on if it was this CPU */
536 			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
537 			    t->t_bound_cpu != cp) {
538 				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
539 				    t->t_pri, NULL);
540 			}
541 
542 			t = t->t_next;
543 		} while (t != curthread);
544 
545 		/*
546 		 * Clear off the CPU's run queue, and the kp queue if the
547 		 * partition is now empty.
548 		 */
549 		disp_cpu_inactive(cp);
550 
551 		/*
552 		 * Make cp switch to a thread from the new partition.
553 		 */
554 		cp->cpu_runrun = 1;
555 		cp->cpu_kprunrun = 1;
556 	}
557 
558 	cpu_inmotion = NULL;
559 	start_cpus();
560 
561 	/*
562 	 * Let anyone interested know that cpu has been added to the set.
563 	 */
564 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
565 
566 	/*
567 	 * Now let the cyclic subsystem know that it can reshuffle cyclics
568 	 * bound to the new processor set.
569 	 */
570 	cyclic_move_in(cp);
571 
572 	return (0);
573 }
574 
575 /*
576  * Check if thread can be moved to a new cpu partition.  Called by
577  * cpupart_move_thread() and pset_bind_start().
578  */
579 int
580 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
581 {
582 	ASSERT(MUTEX_HELD(&cpu_lock));
583 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
584 	ASSERT(cp != NULL);
585 	ASSERT(THREAD_LOCK_HELD(tp));
586 
587 	/*
588 	 * CPU-bound threads can't be moved.
589 	 */
590 	if (!ignore) {
591 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
592 		    tp->t_weakbound_cpu;
593 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
594 			return (EBUSY);
595 	}
596 	return (0);
597 }
598 
599 /*
600  * Move thread to new partition.  If ignore is non-zero, then CPU
601  * bindings should be ignored (this is used when destroying a
602  * partition).
603  */
604 static int
605 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
606     void *projbuf, void *zonebuf)
607 {
608 	cpupart_t *oldpp = tp->t_cpupart;
609 	int ret;
610 
611 	ASSERT(MUTEX_HELD(&cpu_lock));
612 	ASSERT(MUTEX_HELD(&pidlock));
613 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
614 	ASSERT(newpp != NULL);
615 
616 	if (newpp->cp_cpulist == NULL)
617 		return (EINVAL);
618 
619 	/*
620 	 * Check for errors first.
621 	 */
622 	thread_lock(tp);
623 	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
624 		thread_unlock(tp);
625 		return (ret);
626 	}
627 
628 	/* move the thread */
629 	if (oldpp != newpp) {
630 		/*
631 		 * Make the thread switch to the new partition.
632 		 */
633 		tp->t_cpupart = newpp;
634 		ASSERT(tp->t_lpl != NULL);
635 		/*
636 		 * Leave the thread on the same lgroup if possible; otherwise
637 		 * choose a new lgroup for it.  In either case, update its
638 		 * t_lpl.
639 		 */
640 		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
641 		    tp->t_lgrp_affinity == NULL) {
642 			/*
643 			 * The thread's lgroup has CPUs in the thread's new
644 			 * partition, so the thread can stay assigned to the
645 			 * same lgroup.  Update its t_lpl to point to the
646 			 * lpl_t for its lgroup in its new partition.
647 			 */
648 			lgrp_move_thread(tp, &tp->t_cpupart->\
649 			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
650 		} else {
651 			/*
652 			 * The thread's lgroup has no cpus in its new
653 			 * partition or it has specified lgroup affinities,
654 			 * so choose the best lgroup for the thread and
655 			 * assign it to that lgroup.
656 			 */
657 			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
658 			    1);
659 		}
660 		/*
661 		 * make sure lpl points to our own partition
662 		 */
663 		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
664 		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
665 			tp->t_cpupart->cp_nlgrploads));
666 
667 		ASSERT(tp->t_lpl->lpl_ncpu > 0);
668 
669 		if (tp->t_state == TS_ONPROC) {
670 			cpu_surrender(tp);
671 		} else if (tp->t_state == TS_RUN) {
672 			(void) dispdeq(tp);
673 			setbackdq(tp);
674 		}
675 	}
676 
677 	/*
678 	 * Our binding has changed; set TP_CHANGEBIND.
679 	 */
680 	tp->t_proc_flag |= TP_CHANGEBIND;
681 	aston(tp);
682 
683 	thread_unlock(tp);
684 	fss_changepset(tp, newpp, projbuf, zonebuf);
685 
686 	return (0);		/* success */
687 }
688 
689 
690 /*
691  * This function binds a thread to a partition.  Must be called with the
692  * p_lock of the containing process held (to keep the thread from going
693  * away), and thus also with cpu_lock held (since cpu_lock must be
694  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
695  * should be ignored (this is used when destroying a partition).
696  */
697 int
698 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
699     void *zonebuf)
700 {
701 	cpupart_t	*newpp;
702 
703 	ASSERT(pool_lock_held());
704 	ASSERT(MUTEX_HELD(&cpu_lock));
705 	ASSERT(MUTEX_HELD(&pidlock));
706 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
707 
708 	if (psid == PS_NONE)
709 		newpp = &cp_default;
710 	else {
711 		newpp = cpupart_find(psid);
712 		if (newpp == NULL) {
713 			return (EINVAL);
714 		}
715 	}
716 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
717 }
718 
719 
720 /*
721  * Create a new partition.  On MP systems, this also allocates a
722  * kpreempt disp queue for that partition.
723  */
724 int
725 cpupart_create(psetid_t *psid)
726 {
727 	cpupart_t	*pp;
728 	lgrp_id_t	i;
729 
730 	ASSERT(pool_lock_held());
731 
732 	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
733 	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
734 	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
735 	    KM_SLEEP);
736 
737 	mutex_enter(&cpu_lock);
738 	if (cp_numparts == cp_max_numparts) {
739 		mutex_exit(&cpu_lock);
740 		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
741 		pp->cp_lgrploads = NULL;
742 		kmem_free(pp, sizeof (cpupart_t));
743 		return (ENOMEM);
744 	}
745 	cp_numparts++;
746 	/* find the next free partition ID */
747 	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
748 		cp_id_next++;
749 	pp->cp_id = cp_id_next++;
750 	pp->cp_ncpus = 0;
751 	pp->cp_cpulist = NULL;
752 	pp->cp_attr = 0;
753 	klgrpset_clear(pp->cp_lgrpset);
754 	pp->cp_kp_queue.disp_maxrunpri = -1;
755 	pp->cp_kp_queue.disp_max_unbound_pri = -1;
756 	pp->cp_kp_queue.disp_cpu = NULL;
757 	pp->cp_gen = 0;
758 	CPUSET_ZERO(pp->cp_haltset);
759 	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
760 	*psid = CPTOPS(pp->cp_id);
761 	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
762 	cpupart_kstat_create(pp);
763 	for (i = 0; i < pp->cp_nlgrploads; i++) {
764 		pp->cp_lgrploads[i].lpl_lgrpid = i;
765 	}
766 	CHIP_SET_ZERO(pp->cp_chipset);
767 
768 	/*
769 	 * Pause all CPUs while changing the partition list, to make sure
770 	 * the clock thread (which traverses the list without holding
771 	 * cpu_lock) isn't running.
772 	 */
773 	pause_cpus(NULL);
774 	pp->cp_next = cp_list_head;
775 	pp->cp_prev = cp_list_head->cp_prev;
776 	cp_list_head->cp_prev->cp_next = pp;
777 	cp_list_head->cp_prev = pp;
778 	start_cpus();
779 	mutex_exit(&cpu_lock);
780 
781 	return (0);
782 }
783 
784 
785 /*
786  * Destroy a partition.
787  */
788 int
789 cpupart_destroy(psetid_t psid)
790 {
791 	cpu_t	*cp, *first_cp;
792 	cpupart_t *pp, *newpp;
793 	int	err = 0;
794 	void 	*projbuf, *zonebuf;
795 	kthread_t *t;
796 	proc_t	*p;
797 
798 	ASSERT(pool_lock_held());
799 	mutex_enter(&cpu_lock);
800 
801 	pp = cpupart_find(psid);
802 	if (pp == NULL || pp == &cp_default) {
803 		mutex_exit(&cpu_lock);
804 		return (EINVAL);
805 	}
806 
807 	/*
808 	 * Pre-allocate enough buffers for FSS for all active projects and
809 	 * for all active zones on the system.  Unused buffers will be
810 	 * freed later by fss_freebuf().
811 	 */
812 	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
813 	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
814 
815 	/*
816 	 * First need to unbind all the threads currently bound to the
817 	 * partition.  Then do the actual destroy (which moves the CPUs).
818 	 */
819 	mutex_enter(&pidlock);
820 	t = curthread;
821 	do {
822 		if (t->t_bind_pset == psid) {
823 again:			p = ttoproc(t);
824 			mutex_enter(&p->p_lock);
825 			if (ttoproc(t) != p) {
826 				/*
827 				 * lwp_exit has changed this thread's process
828 				 * pointer before we grabbed its p_lock.
829 				 */
830 				mutex_exit(&p->p_lock);
831 				goto again;
832 			}
833 			err = cpupart_bind_thread(t, PS_NONE, 1,
834 			    projbuf, zonebuf);
835 			if (err) {
836 				mutex_exit(&p->p_lock);
837 				mutex_exit(&pidlock);
838 				mutex_exit(&cpu_lock);
839 				fss_freebuf(projbuf, FSS_ALLOC_PROJ);
840 				fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
841 				return (err);
842 			}
843 			t->t_bind_pset = PS_NONE;
844 			mutex_exit(&p->p_lock);
845 		}
846 		t = t->t_next;
847 	} while (t != curthread);
848 
849 	mutex_exit(&pidlock);
850 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
851 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
852 
853 	newpp = &cp_default;
854 	while ((cp = pp->cp_cpulist) != NULL) {
855 		if (err = cpupart_move_cpu(cp, newpp, 0)) {
856 			mutex_exit(&cpu_lock);
857 			return (err);
858 		}
859 	}
860 
861 	ASSERT(CHIP_SET_ISNULL(pp->cp_chipset));
862 	ASSERT(CPUSET_ISNULL(pp->cp_haltset));
863 
864 	/*
865 	 * Reset the pointers in any offline processors so they won't
866 	 * try to rejoin the destroyed partition when they're turned
867 	 * online.
868 	 */
869 	first_cp = cp = CPU;
870 	do {
871 		if (cp->cpu_part == pp) {
872 			ASSERT(cp->cpu_flags & CPU_OFFLINE);
873 			cp->cpu_part = newpp;
874 		}
875 		cp = cp->cpu_next;
876 	} while (cp != first_cp);
877 
878 	/*
879 	 * Pause all CPUs while changing the partition list, to make sure
880 	 * the clock thread (which traverses the list without holding
881 	 * cpu_lock) isn't running.
882 	 */
883 	pause_cpus(NULL);
884 	pp->cp_prev->cp_next = pp->cp_next;
885 	pp->cp_next->cp_prev = pp->cp_prev;
886 	if (cp_list_head == pp)
887 		cp_list_head = pp->cp_next;
888 	start_cpus();
889 
890 	if (cp_id_next > pp->cp_id)
891 		cp_id_next = pp->cp_id;
892 
893 	if (pp->cp_kstat)
894 		kstat_delete(pp->cp_kstat);
895 
896 	cp_numparts--;
897 
898 	disp_kp_free(&pp->cp_kp_queue);
899 	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
900 	pp->cp_lgrploads = NULL;
901 	kmem_free(pp, sizeof (cpupart_t));
902 	mutex_exit(&cpu_lock);
903 
904 	return (err);
905 }
906 
907 
908 /*
909  * Return the ID of the partition to which the specified processor belongs.
910  */
911 psetid_t
912 cpupart_query_cpu(cpu_t *cp)
913 {
914 	ASSERT(MUTEX_HELD(&cpu_lock));
915 
916 	return (CPTOPS(cp->cpu_part->cp_id));
917 }
918 
919 
920 /*
921  * Attach a processor to an existing partition.
922  */
923 int
924 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
925 {
926 	cpupart_t	*pp;
927 	int		err;
928 
929 	ASSERT(pool_lock_held());
930 	ASSERT(MUTEX_HELD(&cpu_lock));
931 
932 	pp = cpupart_find(psid);
933 	if (pp == NULL)
934 		return (EINVAL);
935 	if (cp->cpu_flags & CPU_OFFLINE)
936 		return (EINVAL);
937 
938 	err = cpupart_move_cpu(cp, pp, forced);
939 	return (err);
940 }
941 
942 /*
943  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
944  * this just checks for a valid partition.  If numcpus is non-NULL but
945  * cpulist is NULL, the current number of cpus is stored in *numcpus.
946  * If both are non-NULL, the current number of cpus is stored in *numcpus,
947  * and a list of those cpus up to the size originally in *numcpus is
948  * stored in cpulist[].  Also, store the processor set id in *psid.
949  * This is useful in case the processor set id passed in was PS_MYID.
950  */
951 int
952 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
953 {
954 	cpupart_t	*pp;
955 	uint_t		ncpus;
956 	cpu_t		*c;
957 	int		i;
958 
959 	mutex_enter(&cpu_lock);
960 	pp = cpupart_find(*psid);
961 	if (pp == NULL) {
962 		mutex_exit(&cpu_lock);
963 		return (EINVAL);
964 	}
965 	*psid = CPTOPS(pp->cp_id);
966 	ncpus = pp->cp_ncpus;
967 	if (numcpus) {
968 		if (ncpus > *numcpus) {
969 			/*
970 			 * Only copy as many cpus as were passed in, but
971 			 * pass back the real number.
972 			 */
973 			uint_t t = ncpus;
974 			ncpus = *numcpus;
975 			*numcpus = t;
976 		} else
977 			*numcpus = ncpus;
978 
979 		if (cpulist) {
980 			c = pp->cp_cpulist;
981 			for (i = 0; i < ncpus; i++) {
982 				ASSERT(c != NULL);
983 				cpulist[i] = c->cpu_id;
984 				c = c->cpu_next_part;
985 			}
986 		}
987 	}
988 	mutex_exit(&cpu_lock);
989 	return (0);
990 }
991 
992 /*
993  * Reallocate kpreempt queues for each CPU partition.  Called from
994  * disp_setup when a new scheduling class is loaded that increases the
995  * number of priorities in the system.
996  */
997 void
998 cpupart_kpqalloc(pri_t npri)
999 {
1000 	cpupart_t *cpp;
1001 
1002 	ASSERT(MUTEX_HELD(&cpu_lock));
1003 	cpp = cp_list_head;
1004 	do {
1005 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1006 		cpp = cpp->cp_next;
1007 	} while (cpp != cp_list_head);
1008 }
1009 
1010 int
1011 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1012 {
1013 	cpupart_t *cp;
1014 	int i;
1015 
1016 	ASSERT(nelem >= 0);
1017 	ASSERT(nelem <= LOADAVG_NSTATS);
1018 	ASSERT(MUTEX_HELD(&cpu_lock));
1019 
1020 	cp = cpupart_find(psid);
1021 	if (cp == NULL)
1022 		return (EINVAL);
1023 	for (i = 0; i < nelem; i++)
1024 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1025 
1026 	return (0);
1027 }
1028 
1029 
1030 uint_t
1031 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1032 {
1033 	uint_t numpart = 0;
1034 	cpupart_t *cp;
1035 
1036 	ASSERT(MUTEX_HELD(&cpu_lock));
1037 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1038 
1039 	if (list != NULL) {
1040 		cp = cp_list_head;
1041 		do {
1042 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1043 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1044 				if (numpart == nelem)
1045 					break;
1046 				list[numpart++] = CPTOPS(cp->cp_id);
1047 			}
1048 			cp = cp->cp_next;
1049 		} while (cp != cp_list_head);
1050 	}
1051 
1052 	ASSERT(numpart < cp_numparts);
1053 
1054 	if (flag == CP_ALL)
1055 		numpart = cp_numparts - 1; /* leave out default partition */
1056 	else if (flag == CP_NONEMPTY)
1057 		numpart = cp_numparts_nonempty;
1058 
1059 	return (numpart);
1060 }
1061 
1062 int
1063 cpupart_setattr(psetid_t psid, uint_t attr)
1064 {
1065 	cpupart_t *cp;
1066 
1067 	ASSERT(pool_lock_held());
1068 
1069 	mutex_enter(&cpu_lock);
1070 	if ((cp = cpupart_find(psid)) == NULL) {
1071 		mutex_exit(&cpu_lock);
1072 		return (EINVAL);
1073 	}
1074 	/*
1075 	 * PSET_NOESCAPE attribute for default cpu partition is always set
1076 	 */
1077 	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1078 		mutex_exit(&cpu_lock);
1079 		return (EINVAL);
1080 	}
1081 	cp->cp_attr = attr;
1082 	mutex_exit(&cpu_lock);
1083 	return (0);
1084 }
1085 
1086 int
1087 cpupart_getattr(psetid_t psid, uint_t *attrp)
1088 {
1089 	cpupart_t *cp;
1090 
1091 	mutex_enter(&cpu_lock);
1092 	if ((cp = cpupart_find(psid)) == NULL) {
1093 		mutex_exit(&cpu_lock);
1094 		return (EINVAL);
1095 	}
1096 	*attrp = cp->cp_attr;
1097 	mutex_exit(&cpu_lock);
1098 	return (0);
1099 }
1100