xref: /titanic_51/usr/src/uts/common/disp/cpupart.c (revision b86efd96f8acd85ddaa930a2f0c1d664237e4aaf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/systm.h>
30 #include <sys/cmn_err.h>
31 #include <sys/cpuvar.h>
32 #include <sys/thread.h>
33 #include <sys/disp.h>
34 #include <sys/kmem.h>
35 #include <sys/debug.h>
36 #include <sys/cpupart.h>
37 #include <sys/pset.h>
38 #include <sys/var.h>
39 #include <sys/cyclic.h>
40 #include <sys/lgrp.h>
41 #include <sys/chip.h>
42 #include <sys/loadavg.h>
43 #include <sys/class.h>
44 #include <sys/fss.h>
45 #include <sys/pool.h>
46 #include <sys/pool_pset.h>
47 #include <sys/policy.h>
48 
49 /*
50  * Calling pool_lock() protects the pools configuration, which includes
51  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
52  * partitions from being created or destroyed while the lock is held.
53  * The lock ordering with respect to related locks is:
54  *
55  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
56  *
57  * Blocking memory allocations may be made while holding "pool_lock"
58  * or cpu_lock.
59  */
60 
61 /*
62  * The cp_default partition is allocated statically, but its lgroup load average
63  * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
64  * saves some memory since the space allocated reflects the actual number of
65  * lgroups supported by the platform. The lgrp facility provides a temporary
66  * space to hold lpl information during system bootstrap.
67  */
68 
/* Head of the circular, doubly-linked list of all CPU partitions. */
cpupart_t		*cp_list_head;
/* The statically allocated default partition (ID CP_DEFAULT). */
cpupart_t		cp_default;
/*
 * Machine-dependent partition state.  Presumably the backing store for
 * cp_default.cp_mach; nothing in this file makes that assignment, so it
 * is expected to be done elsewhere -- confirm against the CPU setup code.
 */
struct mach_cpupart	cp_default_mach;
/* First partition ID to try when the next partition is created. */
static cpupartid_t	cp_id_next;
/* Number of partitions on the list, including cp_default. */
uint_t			cp_numparts;
/* Number of partitions that currently contain at least one CPU. */
uint_t			cp_numparts_nonempty;
75 
/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * If this is still zero when cpupart_initialize_default() runs (i.e. it
 * was not tuned via /etc/system), it is set to max_ncpus * 2 + 1.
 * cpupart_create() fails with ENOMEM once cp_numparts reaches this value.
 */
static uint_t		cp_max_numparts;
85 
86 /*
87  * Processor sets and CPU partitions are different but related concepts.
88  * A processor set is a user-level abstraction allowing users to create
89  * sets of CPUs and bind threads exclusively to those sets.  A CPU
90  * partition is a kernel dispatcher object consisting of a set of CPUs
91  * and a global dispatch queue.  The processor set abstraction is
92  * implemented via a CPU partition, and currently there is a 1-1
93  * mapping between processor sets and partitions (excluding the default
94  * partition, which is not visible as a processor set).  Hence, the
95  * numbering for processor sets and CPU partitions is identical.  This
96  * may not always be true in the future, and these macros could become
97  * less trivial if we support e.g. a processor set containing multiple
98  * CPU partitions.
99  */
/* Processor set ID to partition ID: PS_NONE maps to CP_DEFAULT. */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* Partition ID to processor set ID: CP_DEFAULT maps to PS_NONE. */
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
102 
103 /*
104  * Find a CPU partition given a processor set ID.
105  */
106 static cpupart_t *
107 cpupart_find_all(psetid_t psid)
108 {
109 	cpupart_t *cp;
110 	cpupartid_t cpid = PSTOCP(psid);
111 
112 	ASSERT(MUTEX_HELD(&cpu_lock));
113 
114 	/* default partition not visible as a processor set */
115 	if (psid == CP_DEFAULT)
116 		return (NULL);
117 
118 	if (psid == PS_MYID)
119 		return (curthread->t_cpupart);
120 
121 	cp = cp_list_head;
122 	do {
123 		if (cp->cp_id == cpid)
124 			return (cp);
125 		cp = cp->cp_next;
126 	} while (cp != cp_list_head);
127 	return (NULL);
128 }
129 
130 /*
131  * Find a CPU partition given a processor set ID if the processor set
132  * should be visible from the calling zone.
133  */
134 cpupart_t *
135 cpupart_find(psetid_t psid)
136 {
137 	cpupart_t *cp;
138 
139 	ASSERT(MUTEX_HELD(&cpu_lock));
140 	cp = cpupart_find_all(psid);
141 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
142 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
143 			return (NULL);
144 	return (cp);
145 }
146 
147 static int
148 cpupart_kstat_update(kstat_t *ksp, int rw)
149 {
150 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
151 	cpupart_kstat_t *cpksp = ksp->ks_data;
152 
153 	if (rw == KSTAT_WRITE)
154 		return (EACCES);
155 
156 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
157 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
158 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
159 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
160 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
161 	    (16 - FSHIFT);
162 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
163 	    (16 - FSHIFT);
164 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
165 	    (16 - FSHIFT);
166 	return (0);
167 }
168 
169 static void
170 cpupart_kstat_create(cpupart_t *cp)
171 {
172 	kstat_t *ksp;
173 	zoneid_t zoneid;
174 
175 	ASSERT(MUTEX_HELD(&cpu_lock));
176 
177 	/*
178 	 * We have a bit of a chicken-egg problem since this code will
179 	 * get called to create the kstats for CP_DEFAULT before the
180 	 * pools framework gets initialized.  We circumvent the problem
181 	 * by special-casing cp_default.
182 	 */
183 	if (cp != &cp_default && pool_pset_enabled())
184 		zoneid = GLOBAL_ZONEID;
185 	else
186 		zoneid = ALL_ZONES;
187 	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
188 	    KSTAT_TYPE_NAMED,
189 	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
190 	if (ksp != NULL) {
191 		cpupart_kstat_t *cpksp = ksp->ks_data;
192 
193 		kstat_named_init(&cpksp->cpk_updates, "updates",
194 		    KSTAT_DATA_UINT64);
195 		kstat_named_init(&cpksp->cpk_runnable, "runnable",
196 		    KSTAT_DATA_UINT64);
197 		kstat_named_init(&cpksp->cpk_waiting, "waiting",
198 		    KSTAT_DATA_UINT64);
199 		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
200 		    KSTAT_DATA_UINT32);
201 		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
202 		    KSTAT_DATA_UINT32);
203 		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
204 		    KSTAT_DATA_UINT32);
205 		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
206 		    KSTAT_DATA_UINT32);
207 
208 		ksp->ks_update = cpupart_kstat_update;
209 		ksp->ks_private = cp;
210 
211 		kstat_install(ksp);
212 	}
213 	cp->cp_kstat = ksp;
214 }
215 
216 /*
217  * Initialize the default partition and kpreempt disp queue.
218  */
219 void
220 cpupart_initialize_default(void)
221 {
222 	lgrp_id_t i;
223 
224 	cp_list_head = &cp_default;
225 	cp_default.cp_next = &cp_default;
226 	cp_default.cp_prev = &cp_default;
227 	cp_default.cp_id = CP_DEFAULT;
228 	cp_default.cp_kp_queue.disp_maxrunpri = -1;
229 	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
230 	cp_default.cp_kp_queue.disp_cpu = NULL;
231 	cp_default.cp_gen = 0;
232 	cp_default.cp_loadavg.lg_cur = 0;
233 	cp_default.cp_loadavg.lg_len = 0;
234 	cp_default.cp_loadavg.lg_total = 0;
235 	for (i = 0; i < S_LOADAVG_SZ; i++) {
236 		cp_default.cp_loadavg.lg_loads[i] = 0;
237 	}
238 	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
239 	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
240 	cp_id_next = CP_DEFAULT + 1;
241 	cpupart_kstat_create(&cp_default);
242 	cp_numparts = 1;
243 	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
244 		cp_max_numparts = max_ncpus * 2 + 1;
245 	/*
246 	 * Allocate space for cp_default list of lgrploads
247 	 */
248 	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
249 	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
250 	    cp_default.cp_nlgrploads, KM_SLEEP);
251 
252 	/*
253 	 * The initial lpl topology is created in a special lpl list
254 	 * lpl_bootstrap. It should be copied to cp_default.
255 	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
256 	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
257 	 */
258 	lpl_topo_bootstrap(cp_default.cp_lgrploads,
259 	    cp_default.cp_nlgrploads);
260 
261 	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
262 		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
263 	}
264 	cp_default.cp_attr = PSET_NOESCAPE;
265 	cp_numparts_nonempty = 1;
266 	/*
267 	 * Set t0's home
268 	 */
269 	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
270 }
271 
272 
273 static int
274 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
275 {
276 	cpupart_t *oldpp;
277 	cpu_t	*ncp, *newlist;
278 	kthread_t *t;
279 	int	move_threads = 1;
280 	lgrp_id_t lgrpid;
281 	proc_t 	*p;
282 	int lgrp_diff_lpl;
283 	lpl_t	*cpu_lpl;
284 	int	ret;
285 
286 	ASSERT(MUTEX_HELD(&cpu_lock));
287 	ASSERT(newpp != NULL);
288 
289 	oldpp = cp->cpu_part;
290 	ASSERT(oldpp != NULL);
291 	ASSERT(oldpp->cp_ncpus > 0);
292 
293 	if (newpp == oldpp) {
294 		/*
295 		 * Don't need to do anything.
296 		 */
297 		return (0);
298 	}
299 
300 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
301 
302 	if (!disp_bound_partition(cp, 0)) {
303 		/*
304 		 * Don't need to move threads if there are no threads in
305 		 * the partition.  Note that threads can't enter the
306 		 * partition while we're holding cpu_lock.
307 		 */
308 		move_threads = 0;
309 	} else if (oldpp->cp_ncpus == 1) {
310 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
311 		return (EBUSY);
312 	}
313 
314 	if (forced && (ret = cpu_unbind(cp->cpu_id)) != 0) {
315 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
316 		return (ret);
317 	}
318 
319 	/*
320 	 * Stop further threads weak binding to this cpu.
321 	 */
322 	cpu_inmotion = cp;
323 	membar_enter();
324 
325 again:
326 	if (move_threads) {
327 		int loop_count;
328 		/*
329 		 * Check for threads strong or weak bound to this CPU.
330 		 */
331 		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
332 			if (loop_count >= 5) {
333 				cpu_state_change_notify(cp->cpu_id,
334 				    CPU_CPUPART_IN);
335 				cpu_inmotion = NULL;
336 				return (EBUSY);	/* some threads still bound */
337 			}
338 			delay(1);
339 		}
340 	}
341 
342 	/*
343 	 * Before we actually start changing data structures, notify
344 	 * the cyclic subsystem that we want to move this CPU out of its
345 	 * partition.
346 	 */
347 	if (!cyclic_move_out(cp)) {
348 		/*
349 		 * This CPU must be the last CPU in a processor set with
350 		 * a bound cyclic.
351 		 */
352 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
353 		cpu_inmotion = NULL;
354 		return (EBUSY);
355 	}
356 
357 	pause_cpus(cp);
358 
359 	if (move_threads) {
360 		/*
361 		 * The thread on cpu before the pause thread may have read
362 		 * cpu_inmotion before we raised the barrier above.  Check
363 		 * again.
364 		 */
365 		if (disp_bound_threads(cp, 1)) {
366 			start_cpus();
367 			goto again;
368 		}
369 
370 	}
371 
372 	/*
373 	 * Update the set of chip's being spanned
374 	 */
375 	chip_cpu_move_part(cp, oldpp, newpp);
376 
377 	/* save this cpu's lgroup -- it'll be the same in the new partition */
378 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
379 
380 	cpu_lpl = cp->cpu_lpl;
381 	/*
382 	 * let the lgroup framework know cp has left the partition
383 	 */
384 	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
385 
386 	/* move out of old partition */
387 	oldpp->cp_ncpus--;
388 	if (oldpp->cp_ncpus > 0) {
389 
390 		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
391 		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
392 		if (oldpp->cp_cpulist == cp) {
393 			oldpp->cp_cpulist = ncp;
394 		}
395 	} else {
396 		ncp = oldpp->cp_cpulist = NULL;
397 		cp_numparts_nonempty--;
398 		ASSERT(cp_numparts_nonempty != 0);
399 	}
400 	oldpp->cp_gen++;
401 
402 	/* move into new partition */
403 	newlist = newpp->cp_cpulist;
404 	if (newlist == NULL) {
405 		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
406 		cp_numparts_nonempty++;
407 		ASSERT(cp_numparts_nonempty != 0);
408 	} else {
409 		cp->cpu_next_part = newlist;
410 		cp->cpu_prev_part = newlist->cpu_prev_part;
411 		newlist->cpu_prev_part->cpu_next_part = cp;
412 		newlist->cpu_prev_part = cp;
413 	}
414 	cp->cpu_part = newpp;
415 	newpp->cp_ncpus++;
416 	newpp->cp_gen++;
417 
418 	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
419 	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));
420 
421 	/*
422 	 * let the lgroup framework know cp has entered the partition
423 	 */
424 	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
425 
426 	/*
427 	 * If necessary, move threads off processor.
428 	 */
429 	if (move_threads) {
430 		ASSERT(ncp != NULL);
431 
432 		/*
433 		 * Walk thru the active process list to look for
434 		 * threads that need to have a new home lgroup,
435 		 * or the last CPU they run on is the same CPU
436 		 * being moved out of the partition.
437 		 */
438 
439 		for (p = practive; p != NULL; p = p->p_next) {
440 
441 			t = p->p_tlist;
442 
443 			if (t == NULL)
444 				continue;
445 
446 			lgrp_diff_lpl = 0;
447 
448 			do {
449 
450 				ASSERT(t->t_lpl != NULL);
451 
452 				/*
453 				 * Update the count of how many threads are
454 				 * in this CPU's lgroup but have a different lpl
455 				 */
456 
457 				if (t->t_lpl != cpu_lpl &&
458 				    t->t_lpl->lpl_lgrpid == lgrpid)
459 					lgrp_diff_lpl++;
460 				/*
461 				 * If the lgroup that t is assigned to no
462 				 * longer has any CPUs in t's partition,
463 				 * we'll have to choose a new lgroup for t.
464 				 */
465 
466 				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
467 				    t->t_cpupart)) {
468 					lgrp_move_thread(t,
469 					    lgrp_choose(t, t->t_cpupart), 0);
470 				}
471 
472 				/*
473 				 * make sure lpl points to our own partition
474 				 */
475 				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
476 				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
477 					t->t_cpupart->cp_nlgrploads));
478 
479 				ASSERT(t->t_lpl->lpl_ncpu > 0);
480 
481 				/* Update CPU last ran on if it was this CPU */
482 				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
483 				    t->t_bound_cpu != cp) {
484 					t->t_cpu = disp_lowpri_cpu(ncp,
485 					    t->t_lpl, t->t_pri, NULL);
486 				}
487 				t = t->t_forw;
488 			} while (t != p->p_tlist);
489 
490 			/*
491 			 * Didn't find any threads in the same lgroup as this
492 			 * CPU with a different lpl, so remove the lgroup from
493 			 * the process lgroup bitmask.
494 			 */
495 
496 			if (lgrp_diff_lpl)
497 				klgrpset_del(p->p_lgrpset, lgrpid);
498 		}
499 
500 		/*
501 		 * Walk thread list looking for threads that need to be
502 		 * rehomed, since there are some threads that are not in
503 		 * their process's p_tlist.
504 		 */
505 
506 		t = curthread;
507 
508 		do {
509 			ASSERT(t != NULL && t->t_lpl != NULL);
510 
511 			/*
512 			 * If the lgroup that t is assigned to no
513 			 * longer has any CPUs in t's partition,
514 			 * we'll have to choose a new lgroup for t.
515 			 * Also, choose best lgroup for home when
516 			 * thread has specified lgroup affinities,
517 			 * since there may be an lgroup with more
518 			 * affinity available after moving CPUs
519 			 * around.
520 			 */
521 			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
522 			    t->t_cpupart) || t->t_lgrp_affinity) {
523 				lgrp_move_thread(t,
524 				    lgrp_choose(t, t->t_cpupart), 1);
525 			}
526 
527 			/* make sure lpl points to our own partition */
528 			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
529 			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
530 				t->t_cpupart->cp_nlgrploads));
531 
532 			ASSERT(t->t_lpl->lpl_ncpu > 0);
533 
534 			/* Update CPU last ran on if it was this CPU */
535 			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
536 			    t->t_bound_cpu != cp) {
537 				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
538 				    t->t_pri, NULL);
539 			}
540 
541 			t = t->t_next;
542 		} while (t != curthread);
543 
544 		/*
545 		 * Clear off the CPU's run queue, and the kp queue if the
546 		 * partition is now empty.
547 		 */
548 		disp_cpu_inactive(cp);
549 
550 		/*
551 		 * Make cp switch to a thread from the new partition.
552 		 */
553 		cp->cpu_runrun = 1;
554 		cp->cpu_kprunrun = 1;
555 	}
556 
557 	cpu_inmotion = NULL;
558 	start_cpus();
559 
560 	/*
561 	 * Let anyone interested know that cpu has been added to the set.
562 	 */
563 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
564 
565 	/*
566 	 * Now let the cyclic subsystem know that it can reshuffle cyclics
567 	 * bound to the new processor set.
568 	 */
569 	cyclic_move_in(cp);
570 
571 	return (0);
572 }
573 
574 /*
575  * Check if thread can be moved to a new cpu partition.  Called by
576  * cpupart_move_thread() and pset_bind_start().
577  */
578 int
579 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
580 {
581 	ASSERT(MUTEX_HELD(&cpu_lock));
582 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
583 	ASSERT(cp != NULL);
584 	ASSERT(THREAD_LOCK_HELD(tp));
585 
586 	/*
587 	 * CPU-bound threads can't be moved.
588 	 */
589 	if (!ignore) {
590 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
591 		    tp->t_weakbound_cpu;
592 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
593 			return (EBUSY);
594 	}
595 	return (0);
596 }
597 
598 /*
599  * Move thread to new partition.  If ignore is non-zero, then CPU
600  * bindings should be ignored (this is used when destroying a
601  * partition).
602  */
603 static int
604 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
605     void *projbuf, void *zonebuf)
606 {
607 	cpupart_t *oldpp = tp->t_cpupart;
608 	int ret;
609 
610 	ASSERT(MUTEX_HELD(&cpu_lock));
611 	ASSERT(MUTEX_HELD(&pidlock));
612 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
613 	ASSERT(newpp != NULL);
614 
615 	if (newpp->cp_cpulist == NULL)
616 		return (EINVAL);
617 
618 	/*
619 	 * Check for errors first.
620 	 */
621 	thread_lock(tp);
622 	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
623 		thread_unlock(tp);
624 		return (ret);
625 	}
626 
627 	/* move the thread */
628 	if (oldpp != newpp) {
629 		/*
630 		 * Make the thread switch to the new partition.
631 		 */
632 		tp->t_cpupart = newpp;
633 		ASSERT(tp->t_lpl != NULL);
634 		/*
635 		 * Leave the thread on the same lgroup if possible; otherwise
636 		 * choose a new lgroup for it.  In either case, update its
637 		 * t_lpl.
638 		 */
639 		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
640 		    tp->t_lgrp_affinity == NULL) {
641 			/*
642 			 * The thread's lgroup has CPUs in the thread's new
643 			 * partition, so the thread can stay assigned to the
644 			 * same lgroup.  Update its t_lpl to point to the
645 			 * lpl_t for its lgroup in its new partition.
646 			 */
647 			lgrp_move_thread(tp, &tp->t_cpupart->\
648 			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
649 		} else {
650 			/*
651 			 * The thread's lgroup has no cpus in its new
652 			 * partition or it has specified lgroup affinities,
653 			 * so choose the best lgroup for the thread and
654 			 * assign it to that lgroup.
655 			 */
656 			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
657 			    1);
658 		}
659 		/*
660 		 * make sure lpl points to our own partition
661 		 */
662 		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
663 		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
664 			tp->t_cpupart->cp_nlgrploads));
665 
666 		ASSERT(tp->t_lpl->lpl_ncpu > 0);
667 
668 		if (tp->t_state == TS_ONPROC) {
669 			cpu_surrender(tp);
670 		} else if (tp->t_state == TS_RUN) {
671 			(void) dispdeq(tp);
672 			setbackdq(tp);
673 		}
674 	}
675 
676 	/*
677 	 * Our binding has changed; set TP_CHANGEBIND.
678 	 */
679 	tp->t_proc_flag |= TP_CHANGEBIND;
680 	aston(tp);
681 
682 	thread_unlock(tp);
683 	fss_changepset(tp, newpp, projbuf, zonebuf);
684 
685 	return (0);		/* success */
686 }
687 
688 
689 /*
690  * This function binds a thread to a partition.  Must be called with the
691  * p_lock of the containing process held (to keep the thread from going
692  * away), and thus also with cpu_lock held (since cpu_lock must be
693  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
694  * should be ignored (this is used when destroying a partition).
695  */
696 int
697 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
698     void *zonebuf)
699 {
700 	cpupart_t	*newpp;
701 
702 	ASSERT(pool_lock_held());
703 	ASSERT(MUTEX_HELD(&cpu_lock));
704 	ASSERT(MUTEX_HELD(&pidlock));
705 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
706 
707 	if (psid == PS_NONE)
708 		newpp = &cp_default;
709 	else {
710 		newpp = cpupart_find(psid);
711 		if (newpp == NULL) {
712 			return (EINVAL);
713 		}
714 	}
715 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
716 }
717 
718 
719 /*
720  * Create a new partition.  On MP systems, this also allocates a
721  * kpreempt disp queue for that partition.
722  */
723 int
724 cpupart_create(psetid_t *psid)
725 {
726 	cpupart_t	*pp;
727 	lgrp_id_t	i;
728 
729 	ASSERT(pool_lock_held());
730 
731 	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
732 	pp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
733 	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
734 	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
735 	    KM_SLEEP);
736 
737 	mutex_enter(&cpu_lock);
738 	if (cp_numparts == cp_max_numparts) {
739 		mutex_exit(&cpu_lock);
740 		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
741 		pp->cp_lgrploads = NULL;
742 		kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
743 		kmem_free(pp, sizeof (cpupart_t));
744 		return (ENOMEM);
745 	}
746 	cp_numparts++;
747 	/* find the next free partition ID */
748 	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
749 		cp_id_next++;
750 	pp->cp_id = cp_id_next++;
751 	pp->cp_ncpus = 0;
752 	pp->cp_cpulist = NULL;
753 	pp->cp_attr = 0;
754 	klgrpset_clear(pp->cp_lgrpset);
755 	pp->cp_kp_queue.disp_maxrunpri = -1;
756 	pp->cp_kp_queue.disp_max_unbound_pri = -1;
757 	pp->cp_kp_queue.disp_cpu = NULL;
758 	pp->cp_gen = 0;
759 	CPUSET_ZERO(pp->cp_mach->mc_haltset);
760 	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
761 	*psid = CPTOPS(pp->cp_id);
762 	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
763 	cpupart_kstat_create(pp);
764 	for (i = 0; i < pp->cp_nlgrploads; i++) {
765 		pp->cp_lgrploads[i].lpl_lgrpid = i;
766 	}
767 	CHIP_SET_ZERO(pp->cp_mach->mc_chipset);
768 
769 	/*
770 	 * Pause all CPUs while changing the partition list, to make sure
771 	 * the clock thread (which traverses the list without holding
772 	 * cpu_lock) isn't running.
773 	 */
774 	pause_cpus(NULL);
775 	pp->cp_next = cp_list_head;
776 	pp->cp_prev = cp_list_head->cp_prev;
777 	cp_list_head->cp_prev->cp_next = pp;
778 	cp_list_head->cp_prev = pp;
779 	start_cpus();
780 	mutex_exit(&cpu_lock);
781 
782 	return (0);
783 }
784 
785 
786 /*
787  * Destroy a partition.
788  */
789 int
790 cpupart_destroy(psetid_t psid)
791 {
792 	cpu_t	*cp, *first_cp;
793 	cpupart_t *pp, *newpp;
794 	int	err = 0;
795 	void 	*projbuf, *zonebuf;
796 	kthread_t *t;
797 	proc_t	*p;
798 
799 	ASSERT(pool_lock_held());
800 	mutex_enter(&cpu_lock);
801 
802 	pp = cpupart_find(psid);
803 	if (pp == NULL || pp == &cp_default) {
804 		mutex_exit(&cpu_lock);
805 		return (EINVAL);
806 	}
807 
808 	/*
809 	 * Pre-allocate enough buffers for FSS for all active projects and
810 	 * for all active zones on the system.  Unused buffers will be
811 	 * freed later by fss_freebuf().
812 	 */
813 	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
814 	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
815 
816 	/*
817 	 * First need to unbind all the threads currently bound to the
818 	 * partition.  Then do the actual destroy (which moves the CPUs).
819 	 */
820 	mutex_enter(&pidlock);
821 	t = curthread;
822 	do {
823 		if (t->t_bind_pset == psid) {
824 again:			p = ttoproc(t);
825 			mutex_enter(&p->p_lock);
826 			if (ttoproc(t) != p) {
827 				/*
828 				 * lwp_exit has changed this thread's process
829 				 * pointer before we grabbed its p_lock.
830 				 */
831 				mutex_exit(&p->p_lock);
832 				goto again;
833 			}
834 			err = cpupart_bind_thread(t, PS_NONE, 1,
835 			    projbuf, zonebuf);
836 			if (err) {
837 				mutex_exit(&p->p_lock);
838 				mutex_exit(&pidlock);
839 				mutex_exit(&cpu_lock);
840 				fss_freebuf(projbuf, FSS_ALLOC_PROJ);
841 				fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
842 				return (err);
843 			}
844 			t->t_bind_pset = PS_NONE;
845 			mutex_exit(&p->p_lock);
846 		}
847 		t = t->t_next;
848 	} while (t != curthread);
849 
850 	mutex_exit(&pidlock);
851 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
852 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
853 
854 	newpp = &cp_default;
855 	while ((cp = pp->cp_cpulist) != NULL) {
856 		if (err = cpupart_move_cpu(cp, newpp, 0)) {
857 			mutex_exit(&cpu_lock);
858 			return (err);
859 		}
860 	}
861 
862 	ASSERT(CHIP_SET_ISNULL(pp->cp_mach->mc_chipset));
863 	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));
864 
865 	/*
866 	 * Reset the pointers in any offline processors so they won't
867 	 * try to rejoin the destroyed partition when they're turned
868 	 * online.
869 	 */
870 	first_cp = cp = CPU;
871 	do {
872 		if (cp->cpu_part == pp) {
873 			ASSERT(cp->cpu_flags & CPU_OFFLINE);
874 			cp->cpu_part = newpp;
875 		}
876 		cp = cp->cpu_next;
877 	} while (cp != first_cp);
878 
879 	/*
880 	 * Pause all CPUs while changing the partition list, to make sure
881 	 * the clock thread (which traverses the list without holding
882 	 * cpu_lock) isn't running.
883 	 */
884 	pause_cpus(NULL);
885 	pp->cp_prev->cp_next = pp->cp_next;
886 	pp->cp_next->cp_prev = pp->cp_prev;
887 	if (cp_list_head == pp)
888 		cp_list_head = pp->cp_next;
889 	start_cpus();
890 
891 	if (cp_id_next > pp->cp_id)
892 		cp_id_next = pp->cp_id;
893 
894 	if (pp->cp_kstat)
895 		kstat_delete(pp->cp_kstat);
896 
897 	cp_numparts--;
898 
899 	disp_kp_free(&pp->cp_kp_queue);
900 	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
901 	pp->cp_lgrploads = NULL;
902 	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
903 	kmem_free(pp, sizeof (cpupart_t));
904 	mutex_exit(&cpu_lock);
905 
906 	return (err);
907 }
908 
909 
910 /*
911  * Return the ID of the partition to which the specified processor belongs.
912  */
913 psetid_t
914 cpupart_query_cpu(cpu_t *cp)
915 {
916 	ASSERT(MUTEX_HELD(&cpu_lock));
917 
918 	return (CPTOPS(cp->cpu_part->cp_id));
919 }
920 
921 
922 /*
923  * Attach a processor to an existing partition.
924  */
925 int
926 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
927 {
928 	cpupart_t	*pp;
929 	int		err;
930 
931 	ASSERT(pool_lock_held());
932 	ASSERT(MUTEX_HELD(&cpu_lock));
933 
934 	pp = cpupart_find(psid);
935 	if (pp == NULL)
936 		return (EINVAL);
937 	if (cp->cpu_flags & CPU_OFFLINE)
938 		return (EINVAL);
939 
940 	err = cpupart_move_cpu(cp, pp, forced);
941 	return (err);
942 }
943 
944 /*
945  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
946  * this just checks for a valid partition.  If numcpus is non-NULL but
947  * cpulist is NULL, the current number of cpus is stored in *numcpus.
948  * If both are non-NULL, the current number of cpus is stored in *numcpus,
949  * and a list of those cpus up to the size originally in *numcpus is
950  * stored in cpulist[].  Also, store the processor set id in *psid.
951  * This is useful in case the processor set id passed in was PS_MYID.
952  */
953 int
954 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
955 {
956 	cpupart_t	*pp;
957 	uint_t		ncpus;
958 	cpu_t		*c;
959 	int		i;
960 
961 	mutex_enter(&cpu_lock);
962 	pp = cpupart_find(*psid);
963 	if (pp == NULL) {
964 		mutex_exit(&cpu_lock);
965 		return (EINVAL);
966 	}
967 	*psid = CPTOPS(pp->cp_id);
968 	ncpus = pp->cp_ncpus;
969 	if (numcpus) {
970 		if (ncpus > *numcpus) {
971 			/*
972 			 * Only copy as many cpus as were passed in, but
973 			 * pass back the real number.
974 			 */
975 			uint_t t = ncpus;
976 			ncpus = *numcpus;
977 			*numcpus = t;
978 		} else
979 			*numcpus = ncpus;
980 
981 		if (cpulist) {
982 			c = pp->cp_cpulist;
983 			for (i = 0; i < ncpus; i++) {
984 				ASSERT(c != NULL);
985 				cpulist[i] = c->cpu_id;
986 				c = c->cpu_next_part;
987 			}
988 		}
989 	}
990 	mutex_exit(&cpu_lock);
991 	return (0);
992 }
993 
994 /*
995  * Reallocate kpreempt queues for each CPU partition.  Called from
996  * disp_setup when a new scheduling class is loaded that increases the
997  * number of priorities in the system.
998  */
999 void
1000 cpupart_kpqalloc(pri_t npri)
1001 {
1002 	cpupart_t *cpp;
1003 
1004 	ASSERT(MUTEX_HELD(&cpu_lock));
1005 	cpp = cp_list_head;
1006 	do {
1007 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1008 		cpp = cpp->cp_next;
1009 	} while (cpp != cp_list_head);
1010 }
1011 
1012 int
1013 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1014 {
1015 	cpupart_t *cp;
1016 	int i;
1017 
1018 	ASSERT(nelem >= 0);
1019 	ASSERT(nelem <= LOADAVG_NSTATS);
1020 	ASSERT(MUTEX_HELD(&cpu_lock));
1021 
1022 	cp = cpupart_find(psid);
1023 	if (cp == NULL)
1024 		return (EINVAL);
1025 	for (i = 0; i < nelem; i++)
1026 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1027 
1028 	return (0);
1029 }
1030 
1031 
1032 uint_t
1033 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1034 {
1035 	uint_t numpart = 0;
1036 	cpupart_t *cp;
1037 
1038 	ASSERT(MUTEX_HELD(&cpu_lock));
1039 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1040 
1041 	if (list != NULL) {
1042 		cp = cp_list_head;
1043 		do {
1044 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1045 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1046 				if (numpart == nelem)
1047 					break;
1048 				list[numpart++] = CPTOPS(cp->cp_id);
1049 			}
1050 			cp = cp->cp_next;
1051 		} while (cp != cp_list_head);
1052 	}
1053 
1054 	ASSERT(numpart < cp_numparts);
1055 
1056 	if (flag == CP_ALL)
1057 		numpart = cp_numparts - 1; /* leave out default partition */
1058 	else if (flag == CP_NONEMPTY)
1059 		numpart = cp_numparts_nonempty;
1060 
1061 	return (numpart);
1062 }
1063 
1064 int
1065 cpupart_setattr(psetid_t psid, uint_t attr)
1066 {
1067 	cpupart_t *cp;
1068 
1069 	ASSERT(pool_lock_held());
1070 
1071 	mutex_enter(&cpu_lock);
1072 	if ((cp = cpupart_find(psid)) == NULL) {
1073 		mutex_exit(&cpu_lock);
1074 		return (EINVAL);
1075 	}
1076 	/*
1077 	 * PSET_NOESCAPE attribute for default cpu partition is always set
1078 	 */
1079 	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1080 		mutex_exit(&cpu_lock);
1081 		return (EINVAL);
1082 	}
1083 	cp->cp_attr = attr;
1084 	mutex_exit(&cpu_lock);
1085 	return (0);
1086 }
1087 
1088 int
1089 cpupart_getattr(psetid_t psid, uint_t *attrp)
1090 {
1091 	cpupart_t *cp;
1092 
1093 	mutex_enter(&cpu_lock);
1094 	if ((cp = cpupart_find(psid)) == NULL) {
1095 		mutex_exit(&cpu_lock);
1096 		return (EINVAL);
1097 	}
1098 	*attrp = cp->cp_attr;
1099 	mutex_exit(&cpu_lock);
1100 	return (0);
1101 }
1102