xref: /illumos-gate/usr/src/uts/common/disp/cpupart.c (revision e7cbe64f7a72dae5cb44f100db60ca88f3313c65)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/systm.h>
30 #include <sys/cmn_err.h>
31 #include <sys/cpuvar.h>
32 #include <sys/thread.h>
33 #include <sys/disp.h>
34 #include <sys/kmem.h>
35 #include <sys/debug.h>
36 #include <sys/cpupart.h>
37 #include <sys/pset.h>
38 #include <sys/var.h>
39 #include <sys/cyclic.h>
40 #include <sys/lgrp.h>
41 #include <sys/pghw.h>
42 #include <sys/loadavg.h>
43 #include <sys/class.h>
44 #include <sys/fss.h>
45 #include <sys/pool.h>
46 #include <sys/pool_pset.h>
47 #include <sys/policy.h>
48 
49 /*
50  * Calling pool_lock() protects the pools configuration, which includes
51  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
52  * partitions from being created or destroyed while the lock is held.
53  * The lock ordering with respect to related locks is:
54  *
55  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
56  *
57  * Blocking memory allocations may be made while holding "pool_lock"
58  * or cpu_lock.
59  */
60 
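/*
 * As an illustrative sketch only (a hypothetical caller, not code from this
 * file), the acquisition order above would look like:
 *
 *	pool_lock();
 *	mutex_enter(&cpu_lock);
 *	mutex_enter(&pidlock);
 *	mutex_enter(&p->p_lock);	(p is some proc_t of interest)
 *	...
 *	mutex_exit(&p->p_lock);
 *	mutex_exit(&pidlock);
 *	mutex_exit(&cpu_lock);
 *	pool_unlock();
 */
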
61 /*
62  * The cp_default partition is allocated statically, but its lgroup load average
63  * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
64  * saves some memory since the space allocated reflects the actual number of
65  * lgroups supported by the platform. The lgrp facility provides a temporary
66  * space to hold lpl information during system bootstrap.
67  */
68 
69 cpupart_t		*cp_list_head;
70 cpupart_t		cp_default;
71 struct mach_cpupart	cp_default_mach;
72 static cpupartid_t	cp_id_next;
73 uint_t			cp_numparts;
74 uint_t			cp_numparts_nonempty;
75 
76 /*
77  * Need to limit total number of partitions to avoid slowing down the
78  * clock code too much.  The clock code traverses the list of
79  * partitions and needs to be able to execute in a reasonable amount
80  * of time (less than 1/hz seconds).  The maximum is sized based on
81  * max_ncpus so it shouldn't be a problem unless there are large
82  * numbers of empty partitions.
83  */
84 static uint_t		cp_max_numparts;
85 
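/*
 * cpupart_initialize_default() defaults cp_max_numparts to
 * (max_ncpus * 2 + 1), but it may be overridden from /etc/system,
 * e.g. (value purely illustrative):
 *
 *	set cp_max_numparts = 16
 */
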
86 /*
87  * Processor sets and CPU partitions are different but related concepts.
88  * A processor set is a user-level abstraction allowing users to create
89  * sets of CPUs and bind threads exclusively to those sets.  A CPU
90  * partition is a kernel dispatcher object consisting of a set of CPUs
91  * and a global dispatch queue.  The processor set abstraction is
92  * implemented via a CPU partition, and currently there is a 1-1
93  * mapping between processor sets and partitions (excluding the default
94  * partition, which is not visible as a processor set).  Hence, the
95  * numbering for processor sets and CPU partitions is identical.  This
96  * may not always be true in the future, and these macros could become
97  * less trivial if we support e.g. a processor set containing multiple
98  * CPU partitions.
99  */
100 #define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
101 #define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
102 
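/*
 * For example, given the current 1-1 mapping these conversions reduce to
 * identity except for the default partition:
 *
 *	PSTOCP(PS_NONE) == CP_DEFAULT		PSTOCP(1) == 1
 *	CPTOPS(CP_DEFAULT) == PS_NONE		CPTOPS(1) == 1
 */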
103 
104 static int cpupart_unbind_threads(cpupart_t *, boolean_t);
105 
106 /*
107  * Find a CPU partition given a processor set ID.
108  */
109 static cpupart_t *
110 cpupart_find_all(psetid_t psid)
111 {
112 	cpupart_t *cp;
113 	cpupartid_t cpid = PSTOCP(psid);
114 
115 	ASSERT(MUTEX_HELD(&cpu_lock));
116 
117 	/* default partition not visible as a processor set */
118 	if (psid == CP_DEFAULT)
119 		return (NULL);
120 
121 	if (psid == PS_MYID)
122 		return (curthread->t_cpupart);
123 
124 	cp = cp_list_head;
125 	do {
126 		if (cp->cp_id == cpid)
127 			return (cp);
128 		cp = cp->cp_next;
129 	} while (cp != cp_list_head);
130 	return (NULL);
131 }
132 
133 /*
134  * Find a CPU partition given a processor set ID if the processor set
135  * should be visible from the calling zone.
136  */
137 cpupart_t *
138 cpupart_find(psetid_t psid)
139 {
140 	cpupart_t *cp;
141 
142 	ASSERT(MUTEX_HELD(&cpu_lock));
143 	cp = cpupart_find_all(psid);
144 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
145 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
146 			return (NULL);
147 	return (cp);
148 }
149 
150 static int
151 cpupart_kstat_update(kstat_t *ksp, int rw)
152 {
153 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
154 	cpupart_kstat_t *cpksp = ksp->ks_data;
155 
156 	if (rw == KSTAT_WRITE)
157 		return (EACCES);
158 
159 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
160 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
161 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
162 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
163 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
164 	    (16 - FSHIFT);
165 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
166 	    (16 - FSHIFT);
167 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
168 	    (16 - FSHIFT);
169 	return (0);
170 }
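
/*
 * A note on the shifts above: cp_hp_avenrun[] is maintained elsewhere with
 * a 16-bit binary fraction, and shifting right by (16 - FSHIFT) converts it
 * to the FSHIFT-bit fixed point used by avenrun[], so a consumer divides
 * the exported value by FSCALE to recover the load average.  The kstats
 * themselves are published as unix:<pset id>:pset by cpupart_kstat_create()
 * below and can be read with kstat(1M).
 */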
171 
172 static void
173 cpupart_kstat_create(cpupart_t *cp)
174 {
175 	kstat_t *ksp;
176 	zoneid_t zoneid;
177 
178 	ASSERT(MUTEX_HELD(&cpu_lock));
179 
180 	/*
181 	 * We have a bit of a chicken-egg problem since this code will
182 	 * get called to create the kstats for CP_DEFAULT before the
183 	 * pools framework gets initialized.  We circumvent the problem
184 	 * by special-casing cp_default.
185 	 */
186 	if (cp != &cp_default && pool_pset_enabled())
187 		zoneid = GLOBAL_ZONEID;
188 	else
189 		zoneid = ALL_ZONES;
190 	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
191 	    KSTAT_TYPE_NAMED,
192 	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
193 	if (ksp != NULL) {
194 		cpupart_kstat_t *cpksp = ksp->ks_data;
195 
196 		kstat_named_init(&cpksp->cpk_updates, "updates",
197 		    KSTAT_DATA_UINT64);
198 		kstat_named_init(&cpksp->cpk_runnable, "runnable",
199 		    KSTAT_DATA_UINT64);
200 		kstat_named_init(&cpksp->cpk_waiting, "waiting",
201 		    KSTAT_DATA_UINT64);
202 		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
203 		    KSTAT_DATA_UINT32);
204 		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
205 		    KSTAT_DATA_UINT32);
206 		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
207 		    KSTAT_DATA_UINT32);
208 		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
209 		    KSTAT_DATA_UINT32);
210 
211 		ksp->ks_update = cpupart_kstat_update;
212 		ksp->ks_private = cp;
213 
214 		kstat_install(ksp);
215 	}
216 	cp->cp_kstat = ksp;
217 }
218 
219 /*
220  * Initialize the default partition and kpreempt disp queue.
221  */
222 void
223 cpupart_initialize_default(void)
224 {
225 	lgrp_id_t i;
226 
227 	cp_list_head = &cp_default;
228 	cp_default.cp_next = &cp_default;
229 	cp_default.cp_prev = &cp_default;
230 	cp_default.cp_id = CP_DEFAULT;
231 	cp_default.cp_kp_queue.disp_maxrunpri = -1;
232 	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
233 	cp_default.cp_kp_queue.disp_cpu = NULL;
234 	cp_default.cp_gen = 0;
235 	cp_default.cp_loadavg.lg_cur = 0;
236 	cp_default.cp_loadavg.lg_len = 0;
237 	cp_default.cp_loadavg.lg_total = 0;
238 	for (i = 0; i < S_LOADAVG_SZ; i++) {
239 		cp_default.cp_loadavg.lg_loads[i] = 0;
240 	}
241 	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
242 	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
243 	cp_id_next = CP_DEFAULT + 1;
244 	cpupart_kstat_create(&cp_default);
245 	cp_numparts = 1;
246 	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
247 		cp_max_numparts = max_ncpus * 2 + 1;
248 	/*
249 	 * Allocate space for cp_default list of lgrploads
250 	 */
251 	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
252 	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
253 	    cp_default.cp_nlgrploads, KM_SLEEP);
254 
255 	/*
256 	 * The initial lpl topology is created in a special lpl list
257 	 * lpl_bootstrap. It should be copied to cp_default.
258 	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
259 	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
260 	 */
261 	lpl_topo_bootstrap(cp_default.cp_lgrploads,
262 	    cp_default.cp_nlgrploads);
263 
264 	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
265 		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
266 	}
267 	cp_default.cp_attr = PSET_NOESCAPE;
268 	cp_numparts_nonempty = 1;
269 	/*
270 	 * Set t0's home
271 	 */
272 	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
273 
274 	bitset_init(&cp_default.cp_cmt_pgs);
275 }
276 
277 
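/*
 * Move CPU cp from its current partition into newpp.  If `forced' is
 * non-zero, even hard (non-revocable) thread bindings to the CPU are
 * broken; otherwise only soft bindings are revoked.  Returns 0 on
 * success, or an error such as EBUSY if bound threads or bound cyclics
 * prevent the move.
 */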
278 static int
279 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
280 {
281 	cpupart_t *oldpp;
282 	cpu_t	*ncp, *newlist;
283 	kthread_t *t;
284 	int	move_threads = 1;
285 	lgrp_id_t lgrpid;
286 	proc_t 	*p;
287 	int lgrp_diff_lpl;
288 	lpl_t	*cpu_lpl;
289 	int	ret;
290 	boolean_t unbind_all_threads = (forced != 0);
291 
292 	ASSERT(MUTEX_HELD(&cpu_lock));
293 	ASSERT(newpp != NULL);
294 
295 	oldpp = cp->cpu_part;
296 	ASSERT(oldpp != NULL);
297 	ASSERT(oldpp->cp_ncpus > 0);
298 
299 	if (newpp == oldpp) {
300 		/*
301 		 * Don't need to do anything.
302 		 */
303 		return (0);
304 	}
305 
306 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
307 
308 	if (!disp_bound_partition(cp, 0)) {
309 		/*
310 		 * Don't need to move threads if there are no threads in
311 		 * the partition.  Note that threads can't enter the
312 		 * partition while we're holding cpu_lock.
313 		 */
314 		move_threads = 0;
315 	} else if (oldpp->cp_ncpus == 1) {
316 		/*
317 		 * The last CPU is removed from a partition which has threads
318 		 * running in it. Some of these threads may be bound to this
319 		 * CPU.
320 		 *
321 		 * Attempt to unbind threads from the CPU and from the processor
322 		 * set. Note that no threads should be bound to this CPU since
323 		 * cpupart_move_threads will refuse to move bound threads to
324 		 * cpupart_move_thread() will refuse to move bound threads to
325 		 */
326 		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
327 		(void) cpupart_unbind_threads(oldpp, B_FALSE);
328 
329 		if (!disp_bound_partition(cp, 0)) {
330 			/*
331 			 * No bound threads in this partition any more
332 			 */
333 			move_threads = 0;
334 		} else {
335 			/*
336 			 * There are still threads bound to the partition
337 			 */
338 			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
339 			return (EBUSY);
340 		}
341 	}
342 
343 	/*
344 	 * If forced flag is set unbind any threads from this CPU.
345 	 * Otherwise unbind soft-bound threads only.
346 	 */
347 	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
348 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
349 		return (ret);
350 	}
351 
352 	/*
353 	 * Stop further threads from weak binding to this cpu.
354 	 */
355 	cpu_inmotion = cp;
356 	membar_enter();
357 
358 	/*
359 	 * Notify the Processor Groups subsystem that the CPU
360 	 * will be moving cpu partitions. This is done before
361 	 * CPUs are paused to provide an opportunity for any
362 	 * needed memory allocations.
363 	 */
364 	pg_cpupart_out(cp, oldpp);
365 	pg_cpupart_in(cp, newpp);
366 
367 again:
368 	if (move_threads) {
369 		int loop_count;
370 		/*
371 		 * Check for threads strong or weak bound to this CPU.
372 		 */
373 		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
374 			if (loop_count >= 5) {
375 				cpu_state_change_notify(cp->cpu_id,
376 				    CPU_CPUPART_IN);
377 				pg_cpupart_out(cp, newpp);
378 				pg_cpupart_in(cp, oldpp);
379 				cpu_inmotion = NULL;
380 				return (EBUSY);	/* some threads still bound */
381 			}
382 			delay(1);
383 		}
384 	}
385 
386 	/*
387 	 * Before we actually start changing data structures, notify
388 	 * the cyclic subsystem that we want to move this CPU out of its
389 	 * partition.
390 	 */
391 	if (!cyclic_move_out(cp)) {
392 		/*
393 		 * This CPU must be the last CPU in a processor set with
394 		 * a bound cyclic.
395 		 */
396 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
397 		pg_cpupart_out(cp, newpp);
398 		pg_cpupart_in(cp, oldpp);
399 		cpu_inmotion = NULL;
400 		return (EBUSY);
401 	}
402 
403 	pause_cpus(cp);
404 
405 	if (move_threads) {
406 		/*
407 		 * The thread on cpu before the pause thread may have read
408 		 * cpu_inmotion before we raised the barrier above.  Check
409 		 * again.
410 		 */
411 		if (disp_bound_threads(cp, 1)) {
412 			start_cpus();
413 			goto again;
414 		}
415 
416 	}
417 
418 	/*
419 	 * Now that CPUs are paused, let the PG subsystem perform
420 	 * any necessary data structure updates.
421 	 */
422 	pg_cpupart_move(cp, oldpp, newpp);
423 
424 	/* save this cpu's lgroup -- it'll be the same in the new partition */
425 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
426 
427 	cpu_lpl = cp->cpu_lpl;
428 	/*
429 	 * let the lgroup framework know cp has left the partition
430 	 */
431 	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
432 
433 	/* move out of old partition */
434 	oldpp->cp_ncpus--;
435 	if (oldpp->cp_ncpus > 0) {
436 
437 		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
438 		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
439 		if (oldpp->cp_cpulist == cp) {
440 			oldpp->cp_cpulist = ncp;
441 		}
442 	} else {
443 		ncp = oldpp->cp_cpulist = NULL;
444 		cp_numparts_nonempty--;
445 		ASSERT(cp_numparts_nonempty != 0);
446 	}
447 	oldpp->cp_gen++;
448 
449 	/* move into new partition */
450 	newlist = newpp->cp_cpulist;
451 	if (newlist == NULL) {
452 		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
453 		cp_numparts_nonempty++;
454 		ASSERT(cp_numparts_nonempty != 0);
455 	} else {
456 		cp->cpu_next_part = newlist;
457 		cp->cpu_prev_part = newlist->cpu_prev_part;
458 		newlist->cpu_prev_part->cpu_next_part = cp;
459 		newlist->cpu_prev_part = cp;
460 	}
461 	cp->cpu_part = newpp;
462 	newpp->cp_ncpus++;
463 	newpp->cp_gen++;
464 
465 	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
466 	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));
467 
468 	/*
469 	 * let the lgroup framework know cp has entered the partition
470 	 */
471 	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
472 
473 	/*
474 	 * If necessary, move threads off processor.
475 	 */
476 	if (move_threads) {
477 		ASSERT(ncp != NULL);
478 
479 		/*
480 		 * Walk thru the active process list to look for
481 		 * threads that need a new home lgroup, or whose
482 		 * last CPU (t_cpu) is the CPU being moved out of
483 		 * the partition.
484 		 */
485 
486 		for (p = practive; p != NULL; p = p->p_next) {
487 
488 			t = p->p_tlist;
489 
490 			if (t == NULL)
491 				continue;
492 
493 			lgrp_diff_lpl = 0;
494 
495 			do {
496 
497 				ASSERT(t->t_lpl != NULL);
498 
499 				/*
500 				 * Update the count of how many threads are
501 				 * in this CPU's lgroup but have a different lpl
502 				 */
503 
504 				if (t->t_lpl != cpu_lpl &&
505 				    t->t_lpl->lpl_lgrpid == lgrpid)
506 					lgrp_diff_lpl++;
507 				/*
508 				 * If the lgroup that t is assigned to no
509 				 * longer has any CPUs in t's partition,
510 				 * we'll have to choose a new lgroup for t.
511 				 */
512 
513 				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
514 				    t->t_cpupart)) {
515 					lgrp_move_thread(t,
516 					    lgrp_choose(t, t->t_cpupart), 0);
517 				}
518 
519 				/*
520 				 * make sure lpl points to our own partition
521 				 */
522 				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
523 				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
524 					t->t_cpupart->cp_nlgrploads));
525 
526 				ASSERT(t->t_lpl->lpl_ncpu > 0);
527 
528 				/* Update CPU last ran on if it was this CPU */
529 				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
530 				    t->t_bound_cpu != cp) {
531 					t->t_cpu = disp_lowpri_cpu(ncp,
532 					    t->t_lpl, t->t_pri, NULL);
533 				}
534 				t = t->t_forw;
535 			} while (t != p->p_tlist);
536 
537 			/*
538 			 * Didn't find any threads in the same lgroup as this
539 			 * CPU with a different lpl, so remove the lgroup from
540 			 * the process lgroup bitmask.
541 			 */
542 
543 			if (lgrp_diff_lpl == 0)
544 				klgrpset_del(p->p_lgrpset, lgrpid);
545 		}
546 
547 		/*
548 		 * Walk thread list looking for threads that need to be
549 		 * rehomed, since there are some threads that are not in
550 		 * their process's p_tlist.
551 		 */
552 
553 		t = curthread;
554 
555 		do {
556 			ASSERT(t != NULL && t->t_lpl != NULL);
557 
558 			/*
559 			 * If the lgroup that t is assigned to no
560 			 * longer has any CPUs in t's partition,
561 			 * we'll have to choose a new lgroup for t.
562 			 * Also, choose best lgroup for home when
563 			 * thread has specified lgroup affinities,
564 			 * since there may be an lgroup with more
565 			 * affinity available after moving CPUs
566 			 * around.
567 			 */
568 			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
569 			    t->t_cpupart) || t->t_lgrp_affinity) {
570 				lgrp_move_thread(t,
571 				    lgrp_choose(t, t->t_cpupart), 1);
572 			}
573 
574 			/* make sure lpl points to our own partition */
575 			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
576 			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
577 				t->t_cpupart->cp_nlgrploads));
578 
579 			ASSERT(t->t_lpl->lpl_ncpu > 0);
580 
581 			/* Update CPU last ran on if it was this CPU */
582 			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
583 			    t->t_bound_cpu != cp) {
584 				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
585 				    t->t_pri, NULL);
586 			}
587 
588 			t = t->t_next;
589 		} while (t != curthread);
590 
591 		/*
592 		 * Clear off the CPU's run queue, and the kp queue if the
593 		 * partition is now empty.
594 		 */
595 		disp_cpu_inactive(cp);
596 
597 		/*
598 		 * Make cp switch to a thread from the new partition.
599 		 */
600 		cp->cpu_runrun = 1;
601 		cp->cpu_kprunrun = 1;
602 	}
603 
604 	cpu_inmotion = NULL;
605 	start_cpus();
606 
607 	/*
608 	 * Let anyone interested know that cpu has been added to the set.
609 	 */
610 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
611 
612 	/*
613 	 * Now let the cyclic subsystem know that it can reshuffle cyclics
614 	 * bound to the new processor set.
615 	 */
616 	cyclic_move_in(cp);
617 
618 	return (0);
619 }
620 
621 /*
622  * Check if thread can be moved to a new cpu partition.  Called by
623  * cpupart_move_thread() and pset_bind_start().
624  */
625 int
626 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
627 {
628 	ASSERT(MUTEX_HELD(&cpu_lock));
629 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
630 	ASSERT(cp != NULL);
631 	ASSERT(THREAD_LOCK_HELD(tp));
632 
633 	/*
634 	 * CPU-bound threads can't be moved.
635 	 */
636 	if (!ignore) {
637 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
638 		    tp->t_weakbound_cpu;
639 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
640 			return (EBUSY);
641 	}
642 	return (0);
643 }
644 
645 /*
646  * Move thread to new partition.  If ignore is non-zero, then CPU
647  * bindings should be ignored (this is used when destroying a
648  * partition).
649  */
650 static int
651 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
652     void *projbuf, void *zonebuf)
653 {
654 	cpupart_t *oldpp = tp->t_cpupart;
655 	int ret;
656 
657 	ASSERT(MUTEX_HELD(&cpu_lock));
658 	ASSERT(MUTEX_HELD(&pidlock));
659 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
660 	ASSERT(newpp != NULL);
661 
662 	if (newpp->cp_cpulist == NULL)
663 		return (EINVAL);
664 
665 	/*
666 	 * Check for errors first.
667 	 */
668 	thread_lock(tp);
669 	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
670 		thread_unlock(tp);
671 		return (ret);
672 	}
673 
674 	/* move the thread */
675 	if (oldpp != newpp) {
676 		/*
677 		 * Make the thread switch to the new partition.
678 		 */
679 		tp->t_cpupart = newpp;
680 		ASSERT(tp->t_lpl != NULL);
681 		/*
682 		 * Leave the thread on the same lgroup if possible; otherwise
683 		 * choose a new lgroup for it.  In either case, update its
684 		 * t_lpl.
685 		 */
686 		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
687 		    tp->t_lgrp_affinity == NULL) {
688 			/*
689 			 * The thread's lgroup has CPUs in the thread's new
690 			 * partition, so the thread can stay assigned to the
691 			 * same lgroup.  Update its t_lpl to point to the
692 			 * lpl_t for its lgroup in its new partition.
693 			 */
694 			lgrp_move_thread(tp, &tp->t_cpupart->\
695 			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
696 		} else {
697 			/*
698 			 * The thread's lgroup has no cpus in its new
699 			 * partition or it has specified lgroup affinities,
700 			 * so choose the best lgroup for the thread and
701 			 * assign it to that lgroup.
702 			 */
703 			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
704 			    1);
705 		}
706 		/*
707 		 * make sure lpl points to our own partition
708 		 */
709 		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
710 		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
711 			tp->t_cpupart->cp_nlgrploads));
712 
713 		ASSERT(tp->t_lpl->lpl_ncpu > 0);
714 
715 		if (tp->t_state == TS_ONPROC) {
716 			cpu_surrender(tp);
717 		} else if (tp->t_state == TS_RUN) {
718 			(void) dispdeq(tp);
719 			setbackdq(tp);
720 		}
721 	}
722 
723 	/*
724 	 * Our binding has changed; set TP_CHANGEBIND.
725 	 */
726 	tp->t_proc_flag |= TP_CHANGEBIND;
727 	aston(tp);
728 
729 	thread_unlock(tp);
730 	fss_changepset(tp, newpp, projbuf, zonebuf);
731 
732 	return (0);		/* success */
733 }
734 
735 
736 /*
737  * This function binds a thread to a partition.  Must be called with the
738  * p_lock of the containing process held (to keep the thread from going
739  * away), and thus also with cpu_lock held (since cpu_lock must be
740  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
741  * should be ignored (this is used when destroying a partition).
742  */
743 int
744 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
745     void *zonebuf)
746 {
747 	cpupart_t	*newpp;
748 
749 	ASSERT(pool_lock_held());
750 	ASSERT(MUTEX_HELD(&cpu_lock));
751 	ASSERT(MUTEX_HELD(&pidlock));
752 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
753 
754 	if (psid == PS_NONE)
755 		newpp = &cp_default;
756 	else {
757 		newpp = cpupart_find(psid);
758 		if (newpp == NULL) {
759 			return (EINVAL);
760 		}
761 	}
762 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
763 }
764 
765 
766 /*
767  * Create a new partition.  On MP systems, this also allocates a
768  * kpreempt disp queue for that partition.
769  */
770 int
771 cpupart_create(psetid_t *psid)
772 {
773 	cpupart_t	*pp;
774 	lgrp_id_t	i;
775 
776 	ASSERT(pool_lock_held());
777 
778 	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
779 	pp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
780 	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
781 	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
782 	    KM_SLEEP);
783 
784 	mutex_enter(&cpu_lock);
785 	if (cp_numparts == cp_max_numparts) {
786 		mutex_exit(&cpu_lock);
787 		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
788 		pp->cp_lgrploads = NULL;
789 		kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
790 		kmem_free(pp, sizeof (cpupart_t));
791 		return (ENOMEM);
792 	}
793 	cp_numparts++;
794 	/* find the next free partition ID */
795 	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
796 		cp_id_next++;
797 	pp->cp_id = cp_id_next++;
798 	pp->cp_ncpus = 0;
799 	pp->cp_cpulist = NULL;
800 	pp->cp_attr = 0;
801 	klgrpset_clear(pp->cp_lgrpset);
802 	pp->cp_kp_queue.disp_maxrunpri = -1;
803 	pp->cp_kp_queue.disp_max_unbound_pri = -1;
804 	pp->cp_kp_queue.disp_cpu = NULL;
805 	pp->cp_gen = 0;
806 	CPUSET_ZERO(pp->cp_mach->mc_haltset);
807 	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
808 	*psid = CPTOPS(pp->cp_id);
809 	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
810 	cpupart_kstat_create(pp);
811 	for (i = 0; i < pp->cp_nlgrploads; i++) {
812 		pp->cp_lgrploads[i].lpl_lgrpid = i;
813 	}
814 	bitset_init(&pp->cp_cmt_pgs);
815 
816 	/*
817 	 * Pause all CPUs while changing the partition list, to make sure
818 	 * the clock thread (which traverses the list without holding
819 	 * cpu_lock) isn't running.
820 	 */
821 	pause_cpus(NULL);
822 	pp->cp_next = cp_list_head;
823 	pp->cp_prev = cp_list_head->cp_prev;
824 	cp_list_head->cp_prev->cp_next = pp;
825 	cp_list_head->cp_prev = pp;
826 	start_cpus();
827 	mutex_exit(&cpu_lock);
828 
829 	return (0);
830 }
831 
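/*
 * Purely as an illustrative sketch (a hypothetical caller such as the pset
 * system call path, not code from this file), creating a partition and
 * giving it a CPU might look like the following; `cpuid' stands for some
 * online CPU chosen by the caller:
 *
 *	psetid_t psid;
 *	int err;
 *
 *	pool_lock();
 *	if ((err = cpupart_create(&psid)) == 0) {
 *		mutex_enter(&cpu_lock);
 *		err = cpupart_attach_cpu(psid, cpu[cpuid], 0);
 *		mutex_exit(&cpu_lock);
 *	}
 *	pool_unlock();
 */
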
832 /*
833  * Move threads from the specified partition to cp_default.  If `unbind_all'
834  * is set, move all threads; otherwise move only soft-bound threads.
835  */
836 static int
837 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
838 {
839 	void 	*projbuf, *zonebuf;
840 	kthread_t *t;
841 	proc_t	*p;
842 	int	err = 0;
843 	psetid_t psid = pp->cp_id;
844 
845 	ASSERT(pool_lock_held());
846 	ASSERT(MUTEX_HELD(&cpu_lock));
847 
848 	if (pp == NULL || pp == &cp_default) {
849 		return (EINVAL);
850 	}
851 
852 	/*
853 	 * Pre-allocate enough buffers for FSS for all active projects and
854 	 * for all active zones on the system.  Unused buffers will be
855 	 * freed later by fss_freebuf().
856 	 */
857 	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
858 	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
859 
860 	mutex_enter(&pidlock);
861 	t = curthread;
862 	do {
863 		if (t->t_bind_pset == psid) {
864 again:			p = ttoproc(t);
865 			mutex_enter(&p->p_lock);
866 			if (ttoproc(t) != p) {
867 				/*
868 				 * lwp_exit has changed this thread's process
869 				 * pointer before we grabbed its p_lock.
870 				 */
871 				mutex_exit(&p->p_lock);
872 				goto again;
873 			}
874 
875 			/*
876 			 * Can only unbind threads which have revocable binding
877 			 * unless force unbinding requested.
878 			 */
879 			if (unbind_all || TB_PSET_IS_SOFT(t)) {
880 				err = cpupart_bind_thread(t, PS_NONE, 1,
881 				    projbuf, zonebuf);
882 				if (err) {
883 					mutex_exit(&p->p_lock);
884 					mutex_exit(&pidlock);
885 					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
886 					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
887 					return (err);
888 				}
889 				t->t_bind_pset = PS_NONE;
890 			}
891 			mutex_exit(&p->p_lock);
892 		}
893 		t = t->t_next;
894 	} while (t != curthread);
895 
896 	mutex_exit(&pidlock);
897 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
898 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
899 	return (err);
900 }
901 
902 /*
903  * Destroy a partition.
904  */
905 int
906 cpupart_destroy(psetid_t psid)
907 {
908 	cpu_t	*cp, *first_cp;
909 	cpupart_t *pp, *newpp;
910 	int	err = 0;
911 
912 	ASSERT(pool_lock_held());
913 	mutex_enter(&cpu_lock);
914 
915 	pp = cpupart_find(psid);
916 	if (pp == NULL || pp == &cp_default) {
917 		mutex_exit(&cpu_lock);
918 		return (EINVAL);
919 	}
920 
921 	/*
922 	 * Unbind all the threads currently bound to the partition.
923 	 */
924 	err = cpupart_unbind_threads(pp, B_TRUE);
925 	if (err) {
926 		mutex_exit(&cpu_lock);
927 		return (err);
928 	}
929 
930 	newpp = &cp_default;
931 	while ((cp = pp->cp_cpulist) != NULL) {
932 		if (err = cpupart_move_cpu(cp, newpp, 0)) {
933 			mutex_exit(&cpu_lock);
934 			return (err);
935 		}
936 	}
937 
938 	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
939 	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));
940 
941 	/*
942 	 * Teardown the partition's group of active CMT PGs now that
943 	 * all of the CPUs have left.
944 	 */
945 	bitset_fini(&pp->cp_cmt_pgs);
946 
947 	/*
948 	 * Reset the pointers in any offline processors so they won't
949 	 * try to rejoin the destroyed partition when they're turned
950 	 * online.
951 	 */
952 	first_cp = cp = CPU;
953 	do {
954 		if (cp->cpu_part == pp) {
955 			ASSERT(cp->cpu_flags & CPU_OFFLINE);
956 			cp->cpu_part = newpp;
957 		}
958 		cp = cp->cpu_next;
959 	} while (cp != first_cp);
960 
961 	/*
962 	 * Pause all CPUs while changing the partition list, to make sure
963 	 * the clock thread (which traverses the list without holding
964 	 * cpu_lock) isn't running.
965 	 */
966 	pause_cpus(NULL);
967 	pp->cp_prev->cp_next = pp->cp_next;
968 	pp->cp_next->cp_prev = pp->cp_prev;
969 	if (cp_list_head == pp)
970 		cp_list_head = pp->cp_next;
971 	start_cpus();
972 
973 	if (cp_id_next > pp->cp_id)
974 		cp_id_next = pp->cp_id;
975 
976 	if (pp->cp_kstat)
977 		kstat_delete(pp->cp_kstat);
978 
979 	cp_numparts--;
980 
981 	disp_kp_free(&pp->cp_kp_queue);
982 	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
983 	pp->cp_lgrploads = NULL;
984 	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
985 	kmem_free(pp, sizeof (cpupart_t));
986 	mutex_exit(&cpu_lock);
987 
988 	return (err);
989 }
990 
991 
992 /*
993  * Return the ID of the partition to which the specified processor belongs.
994  */
995 psetid_t
996 cpupart_query_cpu(cpu_t *cp)
997 {
998 	ASSERT(MUTEX_HELD(&cpu_lock));
999 
1000 	return (CPTOPS(cp->cpu_part->cp_id));
1001 }
1002 
1003 
1004 /*
1005  * Attach a processor to an existing partition.
1006  */
1007 int
1008 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1009 {
1010 	cpupart_t	*pp;
1011 	int		err;
1012 
1013 	ASSERT(pool_lock_held());
1014 	ASSERT(MUTEX_HELD(&cpu_lock));
1015 
1016 	pp = cpupart_find(psid);
1017 	if (pp == NULL)
1018 		return (EINVAL);
1019 	if (cp->cpu_flags & CPU_OFFLINE)
1020 		return (EINVAL);
1021 
1022 	err = cpupart_move_cpu(cp, pp, forced);
1023 	return (err);
1024 }
1025 
1026 /*
1027  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1028  * this just checks for a valid partition.  If numcpus is non-NULL but
1029  * cpulist is NULL, the current number of cpus is stored in *numcpus.
1030  * If both are non-NULL, the current number of cpus is stored in *numcpus,
1031  * and a list of those cpus up to the size originally in *numcpus is
1032  * stored in cpulist[].  Also, store the processor set id in *psid.
1033  * This is useful in case the processor set id passed in was PS_MYID.
1034  */
1035 int
1036 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1037 {
1038 	cpupart_t	*pp;
1039 	uint_t		ncpus;
1040 	cpu_t		*c;
1041 	int		i;
1042 
1043 	mutex_enter(&cpu_lock);
1044 	pp = cpupart_find(*psid);
1045 	if (pp == NULL) {
1046 		mutex_exit(&cpu_lock);
1047 		return (EINVAL);
1048 	}
1049 	*psid = CPTOPS(pp->cp_id);
1050 	ncpus = pp->cp_ncpus;
1051 	if (numcpus) {
1052 		if (ncpus > *numcpus) {
1053 			/*
1054 			 * Only copy as many cpus as were passed in, but
1055 			 * pass back the real number.
1056 			 */
1057 			uint_t t = ncpus;
1058 			ncpus = *numcpus;
1059 			*numcpus = t;
1060 		} else
1061 			*numcpus = ncpus;
1062 
1063 		if (cpulist) {
1064 			c = pp->cp_cpulist;
1065 			for (i = 0; i < ncpus; i++) {
1066 				ASSERT(c != NULL);
1067 				cpulist[i] = c->cpu_id;
1068 				c = c->cpu_next_part;
1069 			}
1070 		}
1071 	}
1072 	mutex_exit(&cpu_lock);
1073 	return (0);
1074 }
1075 
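/*
 * An illustrative sketch of the two-step pattern described above
 * (hypothetical caller; `psid' is assumed to name an existing set):
 *
 *	uint_t ncpus = 0, nalloc;
 *	processorid_t *ids;
 *
 *	if (cpupart_get_cpus(&psid, NULL, &ncpus) == 0 && ncpus != 0) {
 *		nalloc = ncpus;
 *		ids = kmem_alloc(nalloc * sizeof (processorid_t), KM_SLEEP);
 *		(void) cpupart_get_cpus(&psid, ids, &ncpus);
 *		...
 *		kmem_free(ids, nalloc * sizeof (processorid_t));
 *	}
 *
 * Since the partition can change between the two calls, at most
 * MIN(ncpus, nalloc) entries of ids[] are valid after the second call,
 * and the buffer must be freed with the size that was allocated.
 */
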
1076 /*
1077  * Reallocate kpreempt queues for each CPU partition.  Called from
1078  * disp_setup when a new scheduling class is loaded that increases the
1079  * number of priorities in the system.
1080  */
1081 void
1082 cpupart_kpqalloc(pri_t npri)
1083 {
1084 	cpupart_t *cpp;
1085 
1086 	ASSERT(MUTEX_HELD(&cpu_lock));
1087 	cpp = cp_list_head;
1088 	do {
1089 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1090 		cpp = cpp->cp_next;
1091 	} while (cpp != cp_list_head);
1092 }
1093 
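/*
 * Copy up to nelem of the partition's high-precision load averages
 * (1, 5 and 15 minute) into buf, scaled to the same fixed-point format
 * as avenrun[].
 */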
1094 int
1095 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1096 {
1097 	cpupart_t *cp;
1098 	int i;
1099 
1100 	ASSERT(nelem >= 0);
1101 	ASSERT(nelem <= LOADAVG_NSTATS);
1102 	ASSERT(MUTEX_HELD(&cpu_lock));
1103 
1104 	cp = cpupart_find(psid);
1105 	if (cp == NULL)
1106 		return (EINVAL);
1107 	for (i = 0; i < nelem; i++)
1108 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1109 
1110 	return (0);
1111 }
1112 
1113 
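/*
 * Fill in up to nelem partition IDs in list (when list is non-NULL) and
 * return the number of partitions of the requested kind: CP_ALL counts
 * every partition except the default, CP_NONEMPTY counts partitions that
 * currently contain CPUs.
 */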
1114 uint_t
1115 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1116 {
1117 	uint_t numpart = 0;
1118 	cpupart_t *cp;
1119 
1120 	ASSERT(MUTEX_HELD(&cpu_lock));
1121 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1122 
1123 	if (list != NULL) {
1124 		cp = cp_list_head;
1125 		do {
1126 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1127 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1128 				if (numpart == nelem)
1129 					break;
1130 				list[numpart++] = CPTOPS(cp->cp_id);
1131 			}
1132 			cp = cp->cp_next;
1133 		} while (cp != cp_list_head);
1134 	}
1135 
1136 	ASSERT(numpart < cp_numparts);
1137 
1138 	if (flag == CP_ALL)
1139 		numpart = cp_numparts - 1; /* leave out default partition */
1140 	else if (flag == CP_NONEMPTY)
1141 		numpart = cp_numparts_nonempty;
1142 
1143 	return (numpart);
1144 }
1145 
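/*
 * Set the attributes of the partition named by psid.  PSET_NOESCAPE may
 * not be cleared on the default partition.
 */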
1146 int
1147 cpupart_setattr(psetid_t psid, uint_t attr)
1148 {
1149 	cpupart_t *cp;
1150 
1151 	ASSERT(pool_lock_held());
1152 
1153 	mutex_enter(&cpu_lock);
1154 	if ((cp = cpupart_find(psid)) == NULL) {
1155 		mutex_exit(&cpu_lock);
1156 		return (EINVAL);
1157 	}
1158 	/*
1159 	 * PSET_NOESCAPE attribute for default cpu partition is always set
1160 	 */
1161 	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1162 		mutex_exit(&cpu_lock);
1163 		return (EINVAL);
1164 	}
1165 	cp->cp_attr = attr;
1166 	mutex_exit(&cpu_lock);
1167 	return (0);
1168 }
1169 
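/*
 * Return the attributes of the partition named by psid in *attrp.
 */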
1170 int
1171 cpupart_getattr(psetid_t psid, uint_t *attrp)
1172 {
1173 	cpupart_t *cp;
1174 
1175 	mutex_enter(&cpu_lock);
1176 	if ((cp = cpupart_find(psid)) == NULL) {
1177 		mutex_exit(&cpu_lock);
1178 		return (EINVAL);
1179 	}
1180 	*attrp = cp->cp_attr;
1181 	mutex_exit(&cpu_lock);
1182 	return (0);
1183 }
1184