xref: /illumos-gate/usr/src/uts/common/disp/cpupart.c (revision e8d712970f7ec76e09d5013b0b9aa5f0e0cf3e62)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
23  *
24  * Copyright 2018 Joyent, Inc.
25  * Copyright (c) 2017 by Delphix. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/systm.h>
30 #include <sys/cmn_err.h>
31 #include <sys/cpuvar.h>
32 #include <sys/thread.h>
33 #include <sys/disp.h>
34 #include <sys/kmem.h>
35 #include <sys/debug.h>
36 #include <sys/cpupart.h>
37 #include <sys/pset.h>
38 #include <sys/var.h>
39 #include <sys/cyclic.h>
40 #include <sys/lgrp.h>
41 #include <sys/pghw.h>
42 #include <sys/loadavg.h>
43 #include <sys/class.h>
44 #include <sys/fss.h>
45 #include <sys/pool.h>
46 #include <sys/pool_pset.h>
47 #include <sys/policy.h>
48 
49 /*
50  * Calling pool_lock() protects the pools configuration, which includes
51  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
52  * partitions from being created or destroyed while the lock is held.
53  * The lock ordering with respect to related locks is:
54  *
55  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
56  *
57  * Blocking memory allocations may be made while holding "pool_lock"
58  * or cpu_lock.
59  */
60 
61 /*
62  * The cp_default partition is allocated statically, but its lgroup load average
63  * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
64  * saves some memory since the space allocated reflects the actual number of
65  * lgroups supported by the platform. The lgrp facility provides a temporary
66  * space to hold lpl information during system bootstrap.
67  */
68 
/* Head of the circular, doubly-linked (cp_next/cp_prev) partition list. */
cpupart_t		*cp_list_head;
/* The statically allocated default partition (ID CP_DEFAULT). */
cpupart_t		cp_default;
/* First partition ID that cpupart_create() tries when picking a free ID. */
static cpupartid_t	cp_id_next;
/* Number of partitions on the list, including cp_default. */
uint_t			cp_numparts;
/* Number of partitions whose CPU list is currently non-empty. */
uint_t			cp_numparts_nonempty;
74 
/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * If this is still zero when cpupart_initialize_default() runs it is set
 * to max_ncpus * 2 + 1; a non-zero value (e.g. tuned via /etc/system) is
 * left untouched.
 */
static uint_t		cp_max_numparts;
84 
85 /*
86  * Processor sets and CPU partitions are different but related concepts.
87  * A processor set is a user-level abstraction allowing users to create
88  * sets of CPUs and bind threads exclusively to those sets.  A CPU
89  * partition is a kernel dispatcher object consisting of a set of CPUs
90  * and a global dispatch queue.  The processor set abstraction is
91  * implemented via a CPU partition, and currently there is a 1-1
92  * mapping between processor sets and partitions (excluding the default
93  * partition, which is not visible as a processor set).  Hence, the
94  * numbering for processor sets and CPU partitions is identical.  This
95  * may not always be true in the future, and these macros could become
96  * less trivial if we support e.g. a processor set containing multiple
97  * CPU partitions.
98  */
/* PS_NONE maps to CP_DEFAULT and back; every other ID maps to itself. */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

/* Forward declaration: used by cpupart_move_cpu() before its definition. */
static int cpupart_unbind_threads(cpupart_t *, boolean_t);
103 
104 /*
105  * Find a CPU partition given a processor set ID.
106  */
107 static cpupart_t *
108 cpupart_find_all(psetid_t psid)
109 {
110 	cpupart_t *cp;
111 	cpupartid_t cpid = PSTOCP(psid);
112 
113 	ASSERT(MUTEX_HELD(&cpu_lock));
114 
115 	/* default partition not visible as a processor set */
116 	if (psid == CP_DEFAULT)
117 		return (NULL);
118 
119 	if (psid == PS_MYID)
120 		return (curthread->t_cpupart);
121 
122 	cp = cp_list_head;
123 	do {
124 		if (cp->cp_id == cpid)
125 			return (cp);
126 		cp = cp->cp_next;
127 	} while (cp != cp_list_head);
128 	return (NULL);
129 }
130 
131 /*
132  * Find a CPU partition given a processor set ID if the processor set
133  * should be visible from the calling zone.
134  */
135 cpupart_t *
136 cpupart_find(psetid_t psid)
137 {
138 	cpupart_t *cp;
139 
140 	ASSERT(MUTEX_HELD(&cpu_lock));
141 	cp = cpupart_find_all(psid);
142 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
143 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
144 			return (NULL);
145 	return (cp);
146 }
147 
148 static int
149 cpupart_kstat_update(kstat_t *ksp, int rw)
150 {
151 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
152 	cpupart_kstat_t *cpksp = ksp->ks_data;
153 
154 	if (rw == KSTAT_WRITE)
155 		return (EACCES);
156 
157 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
158 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
159 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
160 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
161 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
162 	    (16 - FSHIFT);
163 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
164 	    (16 - FSHIFT);
165 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
166 	    (16 - FSHIFT);
167 	return (0);
168 }
169 
170 static void
171 cpupart_kstat_create(cpupart_t *cp)
172 {
173 	kstat_t *ksp;
174 	zoneid_t zoneid;
175 
176 	ASSERT(MUTEX_HELD(&cpu_lock));
177 
178 	/*
179 	 * We have a bit of a chicken-egg problem since this code will
180 	 * get called to create the kstats for CP_DEFAULT before the
181 	 * pools framework gets initialized.  We circumvent the problem
182 	 * by special-casing cp_default.
183 	 */
184 	if (cp != &cp_default && pool_pset_enabled())
185 		zoneid = GLOBAL_ZONEID;
186 	else
187 		zoneid = ALL_ZONES;
188 	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
189 	    KSTAT_TYPE_NAMED,
190 	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
191 	if (ksp != NULL) {
192 		cpupart_kstat_t *cpksp = ksp->ks_data;
193 
194 		kstat_named_init(&cpksp->cpk_updates, "updates",
195 		    KSTAT_DATA_UINT64);
196 		kstat_named_init(&cpksp->cpk_runnable, "runnable",
197 		    KSTAT_DATA_UINT64);
198 		kstat_named_init(&cpksp->cpk_waiting, "waiting",
199 		    KSTAT_DATA_UINT64);
200 		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
201 		    KSTAT_DATA_UINT32);
202 		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
203 		    KSTAT_DATA_UINT32);
204 		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
205 		    KSTAT_DATA_UINT32);
206 		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
207 		    KSTAT_DATA_UINT32);
208 
209 		ksp->ks_update = cpupart_kstat_update;
210 		ksp->ks_private = cp;
211 
212 		kstat_install(ksp);
213 	}
214 	cp->cp_kstat = ksp;
215 }
216 
217 /*
218  * Initialize the cpupart's lgrp partions (lpls)
219  */
220 static void
221 cpupart_lpl_initialize(cpupart_t *cp)
222 {
223 	int i, sz;
224 
225 	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
226 	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
227 
228 	for (i = 0; i < sz; i++) {
229 		/*
230 		 * The last entry of the lpl's resource set is always NULL
231 		 * by design (to facilitate iteration)...hence the "oversizing"
232 		 * by 1.
233 		 */
234 		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
235 		cp->cp_lgrploads[i].lpl_rset =
236 		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
237 		cp->cp_lgrploads[i].lpl_id2rset =
238 		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
239 		cp->cp_lgrploads[i].lpl_lgrpid = i;
240 	}
241 }
242 
243 /*
244  * Teardown the cpupart's lgrp partitions
245  */
246 static void
247 cpupart_lpl_teardown(cpupart_t *cp)
248 {
249 	int i, sz;
250 	lpl_t *lpl;
251 
252 	for (i = 0; i < cp->cp_nlgrploads; i++) {
253 		lpl = &cp->cp_lgrploads[i];
254 
255 		sz = lpl->lpl_rset_sz;
256 		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
257 		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
258 		lpl->lpl_rset = NULL;
259 		lpl->lpl_id2rset = NULL;
260 	}
261 	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
262 	cp->cp_lgrploads = NULL;
263 }
264 
/*
 * Initialize the default partition and kpreempt disp queue.
 *
 * Sets up cp_default as the sole element of the partition list, creates
 * its kstat, allocates its lpl array and copies the bootstrap lpl
 * topology into it, and initializes the partition-wide counters and
 * bitsets.
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	/* cp_default starts out as the only element of the circular list. */
	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	/* IDs handed out by cpupart_create() start right after CP_DEFAULT. */
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cpupart_lpl_initialize(&cp_default);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap. It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);


	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
	bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);

	/* Size the halted-CPU bitset for max_ncpus entries. */
	bitset_resize(&cp_default.cp_haltset, max_ncpus);
}
320 
321 
322 static int
323 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
324 {
325 	cpupart_t *oldpp;
326 	cpu_t	*ncp, *newlist;
327 	kthread_t *t;
328 	int	move_threads = 1;
329 	lgrp_id_t lgrpid;
330 	proc_t	*p;
331 	int lgrp_diff_lpl;
332 	lpl_t	*cpu_lpl;
333 	int	ret;
334 	boolean_t unbind_all_threads = (forced != 0);
335 
336 	ASSERT(MUTEX_HELD(&cpu_lock));
337 	ASSERT(newpp != NULL);
338 
339 	oldpp = cp->cpu_part;
340 	ASSERT(oldpp != NULL);
341 	ASSERT(oldpp->cp_ncpus > 0);
342 
343 	if (newpp == oldpp) {
344 		/*
345 		 * Don't need to do anything.
346 		 */
347 		return (0);
348 	}
349 
350 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
351 
352 	if (!disp_bound_partition(cp, 0)) {
353 		/*
354 		 * Don't need to move threads if there are no threads in
355 		 * the partition.  Note that threads can't enter the
356 		 * partition while we're holding cpu_lock.
357 		 */
358 		move_threads = 0;
359 	} else if (oldpp->cp_ncpus == 1) {
360 		/*
361 		 * The last CPU is removed from a partition which has threads
362 		 * running in it. Some of these threads may be bound to this
363 		 * CPU.
364 		 *
365 		 * Attempt to unbind threads from the CPU and from the processor
366 		 * set. Note that no threads should be bound to this CPU since
367 		 * cpupart_move_threads will refuse to move bound threads to
368 		 * other CPUs.
369 		 */
370 		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
371 		(void) cpupart_unbind_threads(oldpp, B_FALSE);
372 
373 		if (!disp_bound_partition(cp, 0)) {
374 			/*
375 			 * No bound threads in this partition any more
376 			 */
377 			move_threads = 0;
378 		} else {
379 			/*
380 			 * There are still threads bound to the partition
381 			 */
382 			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
383 			return (EBUSY);
384 		}
385 	}
386 
387 	/*
388 	 * If forced flag is set unbind any threads from this CPU.
389 	 * Otherwise unbind soft-bound threads only.
390 	 */
391 	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
392 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
393 		return (ret);
394 	}
395 
396 	/*
397 	 * Stop further threads weak binding to this cpu.
398 	 */
399 	cpu_inmotion = cp;
400 	membar_enter();
401 
402 	/*
403 	 * Notify the Processor Groups subsystem that the CPU
404 	 * will be moving cpu partitions. This is done before
405 	 * CPUs are paused to provide an opportunity for any
406 	 * needed memory allocations.
407 	 */
408 	pg_cpupart_out(cp, oldpp);
409 	pg_cpupart_in(cp, newpp);
410 
411 again:
412 	if (move_threads) {
413 		int loop_count;
414 		/*
415 		 * Check for threads strong or weak bound to this CPU.
416 		 */
417 		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
418 			if (loop_count >= 5) {
419 				cpu_state_change_notify(cp->cpu_id,
420 				    CPU_CPUPART_IN);
421 				pg_cpupart_out(cp, newpp);
422 				pg_cpupart_in(cp, oldpp);
423 				cpu_inmotion = NULL;
424 				return (EBUSY);	/* some threads still bound */
425 			}
426 			delay(1);
427 		}
428 	}
429 
430 	/*
431 	 * Before we actually start changing data structures, notify
432 	 * the cyclic subsystem that we want to move this CPU out of its
433 	 * partition.
434 	 */
435 	if (!cyclic_move_out(cp)) {
436 		/*
437 		 * This CPU must be the last CPU in a processor set with
438 		 * a bound cyclic.
439 		 */
440 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
441 		pg_cpupart_out(cp, newpp);
442 		pg_cpupart_in(cp, oldpp);
443 		cpu_inmotion = NULL;
444 		return (EBUSY);
445 	}
446 
447 	pause_cpus(cp, NULL);
448 
449 	if (move_threads) {
450 		/*
451 		 * The thread on cpu before the pause thread may have read
452 		 * cpu_inmotion before we raised the barrier above.  Check
453 		 * again.
454 		 */
455 		if (disp_bound_threads(cp, 1)) {
456 			start_cpus();
457 			goto again;
458 		}
459 
460 	}
461 
462 	/*
463 	 * Now that CPUs are paused, let the PG subsystem perform
464 	 * any necessary data structure updates.
465 	 */
466 	pg_cpupart_move(cp, oldpp, newpp);
467 
468 	/* save this cpu's lgroup -- it'll be the same in the new partition */
469 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
470 
471 	cpu_lpl = cp->cpu_lpl;
472 	/*
473 	 * let the lgroup framework know cp has left the partition
474 	 */
475 	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
476 
477 	/* move out of old partition */
478 	oldpp->cp_ncpus--;
479 	if (oldpp->cp_ncpus > 0) {
480 
481 		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
482 		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
483 		if (oldpp->cp_cpulist == cp) {
484 			oldpp->cp_cpulist = ncp;
485 		}
486 	} else {
487 		ncp = oldpp->cp_cpulist = NULL;
488 		cp_numparts_nonempty--;
489 		ASSERT(cp_numparts_nonempty != 0);
490 	}
491 	oldpp->cp_gen++;
492 
493 	/* move into new partition */
494 	newlist = newpp->cp_cpulist;
495 	if (newlist == NULL) {
496 		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
497 		cp_numparts_nonempty++;
498 		ASSERT(cp_numparts_nonempty != 0);
499 	} else {
500 		cp->cpu_next_part = newlist;
501 		cp->cpu_prev_part = newlist->cpu_prev_part;
502 		newlist->cpu_prev_part->cpu_next_part = cp;
503 		newlist->cpu_prev_part = cp;
504 	}
505 	cp->cpu_part = newpp;
506 	newpp->cp_ncpus++;
507 	newpp->cp_gen++;
508 
509 	ASSERT(bitset_is_null(&newpp->cp_haltset));
510 	ASSERT(bitset_is_null(&oldpp->cp_haltset));
511 
512 	/*
513 	 * let the lgroup framework know cp has entered the partition
514 	 */
515 	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
516 
517 	/*
518 	 * If necessary, move threads off processor.
519 	 */
520 	if (move_threads) {
521 		ASSERT(ncp != NULL);
522 
523 		/*
524 		 * Walk thru the active process list to look for
525 		 * threads that need to have a new home lgroup,
526 		 * or the last CPU they run on is the same CPU
527 		 * being moved out of the partition.
528 		 */
529 
530 		for (p = practive; p != NULL; p = p->p_next) {
531 
532 			t = p->p_tlist;
533 
534 			if (t == NULL)
535 				continue;
536 
537 			lgrp_diff_lpl = 0;
538 
539 			do {
540 
541 				ASSERT(t->t_lpl != NULL);
542 
543 				/*
544 				 * Update the count of how many threads are
545 				 * in this CPU's lgroup but have a different lpl
546 				 */
547 
548 				if (t->t_lpl != cpu_lpl &&
549 				    t->t_lpl->lpl_lgrpid == lgrpid)
550 					lgrp_diff_lpl++;
551 				/*
552 				 * If the lgroup that t is assigned to no
553 				 * longer has any CPUs in t's partition,
554 				 * we'll have to choose a new lgroup for t.
555 				 */
556 
557 				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
558 				    t->t_cpupart)) {
559 					lgrp_move_thread(t,
560 					    lgrp_choose(t, t->t_cpupart), 0);
561 				}
562 
563 				/*
564 				 * make sure lpl points to our own partition
565 				 */
566 				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
567 				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
568 				    t->t_cpupart->cp_nlgrploads));
569 
570 				ASSERT(t->t_lpl->lpl_ncpu > 0);
571 
572 				/* Update CPU last ran on if it was this CPU */
573 				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
574 				    t->t_bound_cpu != cp) {
575 					t->t_cpu = disp_lowpri_cpu(ncp, t,
576 					    t->t_pri);
577 				}
578 				t = t->t_forw;
579 			} while (t != p->p_tlist);
580 
581 			/*
582 			 * Didn't find any threads in the same lgroup as this
583 			 * CPU with a different lpl, so remove the lgroup from
584 			 * the process lgroup bitmask.
585 			 */
586 
587 			if (lgrp_diff_lpl)
588 				klgrpset_del(p->p_lgrpset, lgrpid);
589 		}
590 
591 		/*
592 		 * Walk thread list looking for threads that need to be
593 		 * rehomed, since there are some threads that are not in
594 		 * their process's p_tlist.
595 		 */
596 
597 		t = curthread;
598 
599 		do {
600 			ASSERT(t != NULL && t->t_lpl != NULL);
601 
602 			/*
603 			 * If the lgroup that t is assigned to no
604 			 * longer has any CPUs in t's partition,
605 			 * we'll have to choose a new lgroup for t.
606 			 * Also, choose best lgroup for home when
607 			 * thread has specified lgroup affinities,
608 			 * since there may be an lgroup with more
609 			 * affinity available after moving CPUs
610 			 * around.
611 			 */
612 			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
613 			    t->t_cpupart) || t->t_lgrp_affinity) {
614 				lgrp_move_thread(t,
615 				    lgrp_choose(t, t->t_cpupart), 1);
616 			}
617 
618 			/* make sure lpl points to our own partition */
619 			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
620 			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
621 			    t->t_cpupart->cp_nlgrploads));
622 
623 			ASSERT(t->t_lpl->lpl_ncpu > 0);
624 
625 			/* Update CPU last ran on if it was this CPU */
626 			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
627 			    t->t_bound_cpu != cp) {
628 				t->t_cpu = disp_lowpri_cpu(ncp, t,
629 				    t->t_pri);
630 			}
631 
632 			t = t->t_next;
633 		} while (t != curthread);
634 
635 		/*
636 		 * Clear off the CPU's run queue, and the kp queue if the
637 		 * partition is now empty.
638 		 */
639 		disp_cpu_inactive(cp);
640 
641 		/*
642 		 * Make cp switch to a thread from the new partition.
643 		 */
644 		cp->cpu_runrun = 1;
645 		cp->cpu_kprunrun = 1;
646 	}
647 
648 	cpu_inmotion = NULL;
649 	start_cpus();
650 
651 	/*
652 	 * Let anyone interested know that cpu has been added to the set.
653 	 */
654 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
655 
656 	/*
657 	 * Now let the cyclic subsystem know that it can reshuffle cyclics
658 	 * bound to the new processor set.
659 	 */
660 	cyclic_move_in(cp);
661 
662 	return (0);
663 }
664 
665 /*
666  * Check if thread can be moved to a new cpu partition.  Called by
667  * cpupart_move_thread() and pset_bind_start().
668  */
669 int
670 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
671 {
672 	ASSERT(MUTEX_HELD(&cpu_lock));
673 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
674 	ASSERT(cp != NULL);
675 	ASSERT(THREAD_LOCK_HELD(tp));
676 
677 	/*
678 	 * CPU-bound threads can't be moved.
679 	 */
680 	if (!ignore) {
681 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
682 		    tp->t_weakbound_cpu;
683 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
684 			return (EBUSY);
685 	}
686 
687 	if (tp->t_cid == sysdccid) {
688 		return (EINVAL);	/* For now, sysdc threads can't move */
689 	}
690 
691 	return (0);
692 }
693 
/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 *
 * The caller must hold cpu_lock, pidlock and the owning process's p_lock.
 * Returns EINVAL if `newpp' has no CPUs, the error from
 * cpupart_movable_thread() if the thread may not be moved, and 0 on
 * success.  `projbuf' and `zonebuf' are passed through to
 * fss_changepset().
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	/* A thread cannot be moved into a partition that has no CPUs. */
	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp, &tp->t_cpupart->\
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
		    tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		/*
		 * A thread that is on a processor is asked to surrender it;
		 * a runnable thread is taken off its dispatch queue and
		 * queued again.
		 */
		if (tp->t_state == TS_ONPROC) {
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	/* fss_changepset() is called only after the thread lock is dropped. */
	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}
783 
784 
785 /*
786  * This function binds a thread to a partition.  Must be called with the
787  * p_lock of the containing process held (to keep the thread from going
788  * away), and thus also with cpu_lock held (since cpu_lock must be
789  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
790  * should be ignored (this is used when destroying a partition).
791  */
792 int
793 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
794     void *zonebuf)
795 {
796 	cpupart_t	*newpp;
797 
798 	ASSERT(pool_lock_held());
799 	ASSERT(MUTEX_HELD(&cpu_lock));
800 	ASSERT(MUTEX_HELD(&pidlock));
801 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
802 
803 	if (psid == PS_NONE)
804 		newpp = &cp_default;
805 	else {
806 		newpp = cpupart_find(psid);
807 		if (newpp == NULL) {
808 			return (EINVAL);
809 		}
810 	}
811 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
812 }
813 
814 
/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 *
 * The caller must hold pool_lock(); cpu_lock is taken and released here.
 * On success the new processor set ID is stored in *psid and 0 is
 * returned.  ENOMEM is returned when cp_numparts has already reached
 * cp_max_numparts.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t	*pp;

	ASSERT(pool_lock_held());

	/* The partition is allocated before cpu_lock is taken. */
	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		mutex_exit(&cpu_lock);
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	cpupart_lpl_initialize(pp);

	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Initialize and size the partition's bitset of halted CPUs.
	 */
	bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
	bitset_resize(&pp->cp_haltset, max_ncpus);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	/* Link pp in just before cp_list_head, i.e. at the list's tail. */
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}
876 
/*
 * Move threads from specified partition to cp_default. If `unbind_all' is
 * B_TRUE, move all threads bound to the partition, otherwise move only
 * soft-bound threads.
 *
 * The caller must hold pool_lock() and cpu_lock; pidlock and each
 * affected process's p_lock are taken here.  Returns EINVAL if `pp' is
 * NULL or the default partition, the first error reported by
 * cpupart_bind_thread() (at which point the walk stops), or 0.
 */
static int
cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
{
	void	*projbuf, *zonebuf;
	kthread_t *t;
	proc_t	*p;
	int	err = 0;
	psetid_t psid;

	ASSERT(pool_lock_held());
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (pp == NULL || pp == &cp_default) {
		return (EINVAL);
	}
	psid = pp->cp_id;

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	mutex_enter(&pidlock);
	/* Walk the t_next ring of threads, starting and ending at curthread. */
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}

			/*
			 * Can only unbind threads which have revocable binding
			 * unless force unbinding requested.
			 */
			if (unbind_all || TB_PSET_IS_SOFT(t)) {
				err = cpupart_bind_thread(t, PS_NONE, 1,
				    projbuf, zonebuf);
				if (err) {
					/* Drop locks and buffers, then bail. */
					mutex_exit(&p->p_lock);
					mutex_exit(&pidlock);
					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
					return (err);
				}
				t->t_bind_pset = PS_NONE;
			}
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
	return (err);
}
947 
/*
 * Destroy a partition.
 *
 * The caller must hold pool_lock(); cpu_lock is taken and released here.
 * All threads bound to the partition are unbound and all of its CPUs are
 * moved to cp_default before the partition is unlinked and freed.
 *
 * Returns EINVAL if `psid' does not resolve to a partition or resolves to
 * the default partition, or the error from cpupart_unbind_threads() or
 * cpupart_move_cpu(); in those cases the function returns early and the
 * partition is not freed.  Returns 0 on success.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t	*cp, *first_cp;
	cpupart_t *pp, *newpp;
	int	err = 0;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Unbind all the threads currently bound to the partition.
	 */
	err = cpupart_unbind_threads(pp, B_TRUE);
	if (err) {
		mutex_exit(&cpu_lock);
		return (err);
	}

	/* Move CPUs to the default partition until the CPU list is empty. */
	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(bitset_is_null(&pp->cp_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs and halted
	 * CPUs now that they have all left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);
	bitset_fini(&pp->cp_haltset);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL, NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	/* Let cpupart_create() start its ID search at the freed ID. */
	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);

	cpupart_lpl_teardown(pp);

	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}
1037 
1038 
1039 /*
1040  * Return the ID of the partition to which the specified processor belongs.
1041  */
1042 psetid_t
1043 cpupart_query_cpu(cpu_t *cp)
1044 {
1045 	ASSERT(MUTEX_HELD(&cpu_lock));
1046 
1047 	return (CPTOPS(cp->cpu_part->cp_id));
1048 }
1049 
1050 
1051 /*
1052  * Attach a processor to an existing partition.
1053  */
1054 int
1055 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1056 {
1057 	cpupart_t	*pp;
1058 	int		err;
1059 
1060 	ASSERT(pool_lock_held());
1061 	ASSERT(MUTEX_HELD(&cpu_lock));
1062 
1063 	pp = cpupart_find(psid);
1064 	if (pp == NULL)
1065 		return (EINVAL);
1066 	if (cp->cpu_flags & CPU_OFFLINE)
1067 		return (EINVAL);
1068 
1069 	err = cpupart_move_cpu(cp, pp, forced);
1070 	return (err);
1071 }
1072 
1073 /*
1074  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1075  * this just checks for a valid partition.  If numcpus is non-NULL but
1076  * cpulist is NULL, the current number of cpus is stored in *numcpus.
1077  * If both are non-NULL, the current number of cpus is stored in *numcpus,
1078  * and a list of those cpus up to the size originally in *numcpus is
1079  * stored in cpulist[].  Also, store the processor set id in *psid.
1080  * This is useful in case the processor set id passed in was PS_MYID.
1081  */
1082 int
1083 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1084 {
1085 	cpupart_t	*pp;
1086 	uint_t		ncpus;
1087 	cpu_t		*c;
1088 	int		i;
1089 
1090 	mutex_enter(&cpu_lock);
1091 	pp = cpupart_find(*psid);
1092 	if (pp == NULL) {
1093 		mutex_exit(&cpu_lock);
1094 		return (EINVAL);
1095 	}
1096 	*psid = CPTOPS(pp->cp_id);
1097 	ncpus = pp->cp_ncpus;
1098 	if (numcpus) {
1099 		if (ncpus > *numcpus) {
1100 			/*
1101 			 * Only copy as many cpus as were passed in, but
1102 			 * pass back the real number.
1103 			 */
1104 			uint_t t = ncpus;
1105 			ncpus = *numcpus;
1106 			*numcpus = t;
1107 		} else
1108 			*numcpus = ncpus;
1109 
1110 		if (cpulist) {
1111 			c = pp->cp_cpulist;
1112 			for (i = 0; i < ncpus; i++) {
1113 				ASSERT(c != NULL);
1114 				cpulist[i] = c->cpu_id;
1115 				c = c->cpu_next_part;
1116 			}
1117 		}
1118 	}
1119 	mutex_exit(&cpu_lock);
1120 	return (0);
1121 }
1122 
1123 /*
1124  * Reallocate kpreempt queues for each CPU partition.  Called from
1125  * disp_setup when a new scheduling class is loaded that increases the
1126  * number of priorities in the system.
1127  */
1128 void
1129 cpupart_kpqalloc(pri_t npri)
1130 {
1131 	cpupart_t *cpp;
1132 
1133 	ASSERT(MUTEX_HELD(&cpu_lock));
1134 	cpp = cp_list_head;
1135 	do {
1136 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1137 		cpp = cpp->cp_next;
1138 	} while (cpp != cp_list_head);
1139 }
1140 
1141 int
1142 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1143 {
1144 	cpupart_t *cp;
1145 	int i;
1146 
1147 	ASSERT(nelem >= 0);
1148 	ASSERT(nelem <= LOADAVG_NSTATS);
1149 	ASSERT(MUTEX_HELD(&cpu_lock));
1150 
1151 	cp = cpupart_find(psid);
1152 	if (cp == NULL)
1153 		return (EINVAL);
1154 	for (i = 0; i < nelem; i++)
1155 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1156 
1157 	return (0);
1158 }
1159 
1160 
1161 uint_t
1162 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1163 {
1164 	uint_t numpart = 0;
1165 	cpupart_t *cp;
1166 
1167 	ASSERT(MUTEX_HELD(&cpu_lock));
1168 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1169 
1170 	if (list != NULL) {
1171 		cp = cp_list_head;
1172 		do {
1173 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1174 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1175 				if (numpart == nelem)
1176 					break;
1177 				list[numpart++] = CPTOPS(cp->cp_id);
1178 			}
1179 			cp = cp->cp_next;
1180 		} while (cp != cp_list_head);
1181 	}
1182 
1183 	ASSERT(numpart < cp_numparts);
1184 
1185 	if (flag == CP_ALL)
1186 		numpart = cp_numparts - 1; /* leave out default partition */
1187 	else if (flag == CP_NONEMPTY)
1188 		numpart = cp_numparts_nonempty;
1189 
1190 	return (numpart);
1191 }
1192 
1193 int
1194 cpupart_setattr(psetid_t psid, uint_t attr)
1195 {
1196 	cpupart_t *cp;
1197 
1198 	ASSERT(pool_lock_held());
1199 
1200 	mutex_enter(&cpu_lock);
1201 	if ((cp = cpupart_find(psid)) == NULL) {
1202 		mutex_exit(&cpu_lock);
1203 		return (EINVAL);
1204 	}
1205 	/*
1206 	 * PSET_NOESCAPE attribute for default cpu partition is always set
1207 	 */
1208 	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1209 		mutex_exit(&cpu_lock);
1210 		return (EINVAL);
1211 	}
1212 	cp->cp_attr = attr;
1213 	mutex_exit(&cpu_lock);
1214 	return (0);
1215 }
1216 
1217 int
1218 cpupart_getattr(psetid_t psid, uint_t *attrp)
1219 {
1220 	cpupart_t *cp;
1221 
1222 	mutex_enter(&cpu_lock);
1223 	if ((cp = cpupart_find(psid)) == NULL) {
1224 		mutex_exit(&cpu_lock);
1225 		return (EINVAL);
1226 	}
1227 	*attrp = cp->cp_attr;
1228 	mutex_exit(&cpu_lock);
1229 	return (0);
1230 }
1231