xref: /titanic_44/usr/src/uts/common/disp/cpupart.c (revision 96d9f183facd90dbbc2268c9a51689be0b6a0b46)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/systm.h>
28 #include <sys/cmn_err.h>
29 #include <sys/cpuvar.h>
30 #include <sys/thread.h>
31 #include <sys/disp.h>
32 #include <sys/kmem.h>
33 #include <sys/debug.h>
34 #include <sys/cpupart.h>
35 #include <sys/pset.h>
36 #include <sys/var.h>
37 #include <sys/cyclic.h>
38 #include <sys/lgrp.h>
39 #include <sys/pghw.h>
40 #include <sys/loadavg.h>
41 #include <sys/class.h>
42 #include <sys/fss.h>
43 #include <sys/pool.h>
44 #include <sys/pool_pset.h>
45 #include <sys/policy.h>
46 
47 /*
48  * Calling pool_lock() protects the pools configuration, which includes
49  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
50  * partitions from being created or destroyed while the lock is held.
51  * The lock ordering with respect to related locks is:
52  *
53  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
54  *
55  * Blocking memory allocations may be made while holding "pool_lock"
56  * or cpu_lock.
57  */
58 
59 /*
60  * The cp_default partition is allocated statically, but its lgroup load average
61  * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
62  * saves some memory since the space allocated reflects the actual number of
63  * lgroups supported by the platform. The lgrp facility provides a temporary
64  * space to hold lpl information during system bootstrap.
65  */
66 
/* Head of the circular, doubly-linked list of all CPU partitions. */
cpupart_t		*cp_list_head;
/* The statically allocated default partition (id CP_DEFAULT). */
cpupart_t		cp_default;
/* Next partition id to try when a new partition is created. */
static cpupartid_t	cp_id_next;
/* Number of partitions on the list, including cp_default. */
uint_t			cp_numparts;
/* Number of partitions that currently contain at least one CPU. */
uint_t			cp_numparts_nonempty;
72 
/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much.  The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds).  The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * A value of 0 means the limit has not been tuned; in that case
 * cpupart_initialize_default() sets it to max_ncpus * 2 + 1.
 */
static uint_t		cp_max_numparts;
82 
83 /*
84  * Processor sets and CPU partitions are different but related concepts.
85  * A processor set is a user-level abstraction allowing users to create
86  * sets of CPUs and bind threads exclusively to those sets.  A CPU
87  * partition is a kernel dispatcher object consisting of a set of CPUs
88  * and a global dispatch queue.  The processor set abstraction is
89  * implemented via a CPU partition, and currently there is a 1-1
90  * mapping between processor sets and partitions (excluding the default
91  * partition, which is not visible as a processor set).  Hence, the
92  * numbering for processor sets and CPU partitions is identical.  This
93  * may not always be true in the future, and these macros could become
94  * less trivial if we support e.g. a processor set containing multiple
95  * CPU partitions.
96  */
/* Processor set ID -> partition ID; PS_NONE maps to CP_DEFAULT. */
#define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* Partition ID -> processor set ID; CP_DEFAULT maps to PS_NONE. */
#define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
99 
100 
/* Forward declaration: cpupart_move_cpu() calls this before its definition. */
static int cpupart_unbind_threads(cpupart_t *, boolean_t);
102 
103 /*
104  * Find a CPU partition given a processor set ID.
105  */
106 static cpupart_t *
107 cpupart_find_all(psetid_t psid)
108 {
109 	cpupart_t *cp;
110 	cpupartid_t cpid = PSTOCP(psid);
111 
112 	ASSERT(MUTEX_HELD(&cpu_lock));
113 
114 	/* default partition not visible as a processor set */
115 	if (psid == CP_DEFAULT)
116 		return (NULL);
117 
118 	if (psid == PS_MYID)
119 		return (curthread->t_cpupart);
120 
121 	cp = cp_list_head;
122 	do {
123 		if (cp->cp_id == cpid)
124 			return (cp);
125 		cp = cp->cp_next;
126 	} while (cp != cp_list_head);
127 	return (NULL);
128 }
129 
130 /*
131  * Find a CPU partition given a processor set ID if the processor set
132  * should be visible from the calling zone.
133  */
134 cpupart_t *
135 cpupart_find(psetid_t psid)
136 {
137 	cpupart_t *cp;
138 
139 	ASSERT(MUTEX_HELD(&cpu_lock));
140 	cp = cpupart_find_all(psid);
141 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
142 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
143 			return (NULL);
144 	return (cp);
145 }
146 
147 static int
148 cpupart_kstat_update(kstat_t *ksp, int rw)
149 {
150 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
151 	cpupart_kstat_t *cpksp = ksp->ks_data;
152 
153 	if (rw == KSTAT_WRITE)
154 		return (EACCES);
155 
156 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
157 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
158 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
159 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
160 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
161 	    (16 - FSHIFT);
162 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
163 	    (16 - FSHIFT);
164 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
165 	    (16 - FSHIFT);
166 	return (0);
167 }
168 
169 static void
170 cpupart_kstat_create(cpupart_t *cp)
171 {
172 	kstat_t *ksp;
173 	zoneid_t zoneid;
174 
175 	ASSERT(MUTEX_HELD(&cpu_lock));
176 
177 	/*
178 	 * We have a bit of a chicken-egg problem since this code will
179 	 * get called to create the kstats for CP_DEFAULT before the
180 	 * pools framework gets initialized.  We circumvent the problem
181 	 * by special-casing cp_default.
182 	 */
183 	if (cp != &cp_default && pool_pset_enabled())
184 		zoneid = GLOBAL_ZONEID;
185 	else
186 		zoneid = ALL_ZONES;
187 	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
188 	    KSTAT_TYPE_NAMED,
189 	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
190 	if (ksp != NULL) {
191 		cpupart_kstat_t *cpksp = ksp->ks_data;
192 
193 		kstat_named_init(&cpksp->cpk_updates, "updates",
194 		    KSTAT_DATA_UINT64);
195 		kstat_named_init(&cpksp->cpk_runnable, "runnable",
196 		    KSTAT_DATA_UINT64);
197 		kstat_named_init(&cpksp->cpk_waiting, "waiting",
198 		    KSTAT_DATA_UINT64);
199 		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
200 		    KSTAT_DATA_UINT32);
201 		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
202 		    KSTAT_DATA_UINT32);
203 		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
204 		    KSTAT_DATA_UINT32);
205 		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
206 		    KSTAT_DATA_UINT32);
207 
208 		ksp->ks_update = cpupart_kstat_update;
209 		ksp->ks_private = cp;
210 
211 		kstat_install(ksp);
212 	}
213 	cp->cp_kstat = ksp;
214 }
215 
216 /*
217  * Initialize the cpupart's lgrp partions (lpls)
218  */
219 static void
220 cpupart_lpl_initialize(cpupart_t *cp)
221 {
222 	int i, sz;
223 
224 	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
225 	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
226 
227 	for (i = 0; i < sz; i++) {
228 		/*
229 		 * The last entry of the lpl's resource set is always NULL
230 		 * by design (to facilitate iteration)...hence the "oversizing"
231 		 * by 1.
232 		 */
233 		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
234 		cp->cp_lgrploads[i].lpl_rset =
235 		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
236 		cp->cp_lgrploads[i].lpl_id2rset =
237 		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
238 		cp->cp_lgrploads[i].lpl_lgrpid = i;
239 	}
240 }
241 
242 /*
243  * Teardown the cpupart's lgrp partitions
244  */
245 static void
246 cpupart_lpl_teardown(cpupart_t *cp)
247 {
248 	int i, sz;
249 	lpl_t *lpl;
250 
251 	for (i = 0; i < cp->cp_nlgrploads; i++) {
252 		lpl = &cp->cp_lgrploads[i];
253 
254 		sz = lpl->lpl_rset_sz;
255 		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
256 		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
257 		lpl->lpl_rset = NULL;
258 		lpl->lpl_id2rset = NULL;
259 	}
260 	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
261 	cp->cp_lgrploads = NULL;
262 }
263 
264 /*
265  * Initialize the default partition and kpreempt disp queue.
266  */
267 void
268 cpupart_initialize_default(void)
269 {
270 	lgrp_id_t i;
271 
272 	cp_list_head = &cp_default;
273 	cp_default.cp_next = &cp_default;
274 	cp_default.cp_prev = &cp_default;
275 	cp_default.cp_id = CP_DEFAULT;
276 	cp_default.cp_kp_queue.disp_maxrunpri = -1;
277 	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
278 	cp_default.cp_kp_queue.disp_cpu = NULL;
279 	cp_default.cp_gen = 0;
280 	cp_default.cp_loadavg.lg_cur = 0;
281 	cp_default.cp_loadavg.lg_len = 0;
282 	cp_default.cp_loadavg.lg_total = 0;
283 	for (i = 0; i < S_LOADAVG_SZ; i++) {
284 		cp_default.cp_loadavg.lg_loads[i] = 0;
285 	}
286 	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
287 	cp_id_next = CP_DEFAULT + 1;
288 	cpupart_kstat_create(&cp_default);
289 	cp_numparts = 1;
290 	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
291 		cp_max_numparts = max_ncpus * 2 + 1;
292 	/*
293 	 * Allocate space for cp_default list of lgrploads
294 	 */
295 	cpupart_lpl_initialize(&cp_default);
296 
297 	/*
298 	 * The initial lpl topology is created in a special lpl list
299 	 * lpl_bootstrap. It should be copied to cp_default.
300 	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
301 	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
302 	 */
303 	lpl_topo_bootstrap(cp_default.cp_lgrploads,
304 	    cp_default.cp_nlgrploads);
305 
306 
307 	cp_default.cp_attr = PSET_NOESCAPE;
308 	cp_numparts_nonempty = 1;
309 	/*
310 	 * Set t0's home
311 	 */
312 	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
313 
314 	bitset_init(&cp_default.cp_cmt_pgs);
315 	bitset_init(&cp_default.cp_haltset);
316 	bitset_resize(&cp_default.cp_haltset, max_ncpus);
317 }
318 
319 
/*
 * Move CPU `cp' out of its current partition and into `newpp'.
 *
 * The caller must hold cpu_lock.  If `forced' is non-zero every thread
 * bound to the CPU is unbound first; otherwise only soft-bound threads
 * are.
 *
 * Returns 0 on success (including the case where the CPU already belongs
 * to `newpp'), EBUSY when bound threads or a bound cyclic prevent the
 * move, or the error returned by cpu_unbind().  On every failure path the
 * CPU_CPUPART_OUT notification sent at the start is balanced with a
 * CPU_CPUPART_IN notification before returning.
 */
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t	*ncp, *newlist;
	kthread_t *t;
	int	move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t 	*p;
	int lgrp_diff_lpl;
	lpl_t	*cpu_lpl;
	int	ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it. Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the processor
		 * set. Note that no threads should be bound to this CPU since
		 * cpupart_move_threads will refuse to move bound threads to
		 * other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If forced flag is set unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions. This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				/*
				 * Give up: undo the PG notifications made
				 * above and clear cpu_inmotion.
				 */
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	/* remember the CPU's lpl as it was before lgrp_config() runs below */
	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		/* unlink cp; ncp is the CPU that followed it in oldpp */
		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		/* insert cp just before the current list head */
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 *
			 * NOTE(review): the sentence above describes the
			 * "none found" case, but the test below fires when
			 * lgrp_diff_lpl is non-zero (at least one such thread
			 * WAS found).  Confirm which of the two is intended.
			 */

			if (lgrp_diff_lpl)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}
662 
663 /*
664  * Check if thread can be moved to a new cpu partition.  Called by
665  * cpupart_move_thread() and pset_bind_start().
666  */
667 int
668 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
669 {
670 	ASSERT(MUTEX_HELD(&cpu_lock));
671 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
672 	ASSERT(cp != NULL);
673 	ASSERT(THREAD_LOCK_HELD(tp));
674 
675 	/*
676 	 * CPU-bound threads can't be moved.
677 	 */
678 	if (!ignore) {
679 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
680 		    tp->t_weakbound_cpu;
681 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
682 			return (EBUSY);
683 	}
684 
685 	if (tp->t_cid == sysdccid) {
686 		return (EINVAL);	/* For now, sysdc threads can't move */
687 	}
688 
689 	return (0);
690 }
691 
692 /*
693  * Move thread to new partition.  If ignore is non-zero, then CPU
694  * bindings should be ignored (this is used when destroying a
695  * partition).
696  */
697 static int
698 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
699     void *projbuf, void *zonebuf)
700 {
701 	cpupart_t *oldpp = tp->t_cpupart;
702 	int ret;
703 
704 	ASSERT(MUTEX_HELD(&cpu_lock));
705 	ASSERT(MUTEX_HELD(&pidlock));
706 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
707 	ASSERT(newpp != NULL);
708 
709 	if (newpp->cp_cpulist == NULL)
710 		return (EINVAL);
711 
712 	/*
713 	 * Check for errors first.
714 	 */
715 	thread_lock(tp);
716 	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
717 		thread_unlock(tp);
718 		return (ret);
719 	}
720 
721 	/* move the thread */
722 	if (oldpp != newpp) {
723 		/*
724 		 * Make the thread switch to the new partition.
725 		 */
726 		tp->t_cpupart = newpp;
727 		ASSERT(tp->t_lpl != NULL);
728 		/*
729 		 * Leave the thread on the same lgroup if possible; otherwise
730 		 * choose a new lgroup for it.  In either case, update its
731 		 * t_lpl.
732 		 */
733 		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
734 		    tp->t_lgrp_affinity == NULL) {
735 			/*
736 			 * The thread's lgroup has CPUs in the thread's new
737 			 * partition, so the thread can stay assigned to the
738 			 * same lgroup.  Update its t_lpl to point to the
739 			 * lpl_t for its lgroup in its new partition.
740 			 */
741 			lgrp_move_thread(tp, &tp->t_cpupart->\
742 			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
743 		} else {
744 			/*
745 			 * The thread's lgroup has no cpus in its new
746 			 * partition or it has specified lgroup affinities,
747 			 * so choose the best lgroup for the thread and
748 			 * assign it to that lgroup.
749 			 */
750 			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
751 			    1);
752 		}
753 		/*
754 		 * make sure lpl points to our own partition
755 		 */
756 		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
757 		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
758 		    tp->t_cpupart->cp_nlgrploads));
759 
760 		ASSERT(tp->t_lpl->lpl_ncpu > 0);
761 
762 		if (tp->t_state == TS_ONPROC) {
763 			cpu_surrender(tp);
764 		} else if (tp->t_state == TS_RUN) {
765 			(void) dispdeq(tp);
766 			setbackdq(tp);
767 		}
768 	}
769 
770 	/*
771 	 * Our binding has changed; set TP_CHANGEBIND.
772 	 */
773 	tp->t_proc_flag |= TP_CHANGEBIND;
774 	aston(tp);
775 
776 	thread_unlock(tp);
777 	fss_changepset(tp, newpp, projbuf, zonebuf);
778 
779 	return (0);		/* success */
780 }
781 
782 
783 /*
784  * This function binds a thread to a partition.  Must be called with the
785  * p_lock of the containing process held (to keep the thread from going
786  * away), and thus also with cpu_lock held (since cpu_lock must be
787  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
788  * should be ignored (this is used when destroying a partition).
789  */
790 int
791 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
792     void *zonebuf)
793 {
794 	cpupart_t	*newpp;
795 
796 	ASSERT(pool_lock_held());
797 	ASSERT(MUTEX_HELD(&cpu_lock));
798 	ASSERT(MUTEX_HELD(&pidlock));
799 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
800 
801 	if (psid == PS_NONE)
802 		newpp = &cp_default;
803 	else {
804 		newpp = cpupart_find(psid);
805 		if (newpp == NULL) {
806 			return (EINVAL);
807 		}
808 	}
809 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
810 }
811 
812 
813 /*
814  * Create a new partition.  On MP systems, this also allocates a
815  * kpreempt disp queue for that partition.
816  */
817 int
818 cpupart_create(psetid_t *psid)
819 {
820 	cpupart_t	*pp;
821 
822 	ASSERT(pool_lock_held());
823 
824 	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
825 	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
826 	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
827 	    KM_SLEEP);
828 
829 	mutex_enter(&cpu_lock);
830 	if (cp_numparts == cp_max_numparts) {
831 		mutex_exit(&cpu_lock);
832 		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
833 		pp->cp_lgrploads = NULL;
834 		kmem_free(pp, sizeof (cpupart_t));
835 		return (ENOMEM);
836 	}
837 	cp_numparts++;
838 	/* find the next free partition ID */
839 	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
840 		cp_id_next++;
841 	pp->cp_id = cp_id_next++;
842 	pp->cp_ncpus = 0;
843 	pp->cp_cpulist = NULL;
844 	pp->cp_attr = 0;
845 	klgrpset_clear(pp->cp_lgrpset);
846 	pp->cp_kp_queue.disp_maxrunpri = -1;
847 	pp->cp_kp_queue.disp_max_unbound_pri = -1;
848 	pp->cp_kp_queue.disp_cpu = NULL;
849 	pp->cp_gen = 0;
850 	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
851 	*psid = CPTOPS(pp->cp_id);
852 	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
853 	cpupart_kstat_create(pp);
854 	cpupart_lpl_initialize(pp);
855 
856 	bitset_init(&pp->cp_cmt_pgs);
857 
858 	/*
859 	 * Initialize and size the partition's bitset of halted CPUs
860 	 */
861 	bitset_init(&pp->cp_haltset);
862 	bitset_resize(&pp->cp_haltset, max_ncpus);
863 
864 	/*
865 	 * Pause all CPUs while changing the partition list, to make sure
866 	 * the clock thread (which traverses the list without holding
867 	 * cpu_lock) isn't running.
868 	 */
869 	pause_cpus(NULL);
870 	pp->cp_next = cp_list_head;
871 	pp->cp_prev = cp_list_head->cp_prev;
872 	cp_list_head->cp_prev->cp_next = pp;
873 	cp_list_head->cp_prev = pp;
874 	start_cpus();
875 	mutex_exit(&cpu_lock);
876 
877 	return (0);
878 }
879 
880 /*
881  * Move threads from specified partition to cp_default. If `force' is specified,
882  * move all threads, otherwise move only soft-bound threads.
883  */
884 static int
885 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
886 {
887 	void 	*projbuf, *zonebuf;
888 	kthread_t *t;
889 	proc_t	*p;
890 	int	err = 0;
891 	psetid_t psid = pp->cp_id;
892 
893 	ASSERT(pool_lock_held());
894 	ASSERT(MUTEX_HELD(&cpu_lock));
895 
896 	if (pp == NULL || pp == &cp_default) {
897 		return (EINVAL);
898 	}
899 
900 	/*
901 	 * Pre-allocate enough buffers for FSS for all active projects and
902 	 * for all active zones on the system.  Unused buffers will be
903 	 * freed later by fss_freebuf().
904 	 */
905 	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
906 	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
907 
908 	mutex_enter(&pidlock);
909 	t = curthread;
910 	do {
911 		if (t->t_bind_pset == psid) {
912 again:			p = ttoproc(t);
913 			mutex_enter(&p->p_lock);
914 			if (ttoproc(t) != p) {
915 				/*
916 				 * lwp_exit has changed this thread's process
917 				 * pointer before we grabbed its p_lock.
918 				 */
919 				mutex_exit(&p->p_lock);
920 				goto again;
921 			}
922 
923 			/*
924 			 * Can only unbind threads which have revocable binding
925 			 * unless force unbinding requested.
926 			 */
927 			if (unbind_all || TB_PSET_IS_SOFT(t)) {
928 				err = cpupart_bind_thread(t, PS_NONE, 1,
929 				    projbuf, zonebuf);
930 				if (err) {
931 					mutex_exit(&p->p_lock);
932 					mutex_exit(&pidlock);
933 					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
934 					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
935 					return (err);
936 				}
937 				t->t_bind_pset = PS_NONE;
938 			}
939 			mutex_exit(&p->p_lock);
940 		}
941 		t = t->t_next;
942 	} while (t != curthread);
943 
944 	mutex_exit(&pidlock);
945 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
946 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
947 	return (err);
948 }
949 
950 /*
951  * Destroy a partition.
952  */
953 int
954 cpupart_destroy(psetid_t psid)
955 {
956 	cpu_t	*cp, *first_cp;
957 	cpupart_t *pp, *newpp;
958 	int	err = 0;
959 
960 	ASSERT(pool_lock_held());
961 	mutex_enter(&cpu_lock);
962 
963 	pp = cpupart_find(psid);
964 	if (pp == NULL || pp == &cp_default) {
965 		mutex_exit(&cpu_lock);
966 		return (EINVAL);
967 	}
968 
969 	/*
970 	 * Unbind all the threads currently bound to the partition.
971 	 */
972 	err = cpupart_unbind_threads(pp, B_TRUE);
973 	if (err) {
974 		mutex_exit(&cpu_lock);
975 		return (err);
976 	}
977 
978 	newpp = &cp_default;
979 	while ((cp = pp->cp_cpulist) != NULL) {
980 		if (err = cpupart_move_cpu(cp, newpp, 0)) {
981 			mutex_exit(&cpu_lock);
982 			return (err);
983 		}
984 	}
985 
986 	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
987 	ASSERT(bitset_is_null(&pp->cp_haltset));
988 
989 	/*
990 	 * Teardown the partition's group of active CMT PGs and halted
991 	 * CPUs now that they have all left.
992 	 */
993 	bitset_fini(&pp->cp_cmt_pgs);
994 	bitset_fini(&pp->cp_haltset);
995 
996 	/*
997 	 * Reset the pointers in any offline processors so they won't
998 	 * try to rejoin the destroyed partition when they're turned
999 	 * online.
1000 	 */
1001 	first_cp = cp = CPU;
1002 	do {
1003 		if (cp->cpu_part == pp) {
1004 			ASSERT(cp->cpu_flags & CPU_OFFLINE);
1005 			cp->cpu_part = newpp;
1006 		}
1007 		cp = cp->cpu_next;
1008 	} while (cp != first_cp);
1009 
1010 	/*
1011 	 * Pause all CPUs while changing the partition list, to make sure
1012 	 * the clock thread (which traverses the list without holding
1013 	 * cpu_lock) isn't running.
1014 	 */
1015 	pause_cpus(NULL);
1016 	pp->cp_prev->cp_next = pp->cp_next;
1017 	pp->cp_next->cp_prev = pp->cp_prev;
1018 	if (cp_list_head == pp)
1019 		cp_list_head = pp->cp_next;
1020 	start_cpus();
1021 
1022 	if (cp_id_next > pp->cp_id)
1023 		cp_id_next = pp->cp_id;
1024 
1025 	if (pp->cp_kstat)
1026 		kstat_delete(pp->cp_kstat);
1027 
1028 	cp_numparts--;
1029 
1030 	disp_kp_free(&pp->cp_kp_queue);
1031 
1032 	cpupart_lpl_teardown(pp);
1033 
1034 	kmem_free(pp, sizeof (cpupart_t));
1035 	mutex_exit(&cpu_lock);
1036 
1037 	return (err);
1038 }
1039 
1040 
1041 /*
1042  * Return the ID of the partition to which the specified processor belongs.
1043  */
1044 psetid_t
1045 cpupart_query_cpu(cpu_t *cp)
1046 {
1047 	ASSERT(MUTEX_HELD(&cpu_lock));
1048 
1049 	return (CPTOPS(cp->cpu_part->cp_id));
1050 }
1051 
1052 
1053 /*
1054  * Attach a processor to an existing partition.
1055  */
1056 int
1057 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1058 {
1059 	cpupart_t	*pp;
1060 	int		err;
1061 
1062 	ASSERT(pool_lock_held());
1063 	ASSERT(MUTEX_HELD(&cpu_lock));
1064 
1065 	pp = cpupart_find(psid);
1066 	if (pp == NULL)
1067 		return (EINVAL);
1068 	if (cp->cpu_flags & CPU_OFFLINE)
1069 		return (EINVAL);
1070 
1071 	err = cpupart_move_cpu(cp, pp, forced);
1072 	return (err);
1073 }
1074 
1075 /*
1076  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1077  * this just checks for a valid partition.  If numcpus is non-NULL but
1078  * cpulist is NULL, the current number of cpus is stored in *numcpus.
1079  * If both are non-NULL, the current number of cpus is stored in *numcpus,
1080  * and a list of those cpus up to the size originally in *numcpus is
1081  * stored in cpulist[].  Also, store the processor set id in *psid.
1082  * This is useful in case the processor set id passed in was PS_MYID.
1083  */
1084 int
1085 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1086 {
1087 	cpupart_t	*pp;
1088 	uint_t		ncpus;
1089 	cpu_t		*c;
1090 	int		i;
1091 
1092 	mutex_enter(&cpu_lock);
1093 	pp = cpupart_find(*psid);
1094 	if (pp == NULL) {
1095 		mutex_exit(&cpu_lock);
1096 		return (EINVAL);
1097 	}
1098 	*psid = CPTOPS(pp->cp_id);
1099 	ncpus = pp->cp_ncpus;
1100 	if (numcpus) {
1101 		if (ncpus > *numcpus) {
1102 			/*
1103 			 * Only copy as many cpus as were passed in, but
1104 			 * pass back the real number.
1105 			 */
1106 			uint_t t = ncpus;
1107 			ncpus = *numcpus;
1108 			*numcpus = t;
1109 		} else
1110 			*numcpus = ncpus;
1111 
1112 		if (cpulist) {
1113 			c = pp->cp_cpulist;
1114 			for (i = 0; i < ncpus; i++) {
1115 				ASSERT(c != NULL);
1116 				cpulist[i] = c->cpu_id;
1117 				c = c->cpu_next_part;
1118 			}
1119 		}
1120 	}
1121 	mutex_exit(&cpu_lock);
1122 	return (0);
1123 }
1124 
1125 /*
1126  * Reallocate kpreempt queues for each CPU partition.  Called from
1127  * disp_setup when a new scheduling class is loaded that increases the
1128  * number of priorities in the system.
1129  */
1130 void
1131 cpupart_kpqalloc(pri_t npri)
1132 {
1133 	cpupart_t *cpp;
1134 
1135 	ASSERT(MUTEX_HELD(&cpu_lock));
1136 	cpp = cp_list_head;
1137 	do {
1138 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1139 		cpp = cpp->cp_next;
1140 	} while (cpp != cp_list_head);
1141 }
1142 
1143 int
1144 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1145 {
1146 	cpupart_t *cp;
1147 	int i;
1148 
1149 	ASSERT(nelem >= 0);
1150 	ASSERT(nelem <= LOADAVG_NSTATS);
1151 	ASSERT(MUTEX_HELD(&cpu_lock));
1152 
1153 	cp = cpupart_find(psid);
1154 	if (cp == NULL)
1155 		return (EINVAL);
1156 	for (i = 0; i < nelem; i++)
1157 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1158 
1159 	return (0);
1160 }
1161 
1162 
1163 uint_t
1164 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1165 {
1166 	uint_t numpart = 0;
1167 	cpupart_t *cp;
1168 
1169 	ASSERT(MUTEX_HELD(&cpu_lock));
1170 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1171 
1172 	if (list != NULL) {
1173 		cp = cp_list_head;
1174 		do {
1175 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1176 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1177 				if (numpart == nelem)
1178 					break;
1179 				list[numpart++] = CPTOPS(cp->cp_id);
1180 			}
1181 			cp = cp->cp_next;
1182 		} while (cp != cp_list_head);
1183 	}
1184 
1185 	ASSERT(numpart < cp_numparts);
1186 
1187 	if (flag == CP_ALL)
1188 		numpart = cp_numparts - 1; /* leave out default partition */
1189 	else if (flag == CP_NONEMPTY)
1190 		numpart = cp_numparts_nonempty;
1191 
1192 	return (numpart);
1193 }
1194 
1195 int
1196 cpupart_setattr(psetid_t psid, uint_t attr)
1197 {
1198 	cpupart_t *cp;
1199 
1200 	ASSERT(pool_lock_held());
1201 
1202 	mutex_enter(&cpu_lock);
1203 	if ((cp = cpupart_find(psid)) == NULL) {
1204 		mutex_exit(&cpu_lock);
1205 		return (EINVAL);
1206 	}
1207 	/*
1208 	 * PSET_NOESCAPE attribute for default cpu partition is always set
1209 	 */
1210 	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1211 		mutex_exit(&cpu_lock);
1212 		return (EINVAL);
1213 	}
1214 	cp->cp_attr = attr;
1215 	mutex_exit(&cpu_lock);
1216 	return (0);
1217 }
1218 
1219 int
1220 cpupart_getattr(psetid_t psid, uint_t *attrp)
1221 {
1222 	cpupart_t *cp;
1223 
1224 	mutex_enter(&cpu_lock);
1225 	if ((cp = cpupart_find(psid)) == NULL) {
1226 		mutex_exit(&cpu_lock);
1227 		return (EINVAL);
1228 	}
1229 	*attrp = cp->cp_attr;
1230 	mutex_exit(&cpu_lock);
1231 	return (0);
1232 }
1233