xref: /titanic_41/usr/src/uts/common/disp/cpupart.c (revision 4b22b9337f359bfd063322244f5336cc7c6ffcfa)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/systm.h>
30 #include <sys/cmn_err.h>
31 #include <sys/cpuvar.h>
32 #include <sys/thread.h>
33 #include <sys/disp.h>
34 #include <sys/kmem.h>
35 #include <sys/debug.h>
36 #include <sys/cpupart.h>
37 #include <sys/pset.h>
38 #include <sys/var.h>
39 #include <sys/cyclic.h>
40 #include <sys/lgrp.h>
41 #include <sys/pghw.h>
42 #include <sys/loadavg.h>
43 #include <sys/class.h>
44 #include <sys/fss.h>
45 #include <sys/pool.h>
46 #include <sys/pool_pset.h>
47 #include <sys/policy.h>
48 
49 /*
50  * Calling pool_lock() protects the pools configuration, which includes
51  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
52  * partitions from being created or destroyed while the lock is held.
53  * The lock ordering with respect to related locks is:
54  *
55  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
56  *
57  * Blocking memory allocations may be made while holding "pool_lock"
58  * or cpu_lock.
59  */
60 
61 /*
62  * The cp_default partition is allocated statically, but its lgroup load average
63  * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
64  * saves some memory since the space allocated reflects the actual number of
65  * lgroups supported by the platform. The lgrp facility provides a temporary
66  * space to hold lpl information during system bootstrap.
67  */
68 
69 cpupart_t		*cp_list_head;
70 cpupart_t		cp_default;
71 struct mach_cpupart	cp_default_mach;
72 static cpupartid_t	cp_id_next;
73 uint_t			cp_numparts;
74 uint_t			cp_numparts_nonempty;
75 
76 /*
77  * Need to limit total number of partitions to avoid slowing down the
78  * clock code too much.  The clock code traverses the list of
79  * partitions and needs to be able to execute in a reasonable amount
80  * of time (less than 1/hz seconds).  The maximum is sized based on
81  * max_ncpus so it shouldn't be a problem unless there are large
82  * numbers of empty partitions.
83  */
84 static uint_t		cp_max_numparts;
85 
86 /*
87  * Processor sets and CPU partitions are different but related concepts.
88  * A processor set is a user-level abstraction allowing users to create
89  * sets of CPUs and bind threads exclusively to those sets.  A CPU
90  * partition is a kernel dispatcher object consisting of a set of CPUs
91  * and a global dispatch queue.  The processor set abstraction is
92  * implemented via a CPU partition, and currently there is a 1-1
93  * mapping between processor sets and partitions (excluding the default
94  * partition, which is not visible as a processor set).  Hence, the
95  * numbering for processor sets and CPU partitions is identical.  This
96  * may not always be true in the future, and these macros could become
97  * less trivial if we support e.g. a processor set containing multiple
98  * CPU partitions.
99  */
100 #define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
101 #define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
102 
103 /*
104  * Find a CPU partition given a processor set ID.
105  */
106 static cpupart_t *
107 cpupart_find_all(psetid_t psid)
108 {
109 	cpupart_t *cp;
110 	cpupartid_t cpid = PSTOCP(psid);
111 
112 	ASSERT(MUTEX_HELD(&cpu_lock));
113 
114 	/* default partition not visible as a processor set */
115 	if (psid == CP_DEFAULT)
116 		return (NULL);
117 
118 	if (psid == PS_MYID)
119 		return (curthread->t_cpupart);
120 
121 	cp = cp_list_head;
122 	do {
123 		if (cp->cp_id == cpid)
124 			return (cp);
125 		cp = cp->cp_next;
126 	} while (cp != cp_list_head);
127 	return (NULL);
128 }
129 
130 /*
131  * Find a CPU partition given a processor set ID if the processor set
132  * should be visible from the calling zone.
133  */
134 cpupart_t *
135 cpupart_find(psetid_t psid)
136 {
137 	cpupart_t *cp;
138 
139 	ASSERT(MUTEX_HELD(&cpu_lock));
140 	cp = cpupart_find_all(psid);
141 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
142 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
143 			return (NULL);
144 	return (cp);
145 }
146 
147 static int
148 cpupart_kstat_update(kstat_t *ksp, int rw)
149 {
150 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
151 	cpupart_kstat_t *cpksp = ksp->ks_data;
152 
153 	if (rw == KSTAT_WRITE)
154 		return (EACCES);
155 
156 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
157 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
158 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
159 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
160 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
161 	    (16 - FSHIFT);
162 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
163 	    (16 - FSHIFT);
164 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
165 	    (16 - FSHIFT);
166 	return (0);
167 }
168 
/*
 * Create and install the "unix:<id>:pset" named kstat for the given
 * partition.  The kstat exports the counters filled in on demand by
 * cpupart_kstat_update().  Caller must hold cpu_lock.
 */
static void
cpupart_kstat_create(cpupart_t *cp)
{
	kstat_t *ksp;
	zoneid_t zoneid;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * We have a bit of a chicken-egg problem since this code will
	 * get called to create the kstats for CP_DEFAULT before the
	 * pools framework gets initialized.  We circumvent the problem
	 * by special-casing cp_default.
	 */
	if (cp != &cp_default && pool_pset_enabled())
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = ALL_ZONES;
	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
	    KSTAT_TYPE_NAMED,
	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
	if (ksp != NULL) {
		cpupart_kstat_t *cpksp = ksp->ks_data;

		kstat_named_init(&cpksp->cpk_updates, "updates",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_runnable, "runnable",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_waiting, "waiting",
		    KSTAT_DATA_UINT64);
		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
		    KSTAT_DATA_UINT32);
		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
		    KSTAT_DATA_UINT32);

		/* Values are computed lazily by cpupart_kstat_update(). */
		ksp->ks_update = cpupart_kstat_update;
		ksp->ks_private = cp;

		kstat_install(ksp);
	}
	/* May be NULL if creation failed; cpupart_destroy() checks this. */
	cp->cp_kstat = ksp;
}
215 
/*
 * Initialize the default partition and kpreempt disp queue.
 *
 * Called during boot: cp_default becomes the sole element of the
 * circular partition list, its lpl array is sized from
 * lgrp_plat_max_lgrps(), and the bootstrap lpl topology is copied in
 * via lpl_topo_bootstrap().
 */
void
cpupart_initialize_default(void)
{
	lgrp_id_t i;

	/* cp_default is the only entry on the circular partition list. */
	cp_list_head = &cp_default;
	cp_default.cp_next = &cp_default;
	cp_default.cp_prev = &cp_default;
	cp_default.cp_id = CP_DEFAULT;
	/* Empty kp queue: -1 means no runnable/unbound priorities yet. */
	cp_default.cp_kp_queue.disp_maxrunpri = -1;
	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
	cp_default.cp_kp_queue.disp_cpu = NULL;
	cp_default.cp_gen = 0;
	cp_default.cp_loadavg.lg_cur = 0;
	cp_default.cp_loadavg.lg_len = 0;
	cp_default.cp_loadavg.lg_total = 0;
	for (i = 0; i < S_LOADAVG_SZ; i++) {
		cp_default.cp_loadavg.lg_loads[i] = 0;
	}
	CPUSET_ZERO(cp_default.cp_mach->mc_haltset);
	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
	cp_id_next = CP_DEFAULT + 1;
	cpupart_kstat_create(&cp_default);
	cp_numparts = 1;
	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
		cp_max_numparts = max_ncpus * 2 + 1;
	/*
	 * Allocate space for cp_default list of lgrploads
	 */
	cp_default.cp_nlgrploads = lgrp_plat_max_lgrps();
	cp_default.cp_lgrploads = kmem_zalloc(sizeof (lpl_t) *
	    cp_default.cp_nlgrploads, KM_SLEEP);

	/*
	 * The initial lpl topology is created in a special lpl list
	 * lpl_bootstrap. It should be copied to cp_default.
	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
	 */
	lpl_topo_bootstrap(cp_default.cp_lgrploads,
	    cp_default.cp_nlgrploads);

	for (i = 0; i < cp_default.cp_nlgrploads; i++) {
		cp_default.cp_lgrploads[i].lpl_lgrpid = i;
	}
	cp_default.cp_attr = PSET_NOESCAPE;
	cp_numparts_nonempty = 1;
	/*
	 * Set t0's home
	 */
	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];

	bitset_init(&cp_default.cp_cmt_pgs);
}
273 
274 
275 static int
276 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
277 {
278 	cpupart_t *oldpp;
279 	cpu_t	*ncp, *newlist;
280 	kthread_t *t;
281 	int	move_threads = 1;
282 	lgrp_id_t lgrpid;
283 	proc_t 	*p;
284 	int lgrp_diff_lpl;
285 	lpl_t	*cpu_lpl;
286 	int	ret;
287 
288 	ASSERT(MUTEX_HELD(&cpu_lock));
289 	ASSERT(newpp != NULL);
290 
291 	oldpp = cp->cpu_part;
292 	ASSERT(oldpp != NULL);
293 	ASSERT(oldpp->cp_ncpus > 0);
294 
295 	if (newpp == oldpp) {
296 		/*
297 		 * Don't need to do anything.
298 		 */
299 		return (0);
300 	}
301 
302 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
303 
304 	if (!disp_bound_partition(cp, 0)) {
305 		/*
306 		 * Don't need to move threads if there are no threads in
307 		 * the partition.  Note that threads can't enter the
308 		 * partition while we're holding cpu_lock.
309 		 */
310 		move_threads = 0;
311 	} else if (oldpp->cp_ncpus == 1) {
312 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
313 		return (EBUSY);
314 	}
315 
316 	if (forced && (ret = cpu_unbind(cp->cpu_id)) != 0) {
317 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
318 		return (ret);
319 	}
320 
321 	/*
322 	 * Stop further threads weak binding to this cpu.
323 	 */
324 	cpu_inmotion = cp;
325 	membar_enter();
326 
327 	/*
328 	 * Notify the Processor Groups subsystem that the CPU
329 	 * will be moving cpu partitions. This is done before
330 	 * CPUs are paused to provide an opportunity for any
331 	 * needed memory allocations.
332 	 */
333 	pg_cpupart_out(cp, oldpp);
334 	pg_cpupart_in(cp, newpp);
335 
336 again:
337 	if (move_threads) {
338 		int loop_count;
339 		/*
340 		 * Check for threads strong or weak bound to this CPU.
341 		 */
342 		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
343 			if (loop_count >= 5) {
344 				cpu_state_change_notify(cp->cpu_id,
345 				    CPU_CPUPART_IN);
346 				pg_cpupart_out(cp, newpp);
347 				pg_cpupart_in(cp, oldpp);
348 				cpu_inmotion = NULL;
349 				return (EBUSY);	/* some threads still bound */
350 			}
351 			delay(1);
352 		}
353 	}
354 
355 	/*
356 	 * Before we actually start changing data structures, notify
357 	 * the cyclic subsystem that we want to move this CPU out of its
358 	 * partition.
359 	 */
360 	if (!cyclic_move_out(cp)) {
361 		/*
362 		 * This CPU must be the last CPU in a processor set with
363 		 * a bound cyclic.
364 		 */
365 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
366 		pg_cpupart_out(cp, newpp);
367 		pg_cpupart_in(cp, oldpp);
368 		cpu_inmotion = NULL;
369 		return (EBUSY);
370 	}
371 
372 	pause_cpus(cp);
373 
374 	if (move_threads) {
375 		/*
376 		 * The thread on cpu before the pause thread may have read
377 		 * cpu_inmotion before we raised the barrier above.  Check
378 		 * again.
379 		 */
380 		if (disp_bound_threads(cp, 1)) {
381 			start_cpus();
382 			goto again;
383 		}
384 
385 	}
386 
387 	/*
388 	 * Now that CPUs are paused, let the PG subsystem perform
389 	 * any necessary data structure updates.
390 	 */
391 	pg_cpupart_move(cp, oldpp, newpp);
392 
393 	/* save this cpu's lgroup -- it'll be the same in the new partition */
394 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
395 
396 	cpu_lpl = cp->cpu_lpl;
397 	/*
398 	 * let the lgroup framework know cp has left the partition
399 	 */
400 	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
401 
402 	/* move out of old partition */
403 	oldpp->cp_ncpus--;
404 	if (oldpp->cp_ncpus > 0) {
405 
406 		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
407 		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
408 		if (oldpp->cp_cpulist == cp) {
409 			oldpp->cp_cpulist = ncp;
410 		}
411 	} else {
412 		ncp = oldpp->cp_cpulist = NULL;
413 		cp_numparts_nonempty--;
414 		ASSERT(cp_numparts_nonempty != 0);
415 	}
416 	oldpp->cp_gen++;
417 
418 	/* move into new partition */
419 	newlist = newpp->cp_cpulist;
420 	if (newlist == NULL) {
421 		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
422 		cp_numparts_nonempty++;
423 		ASSERT(cp_numparts_nonempty != 0);
424 	} else {
425 		cp->cpu_next_part = newlist;
426 		cp->cpu_prev_part = newlist->cpu_prev_part;
427 		newlist->cpu_prev_part->cpu_next_part = cp;
428 		newlist->cpu_prev_part = cp;
429 	}
430 	cp->cpu_part = newpp;
431 	newpp->cp_ncpus++;
432 	newpp->cp_gen++;
433 
434 	ASSERT(CPUSET_ISNULL(newpp->cp_mach->mc_haltset));
435 	ASSERT(CPUSET_ISNULL(oldpp->cp_mach->mc_haltset));
436 
437 	/*
438 	 * let the lgroup framework know cp has entered the partition
439 	 */
440 	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
441 
442 	/*
443 	 * If necessary, move threads off processor.
444 	 */
445 	if (move_threads) {
446 		ASSERT(ncp != NULL);
447 
448 		/*
449 		 * Walk thru the active process list to look for
450 		 * threads that need to have a new home lgroup,
451 		 * or the last CPU they run on is the same CPU
452 		 * being moved out of the partition.
453 		 */
454 
455 		for (p = practive; p != NULL; p = p->p_next) {
456 
457 			t = p->p_tlist;
458 
459 			if (t == NULL)
460 				continue;
461 
462 			lgrp_diff_lpl = 0;
463 
464 			do {
465 
466 				ASSERT(t->t_lpl != NULL);
467 
468 				/*
469 				 * Update the count of how many threads are
470 				 * in this CPU's lgroup but have a different lpl
471 				 */
472 
473 				if (t->t_lpl != cpu_lpl &&
474 				    t->t_lpl->lpl_lgrpid == lgrpid)
475 					lgrp_diff_lpl++;
476 				/*
477 				 * If the lgroup that t is assigned to no
478 				 * longer has any CPUs in t's partition,
479 				 * we'll have to choose a new lgroup for t.
480 				 */
481 
482 				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
483 				    t->t_cpupart)) {
484 					lgrp_move_thread(t,
485 					    lgrp_choose(t, t->t_cpupart), 0);
486 				}
487 
488 				/*
489 				 * make sure lpl points to our own partition
490 				 */
491 				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
492 				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
493 					t->t_cpupart->cp_nlgrploads));
494 
495 				ASSERT(t->t_lpl->lpl_ncpu > 0);
496 
497 				/* Update CPU last ran on if it was this CPU */
498 				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
499 				    t->t_bound_cpu != cp) {
500 					t->t_cpu = disp_lowpri_cpu(ncp,
501 					    t->t_lpl, t->t_pri, NULL);
502 				}
503 				t = t->t_forw;
504 			} while (t != p->p_tlist);
505 
506 			/*
507 			 * Didn't find any threads in the same lgroup as this
508 			 * CPU with a different lpl, so remove the lgroup from
509 			 * the process lgroup bitmask.
510 			 */
511 
512 			if (lgrp_diff_lpl)
513 				klgrpset_del(p->p_lgrpset, lgrpid);
514 		}
515 
516 		/*
517 		 * Walk thread list looking for threads that need to be
518 		 * rehomed, since there are some threads that are not in
519 		 * their process's p_tlist.
520 		 */
521 
522 		t = curthread;
523 
524 		do {
525 			ASSERT(t != NULL && t->t_lpl != NULL);
526 
527 			/*
528 			 * If the lgroup that t is assigned to no
529 			 * longer has any CPUs in t's partition,
530 			 * we'll have to choose a new lgroup for t.
531 			 * Also, choose best lgroup for home when
532 			 * thread has specified lgroup affinities,
533 			 * since there may be an lgroup with more
534 			 * affinity available after moving CPUs
535 			 * around.
536 			 */
537 			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
538 			    t->t_cpupart) || t->t_lgrp_affinity) {
539 				lgrp_move_thread(t,
540 				    lgrp_choose(t, t->t_cpupart), 1);
541 			}
542 
543 			/* make sure lpl points to our own partition */
544 			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
545 			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
546 				t->t_cpupart->cp_nlgrploads));
547 
548 			ASSERT(t->t_lpl->lpl_ncpu > 0);
549 
550 			/* Update CPU last ran on if it was this CPU */
551 			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
552 			    t->t_bound_cpu != cp) {
553 				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
554 				    t->t_pri, NULL);
555 			}
556 
557 			t = t->t_next;
558 		} while (t != curthread);
559 
560 		/*
561 		 * Clear off the CPU's run queue, and the kp queue if the
562 		 * partition is now empty.
563 		 */
564 		disp_cpu_inactive(cp);
565 
566 		/*
567 		 * Make cp switch to a thread from the new partition.
568 		 */
569 		cp->cpu_runrun = 1;
570 		cp->cpu_kprunrun = 1;
571 	}
572 
573 	cpu_inmotion = NULL;
574 	start_cpus();
575 
576 	/*
577 	 * Let anyone interested know that cpu has been added to the set.
578 	 */
579 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
580 
581 	/*
582 	 * Now let the cyclic subsystem know that it can reshuffle cyclics
583 	 * bound to the new processor set.
584 	 */
585 	cyclic_move_in(cp);
586 
587 	return (0);
588 }
589 
590 /*
591  * Check if thread can be moved to a new cpu partition.  Called by
592  * cpupart_move_thread() and pset_bind_start().
593  */
594 int
595 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
596 {
597 	ASSERT(MUTEX_HELD(&cpu_lock));
598 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
599 	ASSERT(cp != NULL);
600 	ASSERT(THREAD_LOCK_HELD(tp));
601 
602 	/*
603 	 * CPU-bound threads can't be moved.
604 	 */
605 	if (!ignore) {
606 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
607 		    tp->t_weakbound_cpu;
608 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
609 			return (EBUSY);
610 	}
611 	return (0);
612 }
613 
/*
 * Move thread to new partition.  If ignore is non-zero, then CPU
 * bindings should be ignored (this is used when destroying a
 * partition).
 *
 * The thread is rehomed to an lgroup appropriate for the new partition
 * and, if running or runnable, forced to reschedule so it lands on a
 * CPU of the new partition.  fss_changepset() is called with the
 * caller's pre-allocated projbuf/zonebuf buffers.
 *
 * Returns 0 on success, EINVAL if the target partition has no CPUs,
 * or the error from cpupart_movable_thread().  Caller must hold
 * cpu_lock, pidlock, and the thread's p_lock.
 */
static int
cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
    void *projbuf, void *zonebuf)
{
	cpupart_t *oldpp = tp->t_cpupart;
	int ret;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(MUTEX_HELD(&pidlock));
	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
	ASSERT(newpp != NULL);

	/* An empty partition cannot run any threads. */
	if (newpp->cp_cpulist == NULL)
		return (EINVAL);

	/*
	 * Check for errors first.
	 */
	thread_lock(tp);
	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
		thread_unlock(tp);
		return (ret);
	}

	/* move the thread */
	if (oldpp != newpp) {
		/*
		 * Make the thread switch to the new partition.
		 */
		tp->t_cpupart = newpp;
		ASSERT(tp->t_lpl != NULL);
		/*
		 * Leave the thread on the same lgroup if possible; otherwise
		 * choose a new lgroup for it.  In either case, update its
		 * t_lpl.
		 */
		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
		    tp->t_lgrp_affinity == NULL) {
			/*
			 * The thread's lgroup has CPUs in the thread's new
			 * partition, so the thread can stay assigned to the
			 * same lgroup.  Update its t_lpl to point to the
			 * lpl_t for its lgroup in its new partition.
			 */
			lgrp_move_thread(tp, &tp->t_cpupart->\
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
		} else {
			/*
			 * The thread's lgroup has no cpus in its new
			 * partition or it has specified lgroup affinities,
			 * so choose the best lgroup for the thread and
			 * assign it to that lgroup.
			 */
			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
			    1);
		}
		/*
		 * make sure lpl points to our own partition
		 */
		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
			tp->t_cpupart->cp_nlgrploads));

		ASSERT(tp->t_lpl->lpl_ncpu > 0);

		if (tp->t_state == TS_ONPROC) {
			/* Running: force it off its current CPU. */
			cpu_surrender(tp);
		} else if (tp->t_state == TS_RUN) {
			/* Runnable: requeue it on the new partition. */
			(void) dispdeq(tp);
			setbackdq(tp);
		}
	}

	/*
	 * Our binding has changed; set TP_CHANGEBIND.
	 */
	tp->t_proc_flag |= TP_CHANGEBIND;
	aston(tp);

	thread_unlock(tp);
	fss_changepset(tp, newpp, projbuf, zonebuf);

	return (0);		/* success */
}
703 
704 
705 /*
706  * This function binds a thread to a partition.  Must be called with the
707  * p_lock of the containing process held (to keep the thread from going
708  * away), and thus also with cpu_lock held (since cpu_lock must be
709  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
710  * should be ignored (this is used when destroying a partition).
711  */
712 int
713 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
714     void *zonebuf)
715 {
716 	cpupart_t	*newpp;
717 
718 	ASSERT(pool_lock_held());
719 	ASSERT(MUTEX_HELD(&cpu_lock));
720 	ASSERT(MUTEX_HELD(&pidlock));
721 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
722 
723 	if (psid == PS_NONE)
724 		newpp = &cp_default;
725 	else {
726 		newpp = cpupart_find(psid);
727 		if (newpp == NULL) {
728 			return (EINVAL);
729 		}
730 	}
731 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
732 }
733 
734 
/*
 * Create a new partition.  On MP systems, this also allocates a
 * kpreempt disp queue for that partition.
 *
 * On success the new (empty) partition is linked into the global
 * partition list and its processor set ID is returned via *psid.
 * Returns ENOMEM when the cp_max_numparts limit has been reached.
 * Caller must hold pool_lock(); cpu_lock is taken internally.
 */
int
cpupart_create(psetid_t *psid)
{
	cpupart_t	*pp;
	lgrp_id_t	i;

	ASSERT(pool_lock_held());

	/*
	 * Perform all blocking allocations before taking cpu_lock.
	 */
	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
	pp->cp_mach = kmem_zalloc(sizeof (struct mach_cpupart), KM_SLEEP);
	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
	    KM_SLEEP);

	mutex_enter(&cpu_lock);
	if (cp_numparts == cp_max_numparts) {
		/* Partition limit reached; undo the allocations above. */
		mutex_exit(&cpu_lock);
		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
		pp->cp_lgrploads = NULL;
		kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
		kmem_free(pp, sizeof (cpupart_t));
		return (ENOMEM);
	}
	cp_numparts++;
	/* find the next free partition ID */
	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
		cp_id_next++;
	pp->cp_id = cp_id_next++;
	pp->cp_ncpus = 0;
	pp->cp_cpulist = NULL;
	pp->cp_attr = 0;
	klgrpset_clear(pp->cp_lgrpset);
	/* Empty kp queue: -1 means no runnable/unbound priorities yet. */
	pp->cp_kp_queue.disp_maxrunpri = -1;
	pp->cp_kp_queue.disp_max_unbound_pri = -1;
	pp->cp_kp_queue.disp_cpu = NULL;
	pp->cp_gen = 0;
	CPUSET_ZERO(pp->cp_mach->mc_haltset);
	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
	*psid = CPTOPS(pp->cp_id);
	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
	cpupart_kstat_create(pp);
	for (i = 0; i < pp->cp_nlgrploads; i++) {
		pp->cp_lgrploads[i].lpl_lgrpid = i;
	}
	bitset_init(&pp->cp_cmt_pgs);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_next = cp_list_head;
	pp->cp_prev = cp_list_head->cp_prev;
	cp_list_head->cp_prev->cp_next = pp;
	cp_list_head->cp_prev = pp;
	start_cpus();
	mutex_exit(&cpu_lock);

	return (0);
}
800 
801 
/*
 * Destroy a partition.
 *
 * All threads bound to the pset are first unbound (with "ignore" set,
 * so CPU bindings are broken), all CPUs are moved back to the default
 * partition, and the partition is then unlinked from the global list
 * and freed.  Returns EINVAL for the default or a nonexistent pset, or
 * an error from the unbind/move steps.  Caller must hold pool_lock();
 * cpu_lock is taken internally.
 */
int
cpupart_destroy(psetid_t psid)
{
	cpu_t	*cp, *first_cp;
	cpupart_t *pp, *newpp;
	int	err = 0;
	void 	*projbuf, *zonebuf;
	kthread_t *t;
	proc_t	*p;

	ASSERT(pool_lock_held());
	mutex_enter(&cpu_lock);

	pp = cpupart_find(psid);
	if (pp == NULL || pp == &cp_default) {
		mutex_exit(&cpu_lock);
		return (EINVAL);
	}

	/*
	 * Pre-allocate enough buffers for FSS for all active projects and
	 * for all active zones on the system.  Unused buffers will be
	 * freed later by fss_freebuf().
	 */
	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);

	/*
	 * First need to unbind all the threads currently bound to the
	 * partition.  Then do the actual destroy (which moves the CPUs).
	 */
	mutex_enter(&pidlock);
	t = curthread;
	do {
		if (t->t_bind_pset == psid) {
again:			p = ttoproc(t);
			mutex_enter(&p->p_lock);
			if (ttoproc(t) != p) {
				/*
				 * lwp_exit has changed this thread's process
				 * pointer before we grabbed its p_lock.
				 */
				mutex_exit(&p->p_lock);
				goto again;
			}
			err = cpupart_bind_thread(t, PS_NONE, 1,
			    projbuf, zonebuf);
			if (err) {
				mutex_exit(&p->p_lock);
				mutex_exit(&pidlock);
				mutex_exit(&cpu_lock);
				fss_freebuf(projbuf, FSS_ALLOC_PROJ);
				fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
				return (err);
			}
			t->t_bind_pset = PS_NONE;
			mutex_exit(&p->p_lock);
		}
		t = t->t_next;
	} while (t != curthread);

	mutex_exit(&pidlock);
	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);

	/* Evacuate every CPU back to the default partition. */
	newpp = &cp_default;
	while ((cp = pp->cp_cpulist) != NULL) {
		if (err = cpupart_move_cpu(cp, newpp, 0)) {
			mutex_exit(&cpu_lock);
			return (err);
		}
	}

	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
	ASSERT(CPUSET_ISNULL(pp->cp_mach->mc_haltset));

	/*
	 * Teardown the partition's group of active CMT PGs now that
	 * all of the CPUs have left.
	 */
	bitset_fini(&pp->cp_cmt_pgs);

	/*
	 * Reset the pointers in any offline processors so they won't
	 * try to rejoin the destroyed partition when they're turned
	 * online.
	 */
	first_cp = cp = CPU;
	do {
		if (cp->cpu_part == pp) {
			ASSERT(cp->cpu_flags & CPU_OFFLINE);
			cp->cpu_part = newpp;
		}
		cp = cp->cpu_next;
	} while (cp != first_cp);

	/*
	 * Pause all CPUs while changing the partition list, to make sure
	 * the clock thread (which traverses the list without holding
	 * cpu_lock) isn't running.
	 */
	pause_cpus(NULL);
	pp->cp_prev->cp_next = pp->cp_next;
	pp->cp_next->cp_prev = pp->cp_prev;
	if (cp_list_head == pp)
		cp_list_head = pp->cp_next;
	start_cpus();

	/* Allow the destroyed partition's ID to be reused. */
	if (cp_id_next > pp->cp_id)
		cp_id_next = pp->cp_id;

	if (pp->cp_kstat)
		kstat_delete(pp->cp_kstat);

	cp_numparts--;

	disp_kp_free(&pp->cp_kp_queue);
	kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
	pp->cp_lgrploads = NULL;
	kmem_free(pp->cp_mach, sizeof (struct mach_cpupart));
	kmem_free(pp, sizeof (cpupart_t));
	mutex_exit(&cpu_lock);

	return (err);
}
930 
931 
932 /*
933  * Return the ID of the partition to which the specified processor belongs.
934  */
935 psetid_t
936 cpupart_query_cpu(cpu_t *cp)
937 {
938 	ASSERT(MUTEX_HELD(&cpu_lock));
939 
940 	return (CPTOPS(cp->cpu_part->cp_id));
941 }
942 
943 
944 /*
945  * Attach a processor to an existing partition.
946  */
947 int
948 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
949 {
950 	cpupart_t	*pp;
951 	int		err;
952 
953 	ASSERT(pool_lock_held());
954 	ASSERT(MUTEX_HELD(&cpu_lock));
955 
956 	pp = cpupart_find(psid);
957 	if (pp == NULL)
958 		return (EINVAL);
959 	if (cp->cpu_flags & CPU_OFFLINE)
960 		return (EINVAL);
961 
962 	err = cpupart_move_cpu(cp, pp, forced);
963 	return (err);
964 }
965 
966 /*
967  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
968  * this just checks for a valid partition.  If numcpus is non-NULL but
969  * cpulist is NULL, the current number of cpus is stored in *numcpus.
970  * If both are non-NULL, the current number of cpus is stored in *numcpus,
971  * and a list of those cpus up to the size originally in *numcpus is
972  * stored in cpulist[].  Also, store the processor set id in *psid.
973  * This is useful in case the processor set id passed in was PS_MYID.
974  */
975 int
976 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
977 {
978 	cpupart_t	*pp;
979 	uint_t		ncpus;
980 	cpu_t		*c;
981 	int		i;
982 
983 	mutex_enter(&cpu_lock);
984 	pp = cpupart_find(*psid);
985 	if (pp == NULL) {
986 		mutex_exit(&cpu_lock);
987 		return (EINVAL);
988 	}
989 	*psid = CPTOPS(pp->cp_id);
990 	ncpus = pp->cp_ncpus;
991 	if (numcpus) {
992 		if (ncpus > *numcpus) {
993 			/*
994 			 * Only copy as many cpus as were passed in, but
995 			 * pass back the real number.
996 			 */
997 			uint_t t = ncpus;
998 			ncpus = *numcpus;
999 			*numcpus = t;
1000 		} else
1001 			*numcpus = ncpus;
1002 
1003 		if (cpulist) {
1004 			c = pp->cp_cpulist;
1005 			for (i = 0; i < ncpus; i++) {
1006 				ASSERT(c != NULL);
1007 				cpulist[i] = c->cpu_id;
1008 				c = c->cpu_next_part;
1009 			}
1010 		}
1011 	}
1012 	mutex_exit(&cpu_lock);
1013 	return (0);
1014 }
1015 
1016 /*
1017  * Reallocate kpreempt queues for each CPU partition.  Called from
1018  * disp_setup when a new scheduling class is loaded that increases the
1019  * number of priorities in the system.
1020  */
1021 void
1022 cpupart_kpqalloc(pri_t npri)
1023 {
1024 	cpupart_t *cpp;
1025 
1026 	ASSERT(MUTEX_HELD(&cpu_lock));
1027 	cpp = cp_list_head;
1028 	do {
1029 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1030 		cpp = cpp->cp_next;
1031 	} while (cpp != cp_list_head);
1032 }
1033 
1034 int
1035 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1036 {
1037 	cpupart_t *cp;
1038 	int i;
1039 
1040 	ASSERT(nelem >= 0);
1041 	ASSERT(nelem <= LOADAVG_NSTATS);
1042 	ASSERT(MUTEX_HELD(&cpu_lock));
1043 
1044 	cp = cpupart_find(psid);
1045 	if (cp == NULL)
1046 		return (EINVAL);
1047 	for (i = 0; i < nelem; i++)
1048 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1049 
1050 	return (0);
1051 }
1052 
1053 
1054 uint_t
1055 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1056 {
1057 	uint_t numpart = 0;
1058 	cpupart_t *cp;
1059 
1060 	ASSERT(MUTEX_HELD(&cpu_lock));
1061 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1062 
1063 	if (list != NULL) {
1064 		cp = cp_list_head;
1065 		do {
1066 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1067 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1068 				if (numpart == nelem)
1069 					break;
1070 				list[numpart++] = CPTOPS(cp->cp_id);
1071 			}
1072 			cp = cp->cp_next;
1073 		} while (cp != cp_list_head);
1074 	}
1075 
1076 	ASSERT(numpart < cp_numparts);
1077 
1078 	if (flag == CP_ALL)
1079 		numpart = cp_numparts - 1; /* leave out default partition */
1080 	else if (flag == CP_NONEMPTY)
1081 		numpart = cp_numparts_nonempty;
1082 
1083 	return (numpart);
1084 }
1085 
1086 int
1087 cpupart_setattr(psetid_t psid, uint_t attr)
1088 {
1089 	cpupart_t *cp;
1090 
1091 	ASSERT(pool_lock_held());
1092 
1093 	mutex_enter(&cpu_lock);
1094 	if ((cp = cpupart_find(psid)) == NULL) {
1095 		mutex_exit(&cpu_lock);
1096 		return (EINVAL);
1097 	}
1098 	/*
1099 	 * PSET_NOESCAPE attribute for default cpu partition is always set
1100 	 */
1101 	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1102 		mutex_exit(&cpu_lock);
1103 		return (EINVAL);
1104 	}
1105 	cp->cp_attr = attr;
1106 	mutex_exit(&cpu_lock);
1107 	return (0);
1108 }
1109 
1110 int
1111 cpupart_getattr(psetid_t psid, uint_t *attrp)
1112 {
1113 	cpupart_t *cp;
1114 
1115 	mutex_enter(&cpu_lock);
1116 	if ((cp = cpupart_find(psid)) == NULL) {
1117 		mutex_exit(&cpu_lock);
1118 		return (EINVAL);
1119 	}
1120 	*attrp = cp->cp_attr;
1121 	mutex_exit(&cpu_lock);
1122 	return (0);
1123 }
1124