xref: /illumos-gate/usr/src/uts/common/disp/cpupart.c (revision a6bde1a23b60f140c7ed78df979c2e22b1ed9b2c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/systm.h>
27 #include <sys/cmn_err.h>
28 #include <sys/cpuvar.h>
29 #include <sys/thread.h>
30 #include <sys/disp.h>
31 #include <sys/kmem.h>
32 #include <sys/debug.h>
33 #include <sys/cpupart.h>
34 #include <sys/pset.h>
35 #include <sys/var.h>
36 #include <sys/cyclic.h>
37 #include <sys/lgrp.h>
38 #include <sys/pghw.h>
39 #include <sys/loadavg.h>
40 #include <sys/class.h>
41 #include <sys/fss.h>
42 #include <sys/pool.h>
43 #include <sys/pool_pset.h>
44 #include <sys/policy.h>
45 
46 /*
47  * Calling pool_lock() protects the pools configuration, which includes
48  * CPU partitions.  cpu_lock protects the CPU partition list, and prevents
49  * partitions from being created or destroyed while the lock is held.
50  * The lock ordering with respect to related locks is:
51  *
52  *    pool_lock() ---> cpu_lock  --->  pidlock  -->  p_lock
53  *
54  * Blocking memory allocations may be made while holding "pool_lock"
55  * or cpu_lock.
56  */
57 
58 /*
59  * The cp_default partition is allocated statically, but its lgroup load average
60  * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
61  * saves some memory since the space allocated reflects the actual number of
62  * lgroups supported by the platform. The lgrp facility provides a temporary
63  * space to hold lpl information during system bootstrap.
64  */
65 
/* Head of the circular, doubly-linked list of CPU partitions. */
66 cpupart_t		*cp_list_head;
/* The default partition; statically allocated (see comment above). */
67 cpupart_t		cp_default;
/* Partition ID at which cpupart_create() starts looking for a free ID. */
68 static cpupartid_t	cp_id_next;
/* Number of partitions on the list, including cp_default. */
69 uint_t			cp_numparts;
/* Number of partitions that currently contain at least one CPU. */
70 uint_t			cp_numparts_nonempty;
71 
72 /*
73  * Need to limit total number of partitions to avoid slowing down the
74  * clock code too much.  The clock code traverses the list of
75  * partitions and needs to be able to execute in a reasonable amount
76  * of time (less than 1/hz seconds).  The maximum is sized based on
77  * max_ncpus so it shouldn't be a problem unless there are large
78  * numbers of empty partitions.
 *
 * A value of zero means "not tuned"; cpupart_initialize_default() then
 * sets it to max_ncpus * 2 + 1.
79  */
80 static uint_t		cp_max_numparts;
81 
82 /*
83  * Processor sets and CPU partitions are different but related concepts.
84  * A processor set is a user-level abstraction allowing users to create
85  * sets of CPUs and bind threads exclusively to those sets.  A CPU
86  * partition is a kernel dispatcher object consisting of a set of CPUs
87  * and a global dispatch queue.  The processor set abstraction is
88  * implemented via a CPU partition, and currently there is a 1-1
89  * mapping between processor sets and partitions (excluding the default
90  * partition, which is not visible as a processor set).  Hence, the
91  * numbering for processor sets and CPU partitions is identical.  This
92  * may not always be true in the future, and these macros could become
93  * less trivial if we support e.g. a processor set containing multiple
94  * CPU partitions.
95  */
/* Processor set ID -> partition ID: PS_NONE maps to CP_DEFAULT. */
96 #define	PSTOCP(psid)	((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* Partition ID -> processor set ID: CP_DEFAULT maps to PS_NONE. */
97 #define	CPTOPS(cpid)	((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))
98 
/* Forward declaration: cpupart_move_cpu() calls this before its definition. */
99 static int cpupart_unbind_threads(cpupart_t *, boolean_t);
100 
101 /*
102  * Find a CPU partition given a processor set ID.
103  */
104 static cpupart_t *
105 cpupart_find_all(psetid_t psid)
106 {
107 	cpupart_t *cp;
108 	cpupartid_t cpid = PSTOCP(psid);
109 
110 	ASSERT(MUTEX_HELD(&cpu_lock));
111 
112 	/* default partition not visible as a processor set */
113 	if (psid == CP_DEFAULT)
114 		return (NULL);
115 
116 	if (psid == PS_MYID)
117 		return (curthread->t_cpupart);
118 
119 	cp = cp_list_head;
120 	do {
121 		if (cp->cp_id == cpid)
122 			return (cp);
123 		cp = cp->cp_next;
124 	} while (cp != cp_list_head);
125 	return (NULL);
126 }
127 
128 /*
129  * Find a CPU partition given a processor set ID if the processor set
130  * should be visible from the calling zone.
131  */
132 cpupart_t *
133 cpupart_find(psetid_t psid)
134 {
135 	cpupart_t *cp;
136 
137 	ASSERT(MUTEX_HELD(&cpu_lock));
138 	cp = cpupart_find_all(psid);
139 	if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
140 	    zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
141 			return (NULL);
142 	return (cp);
143 }
144 
145 static int
146 cpupart_kstat_update(kstat_t *ksp, int rw)
147 {
148 	cpupart_t *cp = (cpupart_t *)ksp->ks_private;
149 	cpupart_kstat_t *cpksp = ksp->ks_data;
150 
151 	if (rw == KSTAT_WRITE)
152 		return (EACCES);
153 
154 	cpksp->cpk_updates.value.ui64 = cp->cp_updates;
155 	cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
156 	cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
157 	cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
158 	cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
159 	    (16 - FSHIFT);
160 	cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
161 	    (16 - FSHIFT);
162 	cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
163 	    (16 - FSHIFT);
164 	return (0);
165 }
166 
167 static void
168 cpupart_kstat_create(cpupart_t *cp)
169 {
170 	kstat_t *ksp;
171 	zoneid_t zoneid;
172 
173 	ASSERT(MUTEX_HELD(&cpu_lock));
174 
175 	/*
176 	 * We have a bit of a chicken-egg problem since this code will
177 	 * get called to create the kstats for CP_DEFAULT before the
178 	 * pools framework gets initialized.  We circumvent the problem
179 	 * by special-casing cp_default.
180 	 */
181 	if (cp != &cp_default && pool_pset_enabled())
182 		zoneid = GLOBAL_ZONEID;
183 	else
184 		zoneid = ALL_ZONES;
185 	ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
186 	    KSTAT_TYPE_NAMED,
187 	    sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
188 	if (ksp != NULL) {
189 		cpupart_kstat_t *cpksp = ksp->ks_data;
190 
191 		kstat_named_init(&cpksp->cpk_updates, "updates",
192 		    KSTAT_DATA_UINT64);
193 		kstat_named_init(&cpksp->cpk_runnable, "runnable",
194 		    KSTAT_DATA_UINT64);
195 		kstat_named_init(&cpksp->cpk_waiting, "waiting",
196 		    KSTAT_DATA_UINT64);
197 		kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
198 		    KSTAT_DATA_UINT32);
199 		kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
200 		    KSTAT_DATA_UINT32);
201 		kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
202 		    KSTAT_DATA_UINT32);
203 		kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
204 		    KSTAT_DATA_UINT32);
205 
206 		ksp->ks_update = cpupart_kstat_update;
207 		ksp->ks_private = cp;
208 
209 		kstat_install(ksp);
210 	}
211 	cp->cp_kstat = ksp;
212 }
213 
214 /*
215  * Initialize the cpupart's lgrp partions (lpls)
216  */
217 static void
218 cpupart_lpl_initialize(cpupart_t *cp)
219 {
220 	int i, sz;
221 
222 	sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
223 	cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
224 
225 	for (i = 0; i < sz; i++) {
226 		/*
227 		 * The last entry of the lpl's resource set is always NULL
228 		 * by design (to facilitate iteration)...hence the "oversizing"
229 		 * by 1.
230 		 */
231 		cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
232 		cp->cp_lgrploads[i].lpl_rset =
233 		    kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
234 		cp->cp_lgrploads[i].lpl_id2rset =
235 		    kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
236 		cp->cp_lgrploads[i].lpl_lgrpid = i;
237 	}
238 }
239 
240 /*
241  * Teardown the cpupart's lgrp partitions
242  */
243 static void
244 cpupart_lpl_teardown(cpupart_t *cp)
245 {
246 	int i, sz;
247 	lpl_t *lpl;
248 
249 	for (i = 0; i < cp->cp_nlgrploads; i++) {
250 		lpl = &cp->cp_lgrploads[i];
251 
252 		sz = lpl->lpl_rset_sz;
253 		kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
254 		kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
255 		lpl->lpl_rset = NULL;
256 		lpl->lpl_id2rset = NULL;
257 	}
258 	kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
259 	cp->cp_lgrploads = NULL;
260 }
261 
262 /*
263  * Initialize the default partition and kpreempt disp queue.
264  */
265 void
266 cpupart_initialize_default(void)
267 {
268 	lgrp_id_t i;
269 
270 	cp_list_head = &cp_default;
271 	cp_default.cp_next = &cp_default;
272 	cp_default.cp_prev = &cp_default;
273 	cp_default.cp_id = CP_DEFAULT;
274 	cp_default.cp_kp_queue.disp_maxrunpri = -1;
275 	cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
276 	cp_default.cp_kp_queue.disp_cpu = NULL;
277 	cp_default.cp_gen = 0;
278 	cp_default.cp_loadavg.lg_cur = 0;
279 	cp_default.cp_loadavg.lg_len = 0;
280 	cp_default.cp_loadavg.lg_total = 0;
281 	for (i = 0; i < S_LOADAVG_SZ; i++) {
282 		cp_default.cp_loadavg.lg_loads[i] = 0;
283 	}
284 	DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
285 	cp_id_next = CP_DEFAULT + 1;
286 	cpupart_kstat_create(&cp_default);
287 	cp_numparts = 1;
288 	if (cp_max_numparts == 0)	/* allow for /etc/system tuning */
289 		cp_max_numparts = max_ncpus * 2 + 1;
290 	/*
291 	 * Allocate space for cp_default list of lgrploads
292 	 */
293 	cpupart_lpl_initialize(&cp_default);
294 
295 	/*
296 	 * The initial lpl topology is created in a special lpl list
297 	 * lpl_bootstrap. It should be copied to cp_default.
298 	 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
299 	 *	 to the correct lpl in the cp_default.cp_lgrploads list.
300 	 */
301 	lpl_topo_bootstrap(cp_default.cp_lgrploads,
302 	    cp_default.cp_nlgrploads);
303 
304 
305 	cp_default.cp_attr = PSET_NOESCAPE;
306 	cp_numparts_nonempty = 1;
307 	/*
308 	 * Set t0's home
309 	 */
310 	t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
311 
312 	bitset_init(&cp_default.cp_cmt_pgs);
313 	bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);
314 
315 	bitset_resize(&cp_default.cp_haltset, max_ncpus);
316 }
317 
318 
/*
 * Move CPU `cp' out of its current partition (cp->cpu_part) and into
 * `newpp'.  The caller must hold cpu_lock.
 *
 * If `forced' is non-zero, all threads bound to the CPU are unbound;
 * otherwise only soft-bound threads are.
 *
 * Returns 0 on success (including the no-op case where the CPU is
 * already in `newpp'), EBUSY if threads remain bound to the partition or
 * CPU or if cyclic_move_out() refuses the move, or the error returned by
 * cpu_unbind().  On every failure path taken after the CPU_CPUPART_OUT
 * notification, a matching CPU_CPUPART_IN notification is sent.
 */
319 static int
320 cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
321 {
322 	cpupart_t *oldpp;
323 	cpu_t	*ncp, *newlist;
324 	kthread_t *t;
325 	int	move_threads = 1;
326 	lgrp_id_t lgrpid;
327 	proc_t 	*p;
328 	int lgrp_diff_lpl;
329 	lpl_t	*cpu_lpl;
330 	int	ret;
331 	boolean_t unbind_all_threads = (forced != 0);
332 
333 	ASSERT(MUTEX_HELD(&cpu_lock));
334 	ASSERT(newpp != NULL);
335 
336 	oldpp = cp->cpu_part;
337 	ASSERT(oldpp != NULL);
338 	ASSERT(oldpp->cp_ncpus > 0);
339 
340 	if (newpp == oldpp) {
341 		/*
342 		 * Don't need to do anything.
343 		 */
344 		return (0);
345 	}
346 
347 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);
348 
349 	if (!disp_bound_partition(cp, 0)) {
350 		/*
351 		 * Don't need to move threads if there are no threads in
352 		 * the partition.  Note that threads can't enter the
353 		 * partition while we're holding cpu_lock.
354 		 */
355 		move_threads = 0;
356 	} else if (oldpp->cp_ncpus == 1) {
357 		/*
358 		 * The last CPU is removed from a partition which has threads
359 		 * running in it. Some of these threads may be bound to this
360 		 * CPU.
361 		 *
362 		 * Attempt to unbind threads from the CPU and from the processor
363 		 * set. Note that no threads should be bound to this CPU since
364 		 * cpupart_move_threads will refuse to move bound threads to
365 		 * other CPUs.
366 		 */
367 		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
368 		(void) cpupart_unbind_threads(oldpp, B_FALSE);
369 
370 		if (!disp_bound_partition(cp, 0)) {
371 			/*
372 			 * No bound threads in this partition any more
373 			 */
374 			move_threads = 0;
375 		} else {
376 			/*
377 			 * There are still threads bound to the partition
378 			 */
379 			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
380 			return (EBUSY);
381 		}
382 	}
383 
384 	/*
385 	 * If forced flag is set unbind any threads from this CPU.
386 	 * Otherwise unbind soft-bound threads only.
387 	 */
388 	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
389 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
390 		return (ret);
391 	}
392 
393 	/*
394 	 * Stop further threads weak binding to this cpu.
395 	 */
396 	cpu_inmotion = cp;
397 	membar_enter();
398 
399 	/*
400 	 * Notify the Processor Groups subsystem that the CPU
401 	 * will be moving cpu partitions. This is done before
402 	 * CPUs are paused to provide an opportunity for any
403 	 * needed memory allocations.
404 	 */
405 	pg_cpupart_out(cp, oldpp);
406 	pg_cpupart_in(cp, newpp);
407 
408 again:
409 	if (move_threads) {
410 		int loop_count;
411 		/*
412 		 * Check for threads strong or weak bound to this CPU.
413 		 */
		/* Poll up to five times, one tick apart, before giving up. */
414 		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
415 			if (loop_count >= 5) {
				/* Roll back the PG notifications made above. */
416 				cpu_state_change_notify(cp->cpu_id,
417 				    CPU_CPUPART_IN);
418 				pg_cpupart_out(cp, newpp);
419 				pg_cpupart_in(cp, oldpp);
420 				cpu_inmotion = NULL;
421 				return (EBUSY);	/* some threads still bound */
422 			}
423 			delay(1);
424 		}
425 	}
426 
427 	/*
428 	 * Before we actually start changing data structures, notify
429 	 * the cyclic subsystem that we want to move this CPU out of its
430 	 * partition.
431 	 */
432 	if (!cyclic_move_out(cp)) {
433 		/*
434 		 * This CPU must be the last CPU in a processor set with
435 		 * a bound cyclic.
436 		 */
437 		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
438 		pg_cpupart_out(cp, newpp);
439 		pg_cpupart_in(cp, oldpp);
440 		cpu_inmotion = NULL;
441 		return (EBUSY);
442 	}
443 
444 	pause_cpus(cp, NULL);
445 
446 	if (move_threads) {
447 		/*
448 		 * The thread on cpu before the pause thread may have read
449 		 * cpu_inmotion before we raised the barrier above.  Check
450 		 * again.
451 		 */
452 		if (disp_bound_threads(cp, 1)) {
453 			start_cpus();
			/*
			 * NOTE(review): cyclic_move_out() has already
			 * succeeded at this point and is called again after
			 * the retry -- confirm a repeated call is safe.
			 */
454 			goto again;
455 		}
456 
457 	}
458 
459 	/*
460 	 * Now that CPUs are paused, let the PG subsystem perform
461 	 * any necessary data structure updates.
462 	 */
463 	pg_cpupart_move(cp, oldpp, newpp);
464 
465 	/* save this cpu's lgroup -- it'll be the same in the new partition */
466 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
467 
	/* remember the lpl the CPU had in the old partition */
468 	cpu_lpl = cp->cpu_lpl;
469 	/*
470 	 * let the lgroup framework know cp has left the partition
471 	 */
472 	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);
473 
474 	/* move out of old partition */
475 	oldpp->cp_ncpus--;
476 	if (oldpp->cp_ncpus > 0) {
477 
		/* unlink cp; ncp is the CPU that followed it in the old list */
478 		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
479 		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
480 		if (oldpp->cp_cpulist == cp) {
481 			oldpp->cp_cpulist = ncp;
482 		}
483 	} else {
484 		ncp = oldpp->cp_cpulist = NULL;
485 		cp_numparts_nonempty--;
486 		ASSERT(cp_numparts_nonempty != 0);
487 	}
488 	oldpp->cp_gen++;
489 
490 	/* move into new partition */
491 	newlist = newpp->cp_cpulist;
492 	if (newlist == NULL) {
493 		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
494 		cp_numparts_nonempty++;
495 		ASSERT(cp_numparts_nonempty != 0);
496 	} else {
		/* insert cp just before the current head, i.e. at the tail */
497 		cp->cpu_next_part = newlist;
498 		cp->cpu_prev_part = newlist->cpu_prev_part;
499 		newlist->cpu_prev_part->cpu_next_part = cp;
500 		newlist->cpu_prev_part = cp;
501 	}
502 	cp->cpu_part = newpp;
503 	newpp->cp_ncpus++;
504 	newpp->cp_gen++;
505 
506 	ASSERT(bitset_is_null(&newpp->cp_haltset));
507 	ASSERT(bitset_is_null(&oldpp->cp_haltset));
508 
509 	/*
510 	 * let the lgroup framework know cp has entered the partition
511 	 */
512 	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);
513 
514 	/*
515 	 * If necessary, move threads off processor.
516 	 */
517 	if (move_threads) {
518 		ASSERT(ncp != NULL);
519 
520 		/*
521 		 * Walk thru the active process list to look for
522 		 * threads that need to have a new home lgroup,
523 		 * or the last CPU they run on is the same CPU
524 		 * being moved out of the partition.
525 		 */
526 
527 		for (p = practive; p != NULL; p = p->p_next) {
528 
529 			t = p->p_tlist;
530 
531 			if (t == NULL)
532 				continue;
533 
534 			lgrp_diff_lpl = 0;
535 
536 			do {
537 
538 				ASSERT(t->t_lpl != NULL);
539 
540 				/*
541 				 * Update the count of how many threads are
542 				 * in this CPU's lgroup but have a different lpl
543 				 */
544 
545 				if (t->t_lpl != cpu_lpl &&
546 				    t->t_lpl->lpl_lgrpid == lgrpid)
547 					lgrp_diff_lpl++;
548 				/*
549 				 * If the lgroup that t is assigned to no
550 				 * longer has any CPUs in t's partition,
551 				 * we'll have to choose a new lgroup for t.
552 				 */
553 
554 				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
555 				    t->t_cpupart)) {
556 					lgrp_move_thread(t,
557 					    lgrp_choose(t, t->t_cpupart), 0);
558 				}
559 
560 				/*
561 				 * make sure lpl points to our own partition
562 				 */
563 				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
564 				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
565 				    t->t_cpupart->cp_nlgrploads));
566 
567 				ASSERT(t->t_lpl->lpl_ncpu > 0);
568 
569 				/* Update CPU last ran on if it was this CPU */
570 				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
571 				    t->t_bound_cpu != cp) {
572 					t->t_cpu = disp_lowpri_cpu(ncp,
573 					    t->t_lpl, t->t_pri, NULL);
574 				}
575 				t = t->t_forw;
576 			} while (t != p->p_tlist);
577 
578 			/*
579 			 * Didn't find any threads in the same lgroup as this
580 			 * CPU with a different lpl, so remove the lgroup from
581 			 * the process lgroup bitmask.
582 			 */
583 
			/*
			 * NOTE(review): the comment above describes the case
			 * where no such threads were found, yet this test is
			 * true when lgrp_diff_lpl is non-zero, i.e. when such
			 * threads WERE found.  Confirm which is intended.
			 */
584 			if (lgrp_diff_lpl)
585 				klgrpset_del(p->p_lgrpset, lgrpid);
586 		}
587 
588 		/*
589 		 * Walk thread list looking for threads that need to be
590 		 * rehomed, since there are some threads that are not in
591 		 * their process's p_tlist.
592 		 */
593 
594 		t = curthread;
595 
596 		do {
597 			ASSERT(t != NULL && t->t_lpl != NULL);
598 
599 			/*
600 			 * If the lgroup that t is assigned to no
601 			 * longer has any CPUs in t's partition,
602 			 * we'll have to choose a new lgroup for t.
603 			 * Also, choose best lgroup for home when
604 			 * thread has specified lgroup affinities,
605 			 * since there may be an lgroup with more
606 			 * affinity available after moving CPUs
607 			 * around.
608 			 */
609 			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
610 			    t->t_cpupart) || t->t_lgrp_affinity) {
611 				lgrp_move_thread(t,
612 				    lgrp_choose(t, t->t_cpupart), 1);
613 			}
614 
615 			/* make sure lpl points to our own partition */
616 			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
617 			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
618 			    t->t_cpupart->cp_nlgrploads));
619 
620 			ASSERT(t->t_lpl->lpl_ncpu > 0);
621 
622 			/* Update CPU last ran on if it was this CPU */
623 			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
624 			    t->t_bound_cpu != cp) {
625 				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
626 				    t->t_pri, NULL);
627 			}
628 
629 			t = t->t_next;
630 		} while (t != curthread);
631 
632 		/*
633 		 * Clear off the CPU's run queue, and the kp queue if the
634 		 * partition is now empty.
635 		 */
636 		disp_cpu_inactive(cp);
637 
638 		/*
639 		 * Make cp switch to a thread from the new partition.
640 		 */
641 		cp->cpu_runrun = 1;
642 		cp->cpu_kprunrun = 1;
643 	}
644 
645 	cpu_inmotion = NULL;
646 	start_cpus();
647 
648 	/*
649 	 * Let anyone interested know that cpu has been added to the set.
650 	 */
651 	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
652 
653 	/*
654 	 * Now let the cyclic subsystem know that it can reshuffle cyclics
655 	 * bound to the new processor set.
656 	 */
657 	cyclic_move_in(cp);
658 
659 	return (0);
660 }
661 
662 /*
663  * Check if thread can be moved to a new cpu partition.  Called by
664  * cpupart_move_thread() and pset_bind_start().
665  */
666 int
667 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
668 {
669 	ASSERT(MUTEX_HELD(&cpu_lock));
670 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
671 	ASSERT(cp != NULL);
672 	ASSERT(THREAD_LOCK_HELD(tp));
673 
674 	/*
675 	 * CPU-bound threads can't be moved.
676 	 */
677 	if (!ignore) {
678 		cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
679 		    tp->t_weakbound_cpu;
680 		if (boundcpu != NULL && boundcpu->cpu_part != cp)
681 			return (EBUSY);
682 	}
683 
684 	if (tp->t_cid == sysdccid) {
685 		return (EINVAL);	/* For now, sysdc threads can't move */
686 	}
687 
688 	return (0);
689 }
690 
691 /*
692  * Move thread to new partition.  If ignore is non-zero, then CPU
693  * bindings should be ignored (this is used when destroying a
694  * partition).
695  */
696 static int
697 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
698     void *projbuf, void *zonebuf)
699 {
700 	cpupart_t *oldpp = tp->t_cpupart;
701 	int ret;
702 
703 	ASSERT(MUTEX_HELD(&cpu_lock));
704 	ASSERT(MUTEX_HELD(&pidlock));
705 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
706 	ASSERT(newpp != NULL);
707 
708 	if (newpp->cp_cpulist == NULL)
709 		return (EINVAL);
710 
711 	/*
712 	 * Check for errors first.
713 	 */
714 	thread_lock(tp);
715 	if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
716 		thread_unlock(tp);
717 		return (ret);
718 	}
719 
720 	/* move the thread */
721 	if (oldpp != newpp) {
722 		/*
723 		 * Make the thread switch to the new partition.
724 		 */
725 		tp->t_cpupart = newpp;
726 		ASSERT(tp->t_lpl != NULL);
727 		/*
728 		 * Leave the thread on the same lgroup if possible; otherwise
729 		 * choose a new lgroup for it.  In either case, update its
730 		 * t_lpl.
731 		 */
732 		if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
733 		    tp->t_lgrp_affinity == NULL) {
734 			/*
735 			 * The thread's lgroup has CPUs in the thread's new
736 			 * partition, so the thread can stay assigned to the
737 			 * same lgroup.  Update its t_lpl to point to the
738 			 * lpl_t for its lgroup in its new partition.
739 			 */
740 			lgrp_move_thread(tp, &tp->t_cpupart->\
741 			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
742 		} else {
743 			/*
744 			 * The thread's lgroup has no cpus in its new
745 			 * partition or it has specified lgroup affinities,
746 			 * so choose the best lgroup for the thread and
747 			 * assign it to that lgroup.
748 			 */
749 			lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
750 			    1);
751 		}
752 		/*
753 		 * make sure lpl points to our own partition
754 		 */
755 		ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
756 		    (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
757 		    tp->t_cpupart->cp_nlgrploads));
758 
759 		ASSERT(tp->t_lpl->lpl_ncpu > 0);
760 
761 		if (tp->t_state == TS_ONPROC) {
762 			cpu_surrender(tp);
763 		} else if (tp->t_state == TS_RUN) {
764 			(void) dispdeq(tp);
765 			setbackdq(tp);
766 		}
767 	}
768 
769 	/*
770 	 * Our binding has changed; set TP_CHANGEBIND.
771 	 */
772 	tp->t_proc_flag |= TP_CHANGEBIND;
773 	aston(tp);
774 
775 	thread_unlock(tp);
776 	fss_changepset(tp, newpp, projbuf, zonebuf);
777 
778 	return (0);		/* success */
779 }
780 
781 
782 /*
783  * This function binds a thread to a partition.  Must be called with the
784  * p_lock of the containing process held (to keep the thread from going
785  * away), and thus also with cpu_lock held (since cpu_lock must be
786  * acquired before p_lock).  If ignore is non-zero, then CPU bindings
787  * should be ignored (this is used when destroying a partition).
788  */
789 int
790 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
791     void *zonebuf)
792 {
793 	cpupart_t	*newpp;
794 
795 	ASSERT(pool_lock_held());
796 	ASSERT(MUTEX_HELD(&cpu_lock));
797 	ASSERT(MUTEX_HELD(&pidlock));
798 	ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
799 
800 	if (psid == PS_NONE)
801 		newpp = &cp_default;
802 	else {
803 		newpp = cpupart_find(psid);
804 		if (newpp == NULL) {
805 			return (EINVAL);
806 		}
807 	}
808 	return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
809 }
810 
811 
812 /*
813  * Create a new partition.  On MP systems, this also allocates a
814  * kpreempt disp queue for that partition.
815  */
816 int
817 cpupart_create(psetid_t *psid)
818 {
819 	cpupart_t	*pp;
820 
821 	ASSERT(pool_lock_held());
822 
823 	pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
824 	pp->cp_nlgrploads = lgrp_plat_max_lgrps();
825 	pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
826 	    KM_SLEEP);
827 
828 	mutex_enter(&cpu_lock);
829 	if (cp_numparts == cp_max_numparts) {
830 		mutex_exit(&cpu_lock);
831 		kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
832 		pp->cp_lgrploads = NULL;
833 		kmem_free(pp, sizeof (cpupart_t));
834 		return (ENOMEM);
835 	}
836 	cp_numparts++;
837 	/* find the next free partition ID */
838 	while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
839 		cp_id_next++;
840 	pp->cp_id = cp_id_next++;
841 	pp->cp_ncpus = 0;
842 	pp->cp_cpulist = NULL;
843 	pp->cp_attr = 0;
844 	klgrpset_clear(pp->cp_lgrpset);
845 	pp->cp_kp_queue.disp_maxrunpri = -1;
846 	pp->cp_kp_queue.disp_max_unbound_pri = -1;
847 	pp->cp_kp_queue.disp_cpu = NULL;
848 	pp->cp_gen = 0;
849 	DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
850 	*psid = CPTOPS(pp->cp_id);
851 	disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
852 	cpupart_kstat_create(pp);
853 	cpupart_lpl_initialize(pp);
854 
855 	bitset_init(&pp->cp_cmt_pgs);
856 
857 	/*
858 	 * Initialize and size the partition's bitset of halted CPUs.
859 	 */
860 	bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
861 	bitset_resize(&pp->cp_haltset, max_ncpus);
862 
863 	/*
864 	 * Pause all CPUs while changing the partition list, to make sure
865 	 * the clock thread (which traverses the list without holding
866 	 * cpu_lock) isn't running.
867 	 */
868 	pause_cpus(NULL, NULL);
869 	pp->cp_next = cp_list_head;
870 	pp->cp_prev = cp_list_head->cp_prev;
871 	cp_list_head->cp_prev->cp_next = pp;
872 	cp_list_head->cp_prev = pp;
873 	start_cpus();
874 	mutex_exit(&cpu_lock);
875 
876 	return (0);
877 }
878 
879 /*
880  * Move threads from specified partition to cp_default. If `force' is specified,
881  * move all threads, otherwise move only soft-bound threads.
882  */
883 static int
884 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
885 {
886 	void 	*projbuf, *zonebuf;
887 	kthread_t *t;
888 	proc_t	*p;
889 	int	err = 0;
890 	psetid_t psid = pp->cp_id;
891 
892 	ASSERT(pool_lock_held());
893 	ASSERT(MUTEX_HELD(&cpu_lock));
894 
895 	if (pp == NULL || pp == &cp_default) {
896 		return (EINVAL);
897 	}
898 
899 	/*
900 	 * Pre-allocate enough buffers for FSS for all active projects and
901 	 * for all active zones on the system.  Unused buffers will be
902 	 * freed later by fss_freebuf().
903 	 */
904 	projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
905 	zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
906 
907 	mutex_enter(&pidlock);
908 	t = curthread;
909 	do {
910 		if (t->t_bind_pset == psid) {
911 again:			p = ttoproc(t);
912 			mutex_enter(&p->p_lock);
913 			if (ttoproc(t) != p) {
914 				/*
915 				 * lwp_exit has changed this thread's process
916 				 * pointer before we grabbed its p_lock.
917 				 */
918 				mutex_exit(&p->p_lock);
919 				goto again;
920 			}
921 
922 			/*
923 			 * Can only unbind threads which have revocable binding
924 			 * unless force unbinding requested.
925 			 */
926 			if (unbind_all || TB_PSET_IS_SOFT(t)) {
927 				err = cpupart_bind_thread(t, PS_NONE, 1,
928 				    projbuf, zonebuf);
929 				if (err) {
930 					mutex_exit(&p->p_lock);
931 					mutex_exit(&pidlock);
932 					fss_freebuf(projbuf, FSS_ALLOC_PROJ);
933 					fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
934 					return (err);
935 				}
936 				t->t_bind_pset = PS_NONE;
937 			}
938 			mutex_exit(&p->p_lock);
939 		}
940 		t = t->t_next;
941 	} while (t != curthread);
942 
943 	mutex_exit(&pidlock);
944 	fss_freebuf(projbuf, FSS_ALLOC_PROJ);
945 	fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
946 	return (err);
947 }
948 
949 /*
950  * Destroy a partition.
951  */
952 int
953 cpupart_destroy(psetid_t psid)
954 {
955 	cpu_t	*cp, *first_cp;
956 	cpupart_t *pp, *newpp;
957 	int	err = 0;
958 
959 	ASSERT(pool_lock_held());
960 	mutex_enter(&cpu_lock);
961 
962 	pp = cpupart_find(psid);
963 	if (pp == NULL || pp == &cp_default) {
964 		mutex_exit(&cpu_lock);
965 		return (EINVAL);
966 	}
967 
968 	/*
969 	 * Unbind all the threads currently bound to the partition.
970 	 */
971 	err = cpupart_unbind_threads(pp, B_TRUE);
972 	if (err) {
973 		mutex_exit(&cpu_lock);
974 		return (err);
975 	}
976 
977 	newpp = &cp_default;
978 	while ((cp = pp->cp_cpulist) != NULL) {
979 		if (err = cpupart_move_cpu(cp, newpp, 0)) {
980 			mutex_exit(&cpu_lock);
981 			return (err);
982 		}
983 	}
984 
985 	ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
986 	ASSERT(bitset_is_null(&pp->cp_haltset));
987 
988 	/*
989 	 * Teardown the partition's group of active CMT PGs and halted
990 	 * CPUs now that they have all left.
991 	 */
992 	bitset_fini(&pp->cp_cmt_pgs);
993 	bitset_fini(&pp->cp_haltset);
994 
995 	/*
996 	 * Reset the pointers in any offline processors so they won't
997 	 * try to rejoin the destroyed partition when they're turned
998 	 * online.
999 	 */
1000 	first_cp = cp = CPU;
1001 	do {
1002 		if (cp->cpu_part == pp) {
1003 			ASSERT(cp->cpu_flags & CPU_OFFLINE);
1004 			cp->cpu_part = newpp;
1005 		}
1006 		cp = cp->cpu_next;
1007 	} while (cp != first_cp);
1008 
1009 	/*
1010 	 * Pause all CPUs while changing the partition list, to make sure
1011 	 * the clock thread (which traverses the list without holding
1012 	 * cpu_lock) isn't running.
1013 	 */
1014 	pause_cpus(NULL, NULL);
1015 	pp->cp_prev->cp_next = pp->cp_next;
1016 	pp->cp_next->cp_prev = pp->cp_prev;
1017 	if (cp_list_head == pp)
1018 		cp_list_head = pp->cp_next;
1019 	start_cpus();
1020 
1021 	if (cp_id_next > pp->cp_id)
1022 		cp_id_next = pp->cp_id;
1023 
1024 	if (pp->cp_kstat)
1025 		kstat_delete(pp->cp_kstat);
1026 
1027 	cp_numparts--;
1028 
1029 	disp_kp_free(&pp->cp_kp_queue);
1030 
1031 	cpupart_lpl_teardown(pp);
1032 
1033 	kmem_free(pp, sizeof (cpupart_t));
1034 	mutex_exit(&cpu_lock);
1035 
1036 	return (err);
1037 }
1038 
1039 
1040 /*
1041  * Return the ID of the partition to which the specified processor belongs.
1042  */
1043 psetid_t
1044 cpupart_query_cpu(cpu_t *cp)
1045 {
1046 	ASSERT(MUTEX_HELD(&cpu_lock));
1047 
1048 	return (CPTOPS(cp->cpu_part->cp_id));
1049 }
1050 
1051 
1052 /*
1053  * Attach a processor to an existing partition.
1054  */
1055 int
1056 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1057 {
1058 	cpupart_t	*pp;
1059 	int		err;
1060 
1061 	ASSERT(pool_lock_held());
1062 	ASSERT(MUTEX_HELD(&cpu_lock));
1063 
1064 	pp = cpupart_find(psid);
1065 	if (pp == NULL)
1066 		return (EINVAL);
1067 	if (cp->cpu_flags & CPU_OFFLINE)
1068 		return (EINVAL);
1069 
1070 	err = cpupart_move_cpu(cp, pp, forced);
1071 	return (err);
1072 }
1073 
1074 /*
1075  * Get a list of cpus belonging to the partition.  If numcpus is NULL,
1076  * this just checks for a valid partition.  If numcpus is non-NULL but
1077  * cpulist is NULL, the current number of cpus is stored in *numcpus.
1078  * If both are non-NULL, the current number of cpus is stored in *numcpus,
1079  * and a list of those cpus up to the size originally in *numcpus is
1080  * stored in cpulist[].  Also, store the processor set id in *psid.
1081  * This is useful in case the processor set id passed in was PS_MYID.
1082  */
1083 int
1084 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1085 {
1086 	cpupart_t	*pp;
1087 	uint_t		ncpus;
1088 	cpu_t		*c;
1089 	int		i;
1090 
1091 	mutex_enter(&cpu_lock);
1092 	pp = cpupart_find(*psid);
1093 	if (pp == NULL) {
1094 		mutex_exit(&cpu_lock);
1095 		return (EINVAL);
1096 	}
1097 	*psid = CPTOPS(pp->cp_id);
1098 	ncpus = pp->cp_ncpus;
1099 	if (numcpus) {
1100 		if (ncpus > *numcpus) {
1101 			/*
1102 			 * Only copy as many cpus as were passed in, but
1103 			 * pass back the real number.
1104 			 */
1105 			uint_t t = ncpus;
1106 			ncpus = *numcpus;
1107 			*numcpus = t;
1108 		} else
1109 			*numcpus = ncpus;
1110 
1111 		if (cpulist) {
1112 			c = pp->cp_cpulist;
1113 			for (i = 0; i < ncpus; i++) {
1114 				ASSERT(c != NULL);
1115 				cpulist[i] = c->cpu_id;
1116 				c = c->cpu_next_part;
1117 			}
1118 		}
1119 	}
1120 	mutex_exit(&cpu_lock);
1121 	return (0);
1122 }
1123 
1124 /*
1125  * Reallocate kpreempt queues for each CPU partition.  Called from
1126  * disp_setup when a new scheduling class is loaded that increases the
1127  * number of priorities in the system.
1128  */
1129 void
1130 cpupart_kpqalloc(pri_t npri)
1131 {
1132 	cpupart_t *cpp;
1133 
1134 	ASSERT(MUTEX_HELD(&cpu_lock));
1135 	cpp = cp_list_head;
1136 	do {
1137 		disp_kp_alloc(&cpp->cp_kp_queue, npri);
1138 		cpp = cpp->cp_next;
1139 	} while (cpp != cp_list_head);
1140 }
1141 
1142 int
1143 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1144 {
1145 	cpupart_t *cp;
1146 	int i;
1147 
1148 	ASSERT(nelem >= 0);
1149 	ASSERT(nelem <= LOADAVG_NSTATS);
1150 	ASSERT(MUTEX_HELD(&cpu_lock));
1151 
1152 	cp = cpupart_find(psid);
1153 	if (cp == NULL)
1154 		return (EINVAL);
1155 	for (i = 0; i < nelem; i++)
1156 		buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1157 
1158 	return (0);
1159 }
1160 
1161 
1162 uint_t
1163 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1164 {
1165 	uint_t numpart = 0;
1166 	cpupart_t *cp;
1167 
1168 	ASSERT(MUTEX_HELD(&cpu_lock));
1169 	ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1170 
1171 	if (list != NULL) {
1172 		cp = cp_list_head;
1173 		do {
1174 			if (((flag == CP_ALL) && (cp != &cp_default)) ||
1175 			    ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1176 				if (numpart == nelem)
1177 					break;
1178 				list[numpart++] = CPTOPS(cp->cp_id);
1179 			}
1180 			cp = cp->cp_next;
1181 		} while (cp != cp_list_head);
1182 	}
1183 
1184 	ASSERT(numpart < cp_numparts);
1185 
1186 	if (flag == CP_ALL)
1187 		numpart = cp_numparts - 1; /* leave out default partition */
1188 	else if (flag == CP_NONEMPTY)
1189 		numpart = cp_numparts_nonempty;
1190 
1191 	return (numpart);
1192 }
1193 
1194 int
1195 cpupart_setattr(psetid_t psid, uint_t attr)
1196 {
1197 	cpupart_t *cp;
1198 
1199 	ASSERT(pool_lock_held());
1200 
1201 	mutex_enter(&cpu_lock);
1202 	if ((cp = cpupart_find(psid)) == NULL) {
1203 		mutex_exit(&cpu_lock);
1204 		return (EINVAL);
1205 	}
1206 	/*
1207 	 * PSET_NOESCAPE attribute for default cpu partition is always set
1208 	 */
1209 	if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1210 		mutex_exit(&cpu_lock);
1211 		return (EINVAL);
1212 	}
1213 	cp->cp_attr = attr;
1214 	mutex_exit(&cpu_lock);
1215 	return (0);
1216 }
1217 
1218 int
1219 cpupart_getattr(psetid_t psid, uint_t *attrp)
1220 {
1221 	cpupart_t *cp;
1222 
1223 	mutex_enter(&cpu_lock);
1224 	if ((cp = cpupart_find(psid)) == NULL) {
1225 		mutex_exit(&cpu_lock);
1226 		return (EINVAL);
1227 	}
1228 	*attrp = cp->cp_attr;
1229 	mutex_exit(&cpu_lock);
1230 	return (0);
1231 }
1232