1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/types.h>
26 #include <sys/systm.h>
27 #include <sys/cmn_err.h>
28 #include <sys/cpuvar.h>
29 #include <sys/thread.h>
30 #include <sys/disp.h>
31 #include <sys/kmem.h>
32 #include <sys/debug.h>
33 #include <sys/cpupart.h>
34 #include <sys/pset.h>
35 #include <sys/var.h>
36 #include <sys/cyclic.h>
37 #include <sys/lgrp.h>
38 #include <sys/pghw.h>
39 #include <sys/loadavg.h>
40 #include <sys/class.h>
41 #include <sys/fss.h>
42 #include <sys/pool.h>
43 #include <sys/pool_pset.h>
44 #include <sys/policy.h>
45
46 /*
47 * Calling pool_lock() protects the pools configuration, which includes
48 * CPU partitions. cpu_lock protects the CPU partition list, and prevents
49 * partitions from being created or destroyed while the lock is held.
50 * The lock ordering with respect to related locks is:
51 *
52 * pool_lock() ---> cpu_lock ---> pidlock --> p_lock
53 *
54 * Blocking memory allocations may be made while holding "pool_lock"
55 * or cpu_lock.
56 */
57
/*
 * The cp_default partition is allocated statically, but its lgroup load average
 * (lpl) list is allocated dynamically after the kmem subsystem is initialized.
 * This saves some memory since the space allocated reflects the actual number
 * of lgroups supported by the platform. The lgrp facility provides a temporary
 * space to hold lpl information during system bootstrap.
 */
65
/* Head of the circular, doubly-linked (cp_next/cp_prev) partition list. */
cpupart_t *cp_list_head;
/* The default partition; set up by cpupart_initialize_default(). */
cpupart_t cp_default;
/* First partition ID to try when the next partition is created. */
static cpupartid_t cp_id_next;
/* Number of partitions that currently exist, cp_default included. */
uint_t cp_numparts;
/* Number of partitions that currently contain at least one CPU. */
uint_t cp_numparts_nonempty;
71
/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much. The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds). The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * If this is still zero when cpupart_initialize_default() runs, it is
 * set to max_ncpus * 2 + 1; a non-zero value (e.g. from /etc/system)
 * is left alone. cpupart_create() fails with ENOMEM once cp_numparts
 * reaches this limit.
 */
static uint_t cp_max_numparts;
81
82 /*
83 * Processor sets and CPU partitions are different but related concepts.
84 * A processor set is a user-level abstraction allowing users to create
85 * sets of CPUs and bind threads exclusively to those sets. A CPU
86 * partition is a kernel dispatcher object consisting of a set of CPUs
87 * and a global dispatch queue. The processor set abstraction is
88 * implemented via a CPU partition, and currently there is a 1-1
89 * mapping between processor sets and partitions (excluding the default
90 * partition, which is not visible as a processor set). Hence, the
91 * numbering for processor sets and CPU partitions is identical. This
92 * may not always be true in the future, and these macros could become
93 * less trivial if we support e.g. a processor set containing multiple
94 * CPU partitions.
95 */
/* Processor set ID -> partition ID; PS_NONE maps to CP_DEFAULT. */
#define PSTOCP(psid) ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* Partition ID -> processor set ID; CP_DEFAULT maps to PS_NONE. */
#define CPTOPS(cpid) ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

/* Defined further down; declared here because cpupart_move_cpu() calls it. */
static int cpupart_unbind_threads(cpupart_t *, boolean_t);
100
101 /*
102 * Find a CPU partition given a processor set ID.
103 */
104 static cpupart_t *
cpupart_find_all(psetid_t psid)105 cpupart_find_all(psetid_t psid)
106 {
107 cpupart_t *cp;
108 cpupartid_t cpid = PSTOCP(psid);
109
110 ASSERT(MUTEX_HELD(&cpu_lock));
111
112 /* default partition not visible as a processor set */
113 if (psid == CP_DEFAULT)
114 return (NULL);
115
116 if (psid == PS_MYID)
117 return (curthread->t_cpupart);
118
119 cp = cp_list_head;
120 do {
121 if (cp->cp_id == cpid)
122 return (cp);
123 cp = cp->cp_next;
124 } while (cp != cp_list_head);
125 return (NULL);
126 }
127
128 /*
129 * Find a CPU partition given a processor set ID if the processor set
130 * should be visible from the calling zone.
131 */
132 cpupart_t *
cpupart_find(psetid_t psid)133 cpupart_find(psetid_t psid)
134 {
135 cpupart_t *cp;
136
137 ASSERT(MUTEX_HELD(&cpu_lock));
138 cp = cpupart_find_all(psid);
139 if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
140 zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
141 return (NULL);
142 return (cp);
143 }
144
145 static int
cpupart_kstat_update(kstat_t * ksp,int rw)146 cpupart_kstat_update(kstat_t *ksp, int rw)
147 {
148 cpupart_t *cp = (cpupart_t *)ksp->ks_private;
149 cpupart_kstat_t *cpksp = ksp->ks_data;
150
151 if (rw == KSTAT_WRITE)
152 return (EACCES);
153
154 cpksp->cpk_updates.value.ui64 = cp->cp_updates;
155 cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
156 cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
157 cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
158 cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
159 (16 - FSHIFT);
160 cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
161 (16 - FSHIFT);
162 cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
163 (16 - FSHIFT);
164 return (0);
165 }
166
167 static void
cpupart_kstat_create(cpupart_t * cp)168 cpupart_kstat_create(cpupart_t *cp)
169 {
170 kstat_t *ksp;
171 zoneid_t zoneid;
172
173 ASSERT(MUTEX_HELD(&cpu_lock));
174
175 /*
176 * We have a bit of a chicken-egg problem since this code will
177 * get called to create the kstats for CP_DEFAULT before the
178 * pools framework gets initialized. We circumvent the problem
179 * by special-casing cp_default.
180 */
181 if (cp != &cp_default && pool_pset_enabled())
182 zoneid = GLOBAL_ZONEID;
183 else
184 zoneid = ALL_ZONES;
185 ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
186 KSTAT_TYPE_NAMED,
187 sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
188 if (ksp != NULL) {
189 cpupart_kstat_t *cpksp = ksp->ks_data;
190
191 kstat_named_init(&cpksp->cpk_updates, "updates",
192 KSTAT_DATA_UINT64);
193 kstat_named_init(&cpksp->cpk_runnable, "runnable",
194 KSTAT_DATA_UINT64);
195 kstat_named_init(&cpksp->cpk_waiting, "waiting",
196 KSTAT_DATA_UINT64);
197 kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
198 KSTAT_DATA_UINT32);
199 kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
200 KSTAT_DATA_UINT32);
201 kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
202 KSTAT_DATA_UINT32);
203 kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
204 KSTAT_DATA_UINT32);
205
206 ksp->ks_update = cpupart_kstat_update;
207 ksp->ks_private = cp;
208
209 kstat_install(ksp);
210 }
211 cp->cp_kstat = ksp;
212 }
213
214 /*
215 * Initialize the cpupart's lgrp partions (lpls)
216 */
217 static void
cpupart_lpl_initialize(cpupart_t * cp)218 cpupart_lpl_initialize(cpupart_t *cp)
219 {
220 int i, sz;
221
222 sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
223 cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
224
225 for (i = 0; i < sz; i++) {
226 /*
227 * The last entry of the lpl's resource set is always NULL
228 * by design (to facilitate iteration)...hence the "oversizing"
229 * by 1.
230 */
231 cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
232 cp->cp_lgrploads[i].lpl_rset =
233 kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
234 cp->cp_lgrploads[i].lpl_id2rset =
235 kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
236 cp->cp_lgrploads[i].lpl_lgrpid = i;
237 }
238 }
239
240 /*
241 * Teardown the cpupart's lgrp partitions
242 */
243 static void
cpupart_lpl_teardown(cpupart_t * cp)244 cpupart_lpl_teardown(cpupart_t *cp)
245 {
246 int i, sz;
247 lpl_t *lpl;
248
249 for (i = 0; i < cp->cp_nlgrploads; i++) {
250 lpl = &cp->cp_lgrploads[i];
251
252 sz = lpl->lpl_rset_sz;
253 kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
254 kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
255 lpl->lpl_rset = NULL;
256 lpl->lpl_id2rset = NULL;
257 }
258 kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
259 cp->cp_lgrploads = NULL;
260 }
261
262 /*
263 * Initialize the default partition and kpreempt disp queue.
264 */
265 void
cpupart_initialize_default(void)266 cpupart_initialize_default(void)
267 {
268 lgrp_id_t i;
269
270 cp_list_head = &cp_default;
271 cp_default.cp_next = &cp_default;
272 cp_default.cp_prev = &cp_default;
273 cp_default.cp_id = CP_DEFAULT;
274 cp_default.cp_kp_queue.disp_maxrunpri = -1;
275 cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
276 cp_default.cp_kp_queue.disp_cpu = NULL;
277 cp_default.cp_gen = 0;
278 cp_default.cp_loadavg.lg_cur = 0;
279 cp_default.cp_loadavg.lg_len = 0;
280 cp_default.cp_loadavg.lg_total = 0;
281 for (i = 0; i < S_LOADAVG_SZ; i++) {
282 cp_default.cp_loadavg.lg_loads[i] = 0;
283 }
284 DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
285 cp_id_next = CP_DEFAULT + 1;
286 cpupart_kstat_create(&cp_default);
287 cp_numparts = 1;
288 if (cp_max_numparts == 0) /* allow for /etc/system tuning */
289 cp_max_numparts = max_ncpus * 2 + 1;
290 /*
291 * Allocate space for cp_default list of lgrploads
292 */
293 cpupart_lpl_initialize(&cp_default);
294
295 /*
296 * The initial lpl topology is created in a special lpl list
297 * lpl_bootstrap. It should be copied to cp_default.
298 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
299 * to the correct lpl in the cp_default.cp_lgrploads list.
300 */
301 lpl_topo_bootstrap(cp_default.cp_lgrploads,
302 cp_default.cp_nlgrploads);
303
304
305 cp_default.cp_attr = PSET_NOESCAPE;
306 cp_numparts_nonempty = 1;
307 /*
308 * Set t0's home
309 */
310 t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
311
312 bitset_init(&cp_default.cp_cmt_pgs);
313 bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);
314
315 bitset_resize(&cp_default.cp_haltset, max_ncpus);
316 }
317
318
/*
 * Move CPU `cp' out of its current partition and into `newpp'.
 *
 * The caller must hold cpu_lock.  If `forced' is non-zero, cpu_unbind() is
 * asked to unbind every thread from the CPU; otherwise only soft-bound
 * threads are unbound.
 *
 * Returns 0 on success (including the case where the CPU is already in
 * newpp, which is a no-op), EBUSY if threads remain bound to the old
 * partition or to the CPU or if cyclic_move_out() refuses, and otherwise
 * the error reported by cpu_unbind().  Every failure path re-issues the
 * CPU_CPUPART_IN notification, and the failure paths that run after
 * pg_cpupart_out()/pg_cpupart_in() reverse those calls and clear
 * cpu_inmotion.
 */
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition. Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it. Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the processor
		 * set. Note that no threads should be bound to this CPU since
		 * cpupart_move_threads will refuse to move bound threads to
		 * other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If forced flag is set unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions. This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 * Give up with EBUSY once the check has still found bound
		 * threads after five delay(1) waits.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp, NULL);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above. Check
		 * again.
		 *
		 * On a retry the CPUs are restarted first; control then
		 * returns to "again", so cyclic_move_out() is called once
		 * more before the CPUs are paused again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	/* remember the lpl the CPU had in its old partition */
	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		/* unlink cp from the old partition's circular CPU list */
		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		/* cp was the last CPU: the old partition is now empty */
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		/* first CPU in the new partition: a one-element list */
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		/* link cp in just before the current list head */
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp,
					    t->t_lpl, t->t_pri, NULL);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 */

			/*
			 * NOTE(review): the comment above describes the case
			 * where no such threads were found, but the test
			 * below fires when lgrp_diff_lpl is non-zero --
			 * confirm which of the two is intended.
			 */
			if (lgrp_diff_lpl)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl,
				    t->t_pri, NULL);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}
661
662 /*
663 * Check if thread can be moved to a new cpu partition. Called by
664 * cpupart_move_thread() and pset_bind_start().
665 */
666 int
cpupart_movable_thread(kthread_id_t tp,cpupart_t * cp,int ignore)667 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
668 {
669 ASSERT(MUTEX_HELD(&cpu_lock));
670 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
671 ASSERT(cp != NULL);
672 ASSERT(THREAD_LOCK_HELD(tp));
673
674 /*
675 * CPU-bound threads can't be moved.
676 */
677 if (!ignore) {
678 cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
679 tp->t_weakbound_cpu;
680 if (boundcpu != NULL && boundcpu->cpu_part != cp)
681 return (EBUSY);
682 }
683
684 if (tp->t_cid == sysdccid) {
685 return (EINVAL); /* For now, sysdc threads can't move */
686 }
687
688 return (0);
689 }
690
691 /*
692 * Move thread to new partition. If ignore is non-zero, then CPU
693 * bindings should be ignored (this is used when destroying a
694 * partition).
695 */
696 static int
cpupart_move_thread(kthread_id_t tp,cpupart_t * newpp,int ignore,void * projbuf,void * zonebuf)697 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
698 void *projbuf, void *zonebuf)
699 {
700 cpupart_t *oldpp = tp->t_cpupart;
701 int ret;
702
703 ASSERT(MUTEX_HELD(&cpu_lock));
704 ASSERT(MUTEX_HELD(&pidlock));
705 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
706 ASSERT(newpp != NULL);
707
708 if (newpp->cp_cpulist == NULL)
709 return (EINVAL);
710
711 /*
712 * Check for errors first.
713 */
714 thread_lock(tp);
715 if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
716 thread_unlock(tp);
717 return (ret);
718 }
719
720 /* move the thread */
721 if (oldpp != newpp) {
722 /*
723 * Make the thread switch to the new partition.
724 */
725 tp->t_cpupart = newpp;
726 ASSERT(tp->t_lpl != NULL);
727 /*
728 * Leave the thread on the same lgroup if possible; otherwise
729 * choose a new lgroup for it. In either case, update its
730 * t_lpl.
731 */
732 if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
733 tp->t_lgrp_affinity == NULL) {
734 /*
735 * The thread's lgroup has CPUs in the thread's new
736 * partition, so the thread can stay assigned to the
737 * same lgroup. Update its t_lpl to point to the
738 * lpl_t for its lgroup in its new partition.
739 */
740 lgrp_move_thread(tp, &tp->t_cpupart->\
741 cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
742 } else {
743 /*
744 * The thread's lgroup has no cpus in its new
745 * partition or it has specified lgroup affinities,
746 * so choose the best lgroup for the thread and
747 * assign it to that lgroup.
748 */
749 lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
750 1);
751 }
752 /*
753 * make sure lpl points to our own partition
754 */
755 ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
756 (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
757 tp->t_cpupart->cp_nlgrploads));
758
759 ASSERT(tp->t_lpl->lpl_ncpu > 0);
760
761 if (tp->t_state == TS_ONPROC) {
762 cpu_surrender(tp);
763 } else if (tp->t_state == TS_RUN) {
764 (void) dispdeq(tp);
765 setbackdq(tp);
766 }
767 }
768
769 /*
770 * Our binding has changed; set TP_CHANGEBIND.
771 */
772 tp->t_proc_flag |= TP_CHANGEBIND;
773 aston(tp);
774
775 thread_unlock(tp);
776 fss_changepset(tp, newpp, projbuf, zonebuf);
777
778 return (0); /* success */
779 }
780
781
782 /*
783 * This function binds a thread to a partition. Must be called with the
784 * p_lock of the containing process held (to keep the thread from going
785 * away), and thus also with cpu_lock held (since cpu_lock must be
786 * acquired before p_lock). If ignore is non-zero, then CPU bindings
787 * should be ignored (this is used when destroying a partition).
788 */
789 int
cpupart_bind_thread(kthread_id_t tp,psetid_t psid,int ignore,void * projbuf,void * zonebuf)790 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
791 void *zonebuf)
792 {
793 cpupart_t *newpp;
794
795 ASSERT(pool_lock_held());
796 ASSERT(MUTEX_HELD(&cpu_lock));
797 ASSERT(MUTEX_HELD(&pidlock));
798 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
799
800 if (psid == PS_NONE)
801 newpp = &cp_default;
802 else {
803 newpp = cpupart_find(psid);
804 if (newpp == NULL) {
805 return (EINVAL);
806 }
807 }
808 return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
809 }
810
811
812 /*
813 * Create a new partition. On MP systems, this also allocates a
814 * kpreempt disp queue for that partition.
815 */
816 int
cpupart_create(psetid_t * psid)817 cpupart_create(psetid_t *psid)
818 {
819 cpupart_t *pp;
820
821 ASSERT(pool_lock_held());
822
823 pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
824 pp->cp_nlgrploads = lgrp_plat_max_lgrps();
825 pp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * pp->cp_nlgrploads,
826 KM_SLEEP);
827
828 mutex_enter(&cpu_lock);
829 if (cp_numparts == cp_max_numparts) {
830 mutex_exit(&cpu_lock);
831 kmem_free(pp->cp_lgrploads, sizeof (lpl_t) * pp->cp_nlgrploads);
832 pp->cp_lgrploads = NULL;
833 kmem_free(pp, sizeof (cpupart_t));
834 return (ENOMEM);
835 }
836 cp_numparts++;
837 /* find the next free partition ID */
838 while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
839 cp_id_next++;
840 pp->cp_id = cp_id_next++;
841 pp->cp_ncpus = 0;
842 pp->cp_cpulist = NULL;
843 pp->cp_attr = 0;
844 klgrpset_clear(pp->cp_lgrpset);
845 pp->cp_kp_queue.disp_maxrunpri = -1;
846 pp->cp_kp_queue.disp_max_unbound_pri = -1;
847 pp->cp_kp_queue.disp_cpu = NULL;
848 pp->cp_gen = 0;
849 DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
850 *psid = CPTOPS(pp->cp_id);
851 disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
852 cpupart_kstat_create(pp);
853 cpupart_lpl_initialize(pp);
854
855 bitset_init(&pp->cp_cmt_pgs);
856
857 /*
858 * Initialize and size the partition's bitset of halted CPUs.
859 */
860 bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
861 bitset_resize(&pp->cp_haltset, max_ncpus);
862
863 /*
864 * Pause all CPUs while changing the partition list, to make sure
865 * the clock thread (which traverses the list without holding
866 * cpu_lock) isn't running.
867 */
868 pause_cpus(NULL, NULL);
869 pp->cp_next = cp_list_head;
870 pp->cp_prev = cp_list_head->cp_prev;
871 cp_list_head->cp_prev->cp_next = pp;
872 cp_list_head->cp_prev = pp;
873 start_cpus();
874 mutex_exit(&cpu_lock);
875
876 return (0);
877 }
878
879 /*
880 * Move threads from specified partition to cp_default. If `force' is specified,
881 * move all threads, otherwise move only soft-bound threads.
882 */
883 static int
cpupart_unbind_threads(cpupart_t * pp,boolean_t unbind_all)884 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
885 {
886 void *projbuf, *zonebuf;
887 kthread_t *t;
888 proc_t *p;
889 int err = 0;
890 psetid_t psid = pp->cp_id;
891
892 ASSERT(pool_lock_held());
893 ASSERT(MUTEX_HELD(&cpu_lock));
894
895 if (pp == NULL || pp == &cp_default) {
896 return (EINVAL);
897 }
898
899 /*
900 * Pre-allocate enough buffers for FSS for all active projects and
901 * for all active zones on the system. Unused buffers will be
902 * freed later by fss_freebuf().
903 */
904 projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
905 zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
906
907 mutex_enter(&pidlock);
908 t = curthread;
909 do {
910 if (t->t_bind_pset == psid) {
911 again: p = ttoproc(t);
912 mutex_enter(&p->p_lock);
913 if (ttoproc(t) != p) {
914 /*
915 * lwp_exit has changed this thread's process
916 * pointer before we grabbed its p_lock.
917 */
918 mutex_exit(&p->p_lock);
919 goto again;
920 }
921
922 /*
923 * Can only unbind threads which have revocable binding
924 * unless force unbinding requested.
925 */
926 if (unbind_all || TB_PSET_IS_SOFT(t)) {
927 err = cpupart_bind_thread(t, PS_NONE, 1,
928 projbuf, zonebuf);
929 if (err) {
930 mutex_exit(&p->p_lock);
931 mutex_exit(&pidlock);
932 fss_freebuf(projbuf, FSS_ALLOC_PROJ);
933 fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
934 return (err);
935 }
936 t->t_bind_pset = PS_NONE;
937 }
938 mutex_exit(&p->p_lock);
939 }
940 t = t->t_next;
941 } while (t != curthread);
942
943 mutex_exit(&pidlock);
944 fss_freebuf(projbuf, FSS_ALLOC_PROJ);
945 fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
946 return (err);
947 }
948
949 /*
950 * Destroy a partition.
951 */
952 int
cpupart_destroy(psetid_t psid)953 cpupart_destroy(psetid_t psid)
954 {
955 cpu_t *cp, *first_cp;
956 cpupart_t *pp, *newpp;
957 int err = 0;
958
959 ASSERT(pool_lock_held());
960 mutex_enter(&cpu_lock);
961
962 pp = cpupart_find(psid);
963 if (pp == NULL || pp == &cp_default) {
964 mutex_exit(&cpu_lock);
965 return (EINVAL);
966 }
967
968 /*
969 * Unbind all the threads currently bound to the partition.
970 */
971 err = cpupart_unbind_threads(pp, B_TRUE);
972 if (err) {
973 mutex_exit(&cpu_lock);
974 return (err);
975 }
976
977 newpp = &cp_default;
978 while ((cp = pp->cp_cpulist) != NULL) {
979 if (err = cpupart_move_cpu(cp, newpp, 0)) {
980 mutex_exit(&cpu_lock);
981 return (err);
982 }
983 }
984
985 ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
986 ASSERT(bitset_is_null(&pp->cp_haltset));
987
988 /*
989 * Teardown the partition's group of active CMT PGs and halted
990 * CPUs now that they have all left.
991 */
992 bitset_fini(&pp->cp_cmt_pgs);
993 bitset_fini(&pp->cp_haltset);
994
995 /*
996 * Reset the pointers in any offline processors so they won't
997 * try to rejoin the destroyed partition when they're turned
998 * online.
999 */
1000 first_cp = cp = CPU;
1001 do {
1002 if (cp->cpu_part == pp) {
1003 ASSERT(cp->cpu_flags & CPU_OFFLINE);
1004 cp->cpu_part = newpp;
1005 }
1006 cp = cp->cpu_next;
1007 } while (cp != first_cp);
1008
1009 /*
1010 * Pause all CPUs while changing the partition list, to make sure
1011 * the clock thread (which traverses the list without holding
1012 * cpu_lock) isn't running.
1013 */
1014 pause_cpus(NULL, NULL);
1015 pp->cp_prev->cp_next = pp->cp_next;
1016 pp->cp_next->cp_prev = pp->cp_prev;
1017 if (cp_list_head == pp)
1018 cp_list_head = pp->cp_next;
1019 start_cpus();
1020
1021 if (cp_id_next > pp->cp_id)
1022 cp_id_next = pp->cp_id;
1023
1024 if (pp->cp_kstat)
1025 kstat_delete(pp->cp_kstat);
1026
1027 cp_numparts--;
1028
1029 disp_kp_free(&pp->cp_kp_queue);
1030
1031 cpupart_lpl_teardown(pp);
1032
1033 kmem_free(pp, sizeof (cpupart_t));
1034 mutex_exit(&cpu_lock);
1035
1036 return (err);
1037 }
1038
1039
1040 /*
1041 * Return the ID of the partition to which the specified processor belongs.
1042 */
1043 psetid_t
cpupart_query_cpu(cpu_t * cp)1044 cpupart_query_cpu(cpu_t *cp)
1045 {
1046 ASSERT(MUTEX_HELD(&cpu_lock));
1047
1048 return (CPTOPS(cp->cpu_part->cp_id));
1049 }
1050
1051
1052 /*
1053 * Attach a processor to an existing partition.
1054 */
1055 int
cpupart_attach_cpu(psetid_t psid,cpu_t * cp,int forced)1056 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1057 {
1058 cpupart_t *pp;
1059 int err;
1060
1061 ASSERT(pool_lock_held());
1062 ASSERT(MUTEX_HELD(&cpu_lock));
1063
1064 pp = cpupart_find(psid);
1065 if (pp == NULL)
1066 return (EINVAL);
1067 if (cp->cpu_flags & CPU_OFFLINE)
1068 return (EINVAL);
1069
1070 err = cpupart_move_cpu(cp, pp, forced);
1071 return (err);
1072 }
1073
1074 /*
1075 * Get a list of cpus belonging to the partition. If numcpus is NULL,
1076 * this just checks for a valid partition. If numcpus is non-NULL but
1077 * cpulist is NULL, the current number of cpus is stored in *numcpus.
1078 * If both are non-NULL, the current number of cpus is stored in *numcpus,
1079 * and a list of those cpus up to the size originally in *numcpus is
1080 * stored in cpulist[]. Also, store the processor set id in *psid.
1081 * This is useful in case the processor set id passed in was PS_MYID.
1082 */
1083 int
cpupart_get_cpus(psetid_t * psid,processorid_t * cpulist,uint_t * numcpus)1084 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1085 {
1086 cpupart_t *pp;
1087 uint_t ncpus;
1088 cpu_t *c;
1089 int i;
1090
1091 mutex_enter(&cpu_lock);
1092 pp = cpupart_find(*psid);
1093 if (pp == NULL) {
1094 mutex_exit(&cpu_lock);
1095 return (EINVAL);
1096 }
1097 *psid = CPTOPS(pp->cp_id);
1098 ncpus = pp->cp_ncpus;
1099 if (numcpus) {
1100 if (ncpus > *numcpus) {
1101 /*
1102 * Only copy as many cpus as were passed in, but
1103 * pass back the real number.
1104 */
1105 uint_t t = ncpus;
1106 ncpus = *numcpus;
1107 *numcpus = t;
1108 } else
1109 *numcpus = ncpus;
1110
1111 if (cpulist) {
1112 c = pp->cp_cpulist;
1113 for (i = 0; i < ncpus; i++) {
1114 ASSERT(c != NULL);
1115 cpulist[i] = c->cpu_id;
1116 c = c->cpu_next_part;
1117 }
1118 }
1119 }
1120 mutex_exit(&cpu_lock);
1121 return (0);
1122 }
1123
1124 /*
1125 * Reallocate kpreempt queues for each CPU partition. Called from
1126 * disp_setup when a new scheduling class is loaded that increases the
1127 * number of priorities in the system.
1128 */
1129 void
cpupart_kpqalloc(pri_t npri)1130 cpupart_kpqalloc(pri_t npri)
1131 {
1132 cpupart_t *cpp;
1133
1134 ASSERT(MUTEX_HELD(&cpu_lock));
1135 cpp = cp_list_head;
1136 do {
1137 disp_kp_alloc(&cpp->cp_kp_queue, npri);
1138 cpp = cpp->cp_next;
1139 } while (cpp != cp_list_head);
1140 }
1141
1142 int
cpupart_get_loadavg(psetid_t psid,int * buf,int nelem)1143 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1144 {
1145 cpupart_t *cp;
1146 int i;
1147
1148 ASSERT(nelem >= 0);
1149 ASSERT(nelem <= LOADAVG_NSTATS);
1150 ASSERT(MUTEX_HELD(&cpu_lock));
1151
1152 cp = cpupart_find(psid);
1153 if (cp == NULL)
1154 return (EINVAL);
1155 for (i = 0; i < nelem; i++)
1156 buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1157
1158 return (0);
1159 }
1160
1161
1162 uint_t
cpupart_list(psetid_t * list,uint_t nelem,int flag)1163 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1164 {
1165 uint_t numpart = 0;
1166 cpupart_t *cp;
1167
1168 ASSERT(MUTEX_HELD(&cpu_lock));
1169 ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1170
1171 if (list != NULL) {
1172 cp = cp_list_head;
1173 do {
1174 if (((flag == CP_ALL) && (cp != &cp_default)) ||
1175 ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1176 if (numpart == nelem)
1177 break;
1178 list[numpart++] = CPTOPS(cp->cp_id);
1179 }
1180 cp = cp->cp_next;
1181 } while (cp != cp_list_head);
1182 }
1183
1184 ASSERT(numpart < cp_numparts);
1185
1186 if (flag == CP_ALL)
1187 numpart = cp_numparts - 1; /* leave out default partition */
1188 else if (flag == CP_NONEMPTY)
1189 numpart = cp_numparts_nonempty;
1190
1191 return (numpart);
1192 }
1193
1194 int
cpupart_setattr(psetid_t psid,uint_t attr)1195 cpupart_setattr(psetid_t psid, uint_t attr)
1196 {
1197 cpupart_t *cp;
1198
1199 ASSERT(pool_lock_held());
1200
1201 mutex_enter(&cpu_lock);
1202 if ((cp = cpupart_find(psid)) == NULL) {
1203 mutex_exit(&cpu_lock);
1204 return (EINVAL);
1205 }
1206 /*
1207 * PSET_NOESCAPE attribute for default cpu partition is always set
1208 */
1209 if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1210 mutex_exit(&cpu_lock);
1211 return (EINVAL);
1212 }
1213 cp->cp_attr = attr;
1214 mutex_exit(&cpu_lock);
1215 return (0);
1216 }
1217
1218 int
cpupart_getattr(psetid_t psid,uint_t * attrp)1219 cpupart_getattr(psetid_t psid, uint_t *attrp)
1220 {
1221 cpupart_t *cp;
1222
1223 mutex_enter(&cpu_lock);
1224 if ((cp = cpupart_find(psid)) == NULL) {
1225 mutex_exit(&cpu_lock);
1226 return (EINVAL);
1227 }
1228 *attrp = cp->cp_attr;
1229 mutex_exit(&cpu_lock);
1230 return (0);
1231 }
1232