1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved.
23 *
24 * Copyright 2018 Joyent, Inc.
25 * Copyright (c) 2017 by Delphix. All rights reserved.
26 */
27
28 #include <sys/types.h>
29 #include <sys/systm.h>
30 #include <sys/cmn_err.h>
31 #include <sys/cpuvar.h>
32 #include <sys/thread.h>
33 #include <sys/disp.h>
34 #include <sys/kmem.h>
35 #include <sys/debug.h>
36 #include <sys/cpupart.h>
37 #include <sys/pset.h>
38 #include <sys/var.h>
39 #include <sys/cyclic.h>
40 #include <sys/lgrp.h>
41 #include <sys/pghw.h>
42 #include <sys/loadavg.h>
43 #include <sys/class.h>
44 #include <sys/fss.h>
45 #include <sys/pool.h>
46 #include <sys/pool_pset.h>
47 #include <sys/policy.h>
48
49 /*
50 * Calling pool_lock() protects the pools configuration, which includes
51 * CPU partitions. cpu_lock protects the CPU partition list, and prevents
52 * partitions from being created or destroyed while the lock is held.
53 * The lock ordering with respect to related locks is:
54 *
55 * pool_lock() ---> cpu_lock ---> pidlock --> p_lock
56 *
57 * Blocking memory allocations may be made while holding "pool_lock"
58 * or cpu_lock.
59 */
60
61 /*
62 * The cp_default partition is allocated statically, but its lgroup load average
63 * (lpl) list is allocated dynamically after kmem subsystem is initialized. This
64 * saves some memory since the space allocated reflects the actual number of
65 * lgroups supported by the platform. The lgrp facility provides a temporary
66 * space to hold lpl information during system bootstrap.
67 */
68
/*
 * Head of the circular, doubly-linked list of CPU partitions (linked
 * through cp_next/cp_prev).  Set to &cp_default by
 * cpupart_initialize_default().
 */
cpupart_t *cp_list_head;
/* The default partition; cpupart_destroy() refuses to destroy it. */
cpupart_t cp_default;
/* Partition ID at which cpupart_create() starts looking for a free ID. */
static cpupartid_t cp_id_next;
/* Number of partitions currently on the list, including cp_default. */
uint_t cp_numparts;
/* Number of partitions that currently have at least one CPU. */
uint_t cp_numparts_nonempty;

/*
 * Need to limit total number of partitions to avoid slowing down the
 * clock code too much. The clock code traverses the list of
 * partitions and needs to be able to execute in a reasonable amount
 * of time (less than 1/hz seconds). The maximum is sized based on
 * max_ncpus so it shouldn't be a problem unless there are large
 * numbers of empty partitions.
 *
 * If this is still zero when cpupart_initialize_default() runs, it is
 * set to max_ncpus * 2 + 1; a non-zero preset value is left alone.
 */
static uint_t cp_max_numparts;
84
85 /*
86 * Processor sets and CPU partitions are different but related concepts.
87 * A processor set is a user-level abstraction allowing users to create
88 * sets of CPUs and bind threads exclusively to those sets. A CPU
89 * partition is a kernel dispatcher object consisting of a set of CPUs
90 * and a global dispatch queue. The processor set abstraction is
91 * implemented via a CPU partition, and currently there is a 1-1
92 * mapping between processor sets and partitions (excluding the default
93 * partition, which is not visible as a processor set). Hence, the
94 * numbering for processor sets and CPU partitions is identical. This
95 * may not always be true in the future, and these macros could become
96 * less trivial if we support e.g. a processor set containing multiple
97 * CPU partitions.
98 */
/* Processor set ID -> partition ID; PS_NONE maps to CP_DEFAULT. */
#define PSTOCP(psid) ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid)))
/* Partition ID -> processor set ID; CP_DEFAULT maps to PS_NONE. */
#define CPTOPS(cpid) ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid)))

/* Defined later in this file; cpupart_move_cpu() calls it first. */
static int cpupart_unbind_threads(cpupart_t *, boolean_t);
103
104 /*
105 * Find a CPU partition given a processor set ID.
106 */
107 static cpupart_t *
cpupart_find_all(psetid_t psid)108 cpupart_find_all(psetid_t psid)
109 {
110 cpupart_t *cp;
111 cpupartid_t cpid = PSTOCP(psid);
112
113 ASSERT(MUTEX_HELD(&cpu_lock));
114
115 /* default partition not visible as a processor set */
116 if (psid == CP_DEFAULT)
117 return (NULL);
118
119 if (psid == PS_MYID)
120 return (curthread->t_cpupart);
121
122 cp = cp_list_head;
123 do {
124 if (cp->cp_id == cpid)
125 return (cp);
126 cp = cp->cp_next;
127 } while (cp != cp_list_head);
128 return (NULL);
129 }
130
131 /*
132 * Find a CPU partition given a processor set ID if the processor set
133 * should be visible from the calling zone.
134 */
135 cpupart_t *
cpupart_find(psetid_t psid)136 cpupart_find(psetid_t psid)
137 {
138 cpupart_t *cp;
139
140 ASSERT(MUTEX_HELD(&cpu_lock));
141 cp = cpupart_find_all(psid);
142 if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() &&
143 zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id))
144 return (NULL);
145 return (cp);
146 }
147
148 static int
cpupart_kstat_update(kstat_t * ksp,int rw)149 cpupart_kstat_update(kstat_t *ksp, int rw)
150 {
151 cpupart_t *cp = (cpupart_t *)ksp->ks_private;
152 cpupart_kstat_t *cpksp = ksp->ks_data;
153
154 if (rw == KSTAT_WRITE)
155 return (EACCES);
156
157 cpksp->cpk_updates.value.ui64 = cp->cp_updates;
158 cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum;
159 cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum;
160 cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus;
161 cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >>
162 (16 - FSHIFT);
163 cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >>
164 (16 - FSHIFT);
165 cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >>
166 (16 - FSHIFT);
167 return (0);
168 }
169
170 static void
cpupart_kstat_create(cpupart_t * cp)171 cpupart_kstat_create(cpupart_t *cp)
172 {
173 kstat_t *ksp;
174 zoneid_t zoneid;
175
176 ASSERT(MUTEX_HELD(&cpu_lock));
177
178 /*
179 * We have a bit of a chicken-egg problem since this code will
180 * get called to create the kstats for CP_DEFAULT before the
181 * pools framework gets initialized. We circumvent the problem
182 * by special-casing cp_default.
183 */
184 if (cp != &cp_default && pool_pset_enabled())
185 zoneid = GLOBAL_ZONEID;
186 else
187 zoneid = ALL_ZONES;
188 ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc",
189 KSTAT_TYPE_NAMED,
190 sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid);
191 if (ksp != NULL) {
192 cpupart_kstat_t *cpksp = ksp->ks_data;
193
194 kstat_named_init(&cpksp->cpk_updates, "updates",
195 KSTAT_DATA_UINT64);
196 kstat_named_init(&cpksp->cpk_runnable, "runnable",
197 KSTAT_DATA_UINT64);
198 kstat_named_init(&cpksp->cpk_waiting, "waiting",
199 KSTAT_DATA_UINT64);
200 kstat_named_init(&cpksp->cpk_ncpus, "ncpus",
201 KSTAT_DATA_UINT32);
202 kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min",
203 KSTAT_DATA_UINT32);
204 kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min",
205 KSTAT_DATA_UINT32);
206 kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min",
207 KSTAT_DATA_UINT32);
208
209 ksp->ks_update = cpupart_kstat_update;
210 ksp->ks_private = cp;
211
212 kstat_install(ksp);
213 }
214 cp->cp_kstat = ksp;
215 }
216
217 /*
218 * Initialize the cpupart's lgrp partions (lpls)
219 */
220 static void
cpupart_lpl_initialize(cpupart_t * cp)221 cpupart_lpl_initialize(cpupart_t *cp)
222 {
223 int i, sz;
224
225 sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps();
226 cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP);
227
228 for (i = 0; i < sz; i++) {
229 /*
230 * The last entry of the lpl's resource set is always NULL
231 * by design (to facilitate iteration)...hence the "oversizing"
232 * by 1.
233 */
234 cp->cp_lgrploads[i].lpl_rset_sz = sz + 1;
235 cp->cp_lgrploads[i].lpl_rset =
236 kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP);
237 cp->cp_lgrploads[i].lpl_id2rset =
238 kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP);
239 cp->cp_lgrploads[i].lpl_lgrpid = i;
240 }
241 }
242
243 /*
244 * Teardown the cpupart's lgrp partitions
245 */
246 static void
cpupart_lpl_teardown(cpupart_t * cp)247 cpupart_lpl_teardown(cpupart_t *cp)
248 {
249 int i, sz;
250 lpl_t *lpl;
251
252 for (i = 0; i < cp->cp_nlgrploads; i++) {
253 lpl = &cp->cp_lgrploads[i];
254
255 sz = lpl->lpl_rset_sz;
256 kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz);
257 kmem_free(lpl->lpl_id2rset, sizeof (int) * sz);
258 lpl->lpl_rset = NULL;
259 lpl->lpl_id2rset = NULL;
260 }
261 kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads);
262 cp->cp_lgrploads = NULL;
263 }
264
265 /*
266 * Initialize the default partition and kpreempt disp queue.
267 */
268 void
cpupart_initialize_default(void)269 cpupart_initialize_default(void)
270 {
271 lgrp_id_t i;
272
273 cp_list_head = &cp_default;
274 cp_default.cp_next = &cp_default;
275 cp_default.cp_prev = &cp_default;
276 cp_default.cp_id = CP_DEFAULT;
277 cp_default.cp_kp_queue.disp_maxrunpri = -1;
278 cp_default.cp_kp_queue.disp_max_unbound_pri = -1;
279 cp_default.cp_kp_queue.disp_cpu = NULL;
280 cp_default.cp_gen = 0;
281 cp_default.cp_loadavg.lg_cur = 0;
282 cp_default.cp_loadavg.lg_len = 0;
283 cp_default.cp_loadavg.lg_total = 0;
284 for (i = 0; i < S_LOADAVG_SZ; i++) {
285 cp_default.cp_loadavg.lg_loads[i] = 0;
286 }
287 DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock);
288 cp_id_next = CP_DEFAULT + 1;
289 cpupart_kstat_create(&cp_default);
290 cp_numparts = 1;
291 if (cp_max_numparts == 0) /* allow for /etc/system tuning */
292 cp_max_numparts = max_ncpus * 2 + 1;
293 /*
294 * Allocate space for cp_default list of lgrploads
295 */
296 cpupart_lpl_initialize(&cp_default);
297
298 /*
299 * The initial lpl topology is created in a special lpl list
300 * lpl_bootstrap. It should be copied to cp_default.
301 * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point
302 * to the correct lpl in the cp_default.cp_lgrploads list.
303 */
304 lpl_topo_bootstrap(cp_default.cp_lgrploads,
305 cp_default.cp_nlgrploads);
306
307
308 cp_default.cp_attr = PSET_NOESCAPE;
309 cp_numparts_nonempty = 1;
310 /*
311 * Set t0's home
312 */
313 t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID];
314
315 bitset_init(&cp_default.cp_cmt_pgs);
316 bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout);
317
318 bitset_resize(&cp_default.cp_haltset, max_ncpus);
319 }
320
321
/*
 * Move CPU `cp' from its current partition into `newpp'.
 *
 * cp     - the CPU to move; cp->cpu_part must be a non-empty partition.
 * newpp  - destination partition (must not be NULL).
 * forced - if non-zero, cpu_unbind() is asked to unbind all threads from
 *          the CPU; otherwise only soft-bound threads are unbound.
 *
 * Returns 0 on success (including the no-op case where the CPU is
 * already in `newpp'), EBUSY if threads remain bound to the partition or
 * the CPU or if cyclic_move_out() refuses, or the error from
 * cpu_unbind().  On every failure path after the initial
 * CPU_CPUPART_OUT notification, a CPU_CPUPART_IN notification is sent
 * and any pg_cpupart_out()/pg_cpupart_in() calls already made are
 * reversed.
 *
 * The caller must hold cpu_lock.  NOTE(review): the last-CPU path calls
 * cpupart_unbind_threads(), which asserts pool_lock_held(), so callers
 * presumably hold the pool lock as well -- confirm against callers.
 */
static int
cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced)
{
	cpupart_t *oldpp;
	cpu_t *ncp, *newlist;
	kthread_t *t;
	int move_threads = 1;
	lgrp_id_t lgrpid;
	proc_t *p;
	int lgrp_diff_lpl;
	lpl_t *cpu_lpl;
	int ret;
	boolean_t unbind_all_threads = (forced != 0);

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(newpp != NULL);

	oldpp = cp->cpu_part;
	ASSERT(oldpp != NULL);
	ASSERT(oldpp->cp_ncpus > 0);

	if (newpp == oldpp) {
		/*
		 * Don't need to do anything.
		 */
		return (0);
	}

	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT);

	if (!disp_bound_partition(cp, 0)) {
		/*
		 * Don't need to move threads if there are no threads in
		 * the partition.  Note that threads can't enter the
		 * partition while we're holding cpu_lock.
		 */
		move_threads = 0;
	} else if (oldpp->cp_ncpus == 1) {
		/*
		 * The last CPU is removed from a partition which has threads
		 * running in it. Some of these threads may be bound to this
		 * CPU.
		 *
		 * Attempt to unbind threads from the CPU and from the processor
		 * set. Note that no threads should be bound to this CPU since
		 * cpupart_move_threads will refuse to move bound threads to
		 * other CPUs.
		 */
		(void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE);
		(void) cpupart_unbind_threads(oldpp, B_FALSE);

		if (!disp_bound_partition(cp, 0)) {
			/*
			 * No bound threads in this partition any more
			 */
			move_threads = 0;
		} else {
			/*
			 * There are still threads bound to the partition
			 */
			cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
			return (EBUSY);
		}
	}

	/*
	 * If forced flag is set unbind any threads from this CPU.
	 * Otherwise unbind soft-bound threads only.
	 */
	if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) {
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		return (ret);
	}

	/*
	 * Stop further threads weak binding to this cpu.
	 */
	cpu_inmotion = cp;
	membar_enter();

	/*
	 * Notify the Processor Groups subsystem that the CPU
	 * will be moving cpu partitions. This is done before
	 * CPUs are paused to provide an opportunity for any
	 * needed memory allocations.
	 */
	pg_cpupart_out(cp, oldpp);
	pg_cpupart_in(cp, newpp);

again:
	if (move_threads) {
		int loop_count;
		/*
		 * Check for threads strong or weak bound to this CPU.
		 * Give up after five delay(1) waits and undo the PG
		 * notifications made above.
		 */
		for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) {
			if (loop_count >= 5) {
				cpu_state_change_notify(cp->cpu_id,
				    CPU_CPUPART_IN);
				pg_cpupart_out(cp, newpp);
				pg_cpupart_in(cp, oldpp);
				cpu_inmotion = NULL;
				return (EBUSY);	/* some threads still bound */
			}
			delay(1);
		}
	}

	/*
	 * Before we actually start changing data structures, notify
	 * the cyclic subsystem that we want to move this CPU out of its
	 * partition.
	 */
	if (!cyclic_move_out(cp)) {
		/*
		 * This CPU must be the last CPU in a processor set with
		 * a bound cyclic.
		 */
		cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);
		pg_cpupart_out(cp, newpp);
		pg_cpupart_in(cp, oldpp);
		cpu_inmotion = NULL;
		return (EBUSY);
	}

	pause_cpus(cp, NULL);

	if (move_threads) {
		/*
		 * The thread on cpu before the pause thread may have read
		 * cpu_inmotion before we raised the barrier above.  Check
		 * again.
		 */
		if (disp_bound_threads(cp, 1)) {
			start_cpus();
			goto again;
		}

	}

	/*
	 * Now that CPUs are paused, let the PG subsystem perform
	 * any necessary data structure updates.
	 */
	pg_cpupart_move(cp, oldpp, newpp);

	/* save this cpu's lgroup -- it'll be the same in the new partition */
	lgrpid = cp->cpu_lpl->lpl_lgrpid;

	/* remember the CPU's lpl before lgrp_config() is told it has left */
	cpu_lpl = cp->cpu_lpl;
	/*
	 * let the lgroup framework know cp has left the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid);

	/* move out of old partition */
	oldpp->cp_ncpus--;
	if (oldpp->cp_ncpus > 0) {

		/* unlink cp from the old partition's circular CPU list */
		ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part;
		cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part;
		if (oldpp->cp_cpulist == cp) {
			oldpp->cp_cpulist = ncp;
		}
	} else {
		/* cp was the last CPU; the old partition is now empty */
		ncp = oldpp->cp_cpulist = NULL;
		cp_numparts_nonempty--;
		ASSERT(cp_numparts_nonempty != 0);
	}
	oldpp->cp_gen++;

	/* move into new partition */
	newlist = newpp->cp_cpulist;
	if (newlist == NULL) {
		/* first CPU in the new partition: a one-element list */
		newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp;
		cp_numparts_nonempty++;
		ASSERT(cp_numparts_nonempty != 0);
	} else {
		/* insert cp just before the current list head */
		cp->cpu_next_part = newlist;
		cp->cpu_prev_part = newlist->cpu_prev_part;
		newlist->cpu_prev_part->cpu_next_part = cp;
		newlist->cpu_prev_part = cp;
	}
	cp->cpu_part = newpp;
	newpp->cp_ncpus++;
	newpp->cp_gen++;

	ASSERT(bitset_is_null(&newpp->cp_haltset));
	ASSERT(bitset_is_null(&oldpp->cp_haltset));

	/*
	 * let the lgroup framework know cp has entered the partition
	 */
	lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid);

	/*
	 * If necessary, move threads off processor.
	 */
	if (move_threads) {
		ASSERT(ncp != NULL);

		/*
		 * Walk thru the active process list to look for
		 * threads that need to have a new home lgroup,
		 * or the last CPU they run on is the same CPU
		 * being moved out of the partition.
		 */

		for (p = practive; p != NULL; p = p->p_next) {

			t = p->p_tlist;

			if (t == NULL)
				continue;

			lgrp_diff_lpl = 0;

			do {

				ASSERT(t->t_lpl != NULL);

				/*
				 * Update the count of how many threads are
				 * in this CPU's lgroup but have a different lpl
				 */

				if (t->t_lpl != cpu_lpl &&
				    t->t_lpl->lpl_lgrpid == lgrpid)
					lgrp_diff_lpl++;
				/*
				 * If the lgroup that t is assigned to no
				 * longer has any CPUs in t's partition,
				 * we'll have to choose a new lgroup for t.
				 */

				if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
				    t->t_cpupart)) {
					lgrp_move_thread(t,
					    lgrp_choose(t, t->t_cpupart), 0);
				}

				/*
				 * make sure lpl points to our own partition
				 */
				ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads &&
				    (t->t_lpl < t->t_cpupart->cp_lgrploads +
				    t->t_cpupart->cp_nlgrploads));

				ASSERT(t->t_lpl->lpl_ncpu > 0);

				/* Update CPU last ran on if it was this CPU */
				if (t->t_cpu == cp && t->t_cpupart == oldpp &&
				    t->t_bound_cpu != cp) {
					t->t_cpu = disp_lowpri_cpu(ncp, t,
					    t->t_pri);
				}
				t = t->t_forw;
			} while (t != p->p_tlist);

			/*
			 * Didn't find any threads in the same lgroup as this
			 * CPU with a different lpl, so remove the lgroup from
			 * the process lgroup bitmask.
			 *
			 * NOTE(review): the test below deletes the lgroup
			 * when lgrp_diff_lpl is NON-zero, i.e. when such
			 * threads WERE found, which is the opposite of what
			 * this comment describes.  One of the two looks
			 * wrong -- confirm the intended condition.
			 */

			if (lgrp_diff_lpl)
				klgrpset_del(p->p_lgrpset, lgrpid);
		}

		/*
		 * Walk thread list looking for threads that need to be
		 * rehomed, since there are some threads that are not in
		 * their process's p_tlist.
		 */

		t = curthread;

		do {
			ASSERT(t != NULL && t->t_lpl != NULL);

			/*
			 * If the lgroup that t is assigned to no
			 * longer has any CPUs in t's partition,
			 * we'll have to choose a new lgroup for t.
			 * Also, choose best lgroup for home when
			 * thread has specified lgroup affinities,
			 * since there may be an lgroup with more
			 * affinity available after moving CPUs
			 * around.
			 */
			if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid,
			    t->t_cpupart) || t->t_lgrp_affinity) {
				lgrp_move_thread(t,
				    lgrp_choose(t, t->t_cpupart), 1);
			}

			/* make sure lpl points to our own partition */
			ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) &&
			    (t->t_lpl < t->t_cpupart->cp_lgrploads +
			    t->t_cpupart->cp_nlgrploads));

			ASSERT(t->t_lpl->lpl_ncpu > 0);

			/* Update CPU last ran on if it was this CPU */
			if (t->t_cpu == cp && t->t_cpupart == oldpp &&
			    t->t_bound_cpu != cp) {
				t->t_cpu = disp_lowpri_cpu(ncp, t,
				    t->t_pri);
			}

			t = t->t_next;
		} while (t != curthread);

		/*
		 * Clear off the CPU's run queue, and the kp queue if the
		 * partition is now empty.
		 */
		disp_cpu_inactive(cp);

		/*
		 * Make cp switch to a thread from the new partition.
		 */
		cp->cpu_runrun = 1;
		cp->cpu_kprunrun = 1;
	}

	cpu_inmotion = NULL;
	start_cpus();

	/*
	 * Let anyone interested know that cpu has been added to the set.
	 */
	cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN);

	/*
	 * Now let the cyclic subsystem know that it can reshuffle cyclics
	 * bound to the new processor set.
	 */
	cyclic_move_in(cp);

	return (0);
}
664
665 /*
666 * Check if thread can be moved to a new cpu partition. Called by
667 * cpupart_move_thread() and pset_bind_start().
668 */
669 int
cpupart_movable_thread(kthread_id_t tp,cpupart_t * cp,int ignore)670 cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore)
671 {
672 ASSERT(MUTEX_HELD(&cpu_lock));
673 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
674 ASSERT(cp != NULL);
675 ASSERT(THREAD_LOCK_HELD(tp));
676
677 /*
678 * CPU-bound threads can't be moved.
679 */
680 if (!ignore) {
681 cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu :
682 tp->t_weakbound_cpu;
683 if (boundcpu != NULL && boundcpu->cpu_part != cp)
684 return (EBUSY);
685 }
686
687 if (tp->t_cid == sysdccid) {
688 return (EINVAL); /* For now, sysdc threads can't move */
689 }
690
691 return (0);
692 }
693
694 /*
695 * Move thread to new partition. If ignore is non-zero, then CPU
696 * bindings should be ignored (this is used when destroying a
697 * partition).
698 */
699 static int
cpupart_move_thread(kthread_id_t tp,cpupart_t * newpp,int ignore,void * projbuf,void * zonebuf)700 cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore,
701 void *projbuf, void *zonebuf)
702 {
703 cpupart_t *oldpp = tp->t_cpupart;
704 int ret;
705
706 ASSERT(MUTEX_HELD(&cpu_lock));
707 ASSERT(MUTEX_HELD(&pidlock));
708 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
709 ASSERT(newpp != NULL);
710
711 if (newpp->cp_cpulist == NULL)
712 return (EINVAL);
713
714 /*
715 * Check for errors first.
716 */
717 thread_lock(tp);
718 if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) {
719 thread_unlock(tp);
720 return (ret);
721 }
722
723 /* move the thread */
724 if (oldpp != newpp) {
725 /*
726 * Make the thread switch to the new partition.
727 */
728 tp->t_cpupart = newpp;
729 ASSERT(tp->t_lpl != NULL);
730 /*
731 * Leave the thread on the same lgroup if possible; otherwise
732 * choose a new lgroup for it. In either case, update its
733 * t_lpl.
734 */
735 if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) &&
736 tp->t_lgrp_affinity == NULL) {
737 /*
738 * The thread's lgroup has CPUs in the thread's new
739 * partition, so the thread can stay assigned to the
740 * same lgroup. Update its t_lpl to point to the
741 * lpl_t for its lgroup in its new partition.
742 */
743 lgrp_move_thread(tp, &tp->t_cpupart->\
744 cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
745 } else {
746 /*
747 * The thread's lgroup has no cpus in its new
748 * partition or it has specified lgroup affinities,
749 * so choose the best lgroup for the thread and
750 * assign it to that lgroup.
751 */
752 lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart),
753 1);
754 }
755 /*
756 * make sure lpl points to our own partition
757 */
758 ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) &&
759 (tp->t_lpl < tp->t_cpupart->cp_lgrploads +
760 tp->t_cpupart->cp_nlgrploads));
761
762 ASSERT(tp->t_lpl->lpl_ncpu > 0);
763
764 if (tp->t_state == TS_ONPROC) {
765 cpu_surrender(tp);
766 } else if (tp->t_state == TS_RUN) {
767 (void) dispdeq(tp);
768 setbackdq(tp);
769 }
770 }
771
772 /*
773 * Our binding has changed; set TP_CHANGEBIND.
774 */
775 tp->t_proc_flag |= TP_CHANGEBIND;
776 aston(tp);
777
778 thread_unlock(tp);
779 fss_changepset(tp, newpp, projbuf, zonebuf);
780
781 return (0); /* success */
782 }
783
784
785 /*
786 * This function binds a thread to a partition. Must be called with the
787 * p_lock of the containing process held (to keep the thread from going
788 * away), and thus also with cpu_lock held (since cpu_lock must be
789 * acquired before p_lock). If ignore is non-zero, then CPU bindings
790 * should be ignored (this is used when destroying a partition).
791 */
792 int
cpupart_bind_thread(kthread_id_t tp,psetid_t psid,int ignore,void * projbuf,void * zonebuf)793 cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf,
794 void *zonebuf)
795 {
796 cpupart_t *newpp;
797
798 ASSERT(pool_lock_held());
799 ASSERT(MUTEX_HELD(&cpu_lock));
800 ASSERT(MUTEX_HELD(&pidlock));
801 ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock));
802
803 if (psid == PS_NONE)
804 newpp = &cp_default;
805 else {
806 newpp = cpupart_find(psid);
807 if (newpp == NULL) {
808 return (EINVAL);
809 }
810 }
811 return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf));
812 }
813
814
815 /*
816 * Create a new partition. On MP systems, this also allocates a
817 * kpreempt disp queue for that partition.
818 */
819 int
cpupart_create(psetid_t * psid)820 cpupart_create(psetid_t *psid)
821 {
822 cpupart_t *pp;
823
824 ASSERT(pool_lock_held());
825
826 pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP);
827
828 mutex_enter(&cpu_lock);
829 if (cp_numparts == cp_max_numparts) {
830 mutex_exit(&cpu_lock);
831 kmem_free(pp, sizeof (cpupart_t));
832 return (ENOMEM);
833 }
834 cp_numparts++;
835 /* find the next free partition ID */
836 while (cpupart_find(CPTOPS(cp_id_next)) != NULL)
837 cp_id_next++;
838 pp->cp_id = cp_id_next++;
839 pp->cp_ncpus = 0;
840 pp->cp_cpulist = NULL;
841 pp->cp_attr = 0;
842 klgrpset_clear(pp->cp_lgrpset);
843 pp->cp_kp_queue.disp_maxrunpri = -1;
844 pp->cp_kp_queue.disp_max_unbound_pri = -1;
845 pp->cp_kp_queue.disp_cpu = NULL;
846 pp->cp_gen = 0;
847 DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock);
848 *psid = CPTOPS(pp->cp_id);
849 disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris);
850 cpupart_kstat_create(pp);
851 cpupart_lpl_initialize(pp);
852
853 bitset_init(&pp->cp_cmt_pgs);
854
855 /*
856 * Initialize and size the partition's bitset of halted CPUs.
857 */
858 bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout);
859 bitset_resize(&pp->cp_haltset, max_ncpus);
860
861 /*
862 * Pause all CPUs while changing the partition list, to make sure
863 * the clock thread (which traverses the list without holding
864 * cpu_lock) isn't running.
865 */
866 pause_cpus(NULL, NULL);
867 pp->cp_next = cp_list_head;
868 pp->cp_prev = cp_list_head->cp_prev;
869 cp_list_head->cp_prev->cp_next = pp;
870 cp_list_head->cp_prev = pp;
871 start_cpus();
872 mutex_exit(&cpu_lock);
873
874 return (0);
875 }
876
877 /*
878 * Move threads from specified partition to cp_default. If `force' is specified,
879 * move all threads, otherwise move only soft-bound threads.
880 */
881 static int
cpupart_unbind_threads(cpupart_t * pp,boolean_t unbind_all)882 cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all)
883 {
884 void *projbuf, *zonebuf;
885 kthread_t *t;
886 proc_t *p;
887 int err = 0;
888 psetid_t psid;
889
890 ASSERT(pool_lock_held());
891 ASSERT(MUTEX_HELD(&cpu_lock));
892
893 if (pp == NULL || pp == &cp_default) {
894 return (EINVAL);
895 }
896 psid = pp->cp_id;
897
898 /*
899 * Pre-allocate enough buffers for FSS for all active projects and
900 * for all active zones on the system. Unused buffers will be
901 * freed later by fss_freebuf().
902 */
903 projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ);
904 zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE);
905
906 mutex_enter(&pidlock);
907 t = curthread;
908 do {
909 if (t->t_bind_pset == psid) {
910 again: p = ttoproc(t);
911 mutex_enter(&p->p_lock);
912 if (ttoproc(t) != p) {
913 /*
914 * lwp_exit has changed this thread's process
915 * pointer before we grabbed its p_lock.
916 */
917 mutex_exit(&p->p_lock);
918 goto again;
919 }
920
921 /*
922 * Can only unbind threads which have revocable binding
923 * unless force unbinding requested.
924 */
925 if (unbind_all || TB_PSET_IS_SOFT(t)) {
926 err = cpupart_bind_thread(t, PS_NONE, 1,
927 projbuf, zonebuf);
928 if (err) {
929 mutex_exit(&p->p_lock);
930 mutex_exit(&pidlock);
931 fss_freebuf(projbuf, FSS_ALLOC_PROJ);
932 fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
933 return (err);
934 }
935 t->t_bind_pset = PS_NONE;
936 }
937 mutex_exit(&p->p_lock);
938 }
939 t = t->t_next;
940 } while (t != curthread);
941
942 mutex_exit(&pidlock);
943 fss_freebuf(projbuf, FSS_ALLOC_PROJ);
944 fss_freebuf(zonebuf, FSS_ALLOC_ZONE);
945 return (err);
946 }
947
948 /*
949 * Destroy a partition.
950 */
951 int
cpupart_destroy(psetid_t psid)952 cpupart_destroy(psetid_t psid)
953 {
954 cpu_t *cp, *first_cp;
955 cpupart_t *pp, *newpp;
956 int err = 0;
957
958 ASSERT(pool_lock_held());
959 mutex_enter(&cpu_lock);
960
961 pp = cpupart_find(psid);
962 if (pp == NULL || pp == &cp_default) {
963 mutex_exit(&cpu_lock);
964 return (EINVAL);
965 }
966
967 /*
968 * Unbind all the threads currently bound to the partition.
969 */
970 err = cpupart_unbind_threads(pp, B_TRUE);
971 if (err) {
972 mutex_exit(&cpu_lock);
973 return (err);
974 }
975
976 newpp = &cp_default;
977 while ((cp = pp->cp_cpulist) != NULL) {
978 if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
979 mutex_exit(&cpu_lock);
980 return (err);
981 }
982 }
983
984 ASSERT(bitset_is_null(&pp->cp_cmt_pgs));
985 ASSERT(bitset_is_null(&pp->cp_haltset));
986
987 /*
988 * Teardown the partition's group of active CMT PGs and halted
989 * CPUs now that they have all left.
990 */
991 bitset_fini(&pp->cp_cmt_pgs);
992 bitset_fini(&pp->cp_haltset);
993
994 /*
995 * Reset the pointers in any offline processors so they won't
996 * try to rejoin the destroyed partition when they're turned
997 * online.
998 */
999 first_cp = cp = CPU;
1000 do {
1001 if (cp->cpu_part == pp) {
1002 ASSERT(cp->cpu_flags & CPU_OFFLINE);
1003 cp->cpu_part = newpp;
1004 }
1005 cp = cp->cpu_next;
1006 } while (cp != first_cp);
1007
1008 /*
1009 * Pause all CPUs while changing the partition list, to make sure
1010 * the clock thread (which traverses the list without holding
1011 * cpu_lock) isn't running.
1012 */
1013 pause_cpus(NULL, NULL);
1014 pp->cp_prev->cp_next = pp->cp_next;
1015 pp->cp_next->cp_prev = pp->cp_prev;
1016 if (cp_list_head == pp)
1017 cp_list_head = pp->cp_next;
1018 start_cpus();
1019
1020 if (cp_id_next > pp->cp_id)
1021 cp_id_next = pp->cp_id;
1022
1023 if (pp->cp_kstat)
1024 kstat_delete(pp->cp_kstat);
1025
1026 cp_numparts--;
1027
1028 disp_kp_free(&pp->cp_kp_queue);
1029
1030 cpupart_lpl_teardown(pp);
1031
1032 kmem_free(pp, sizeof (cpupart_t));
1033 mutex_exit(&cpu_lock);
1034
1035 return (err);
1036 }
1037
1038
1039 /*
1040 * Return the ID of the partition to which the specified processor belongs.
1041 */
1042 psetid_t
cpupart_query_cpu(cpu_t * cp)1043 cpupart_query_cpu(cpu_t *cp)
1044 {
1045 ASSERT(MUTEX_HELD(&cpu_lock));
1046
1047 return (CPTOPS(cp->cpu_part->cp_id));
1048 }
1049
1050
1051 /*
1052 * Attach a processor to an existing partition.
1053 */
1054 int
cpupart_attach_cpu(psetid_t psid,cpu_t * cp,int forced)1055 cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced)
1056 {
1057 cpupart_t *pp;
1058 int err;
1059
1060 ASSERT(pool_lock_held());
1061 ASSERT(MUTEX_HELD(&cpu_lock));
1062
1063 pp = cpupart_find(psid);
1064 if (pp == NULL)
1065 return (EINVAL);
1066 if (cp->cpu_flags & CPU_OFFLINE)
1067 return (EINVAL);
1068
1069 err = cpupart_move_cpu(cp, pp, forced);
1070 return (err);
1071 }
1072
1073 /*
1074 * Get a list of cpus belonging to the partition. If numcpus is NULL,
1075 * this just checks for a valid partition. If numcpus is non-NULL but
1076 * cpulist is NULL, the current number of cpus is stored in *numcpus.
1077 * If both are non-NULL, the current number of cpus is stored in *numcpus,
1078 * and a list of those cpus up to the size originally in *numcpus is
1079 * stored in cpulist[]. Also, store the processor set id in *psid.
1080 * This is useful in case the processor set id passed in was PS_MYID.
1081 */
1082 int
cpupart_get_cpus(psetid_t * psid,processorid_t * cpulist,uint_t * numcpus)1083 cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus)
1084 {
1085 cpupart_t *pp;
1086 uint_t ncpus;
1087 cpu_t *c;
1088 int i;
1089
1090 mutex_enter(&cpu_lock);
1091 pp = cpupart_find(*psid);
1092 if (pp == NULL) {
1093 mutex_exit(&cpu_lock);
1094 return (EINVAL);
1095 }
1096 *psid = CPTOPS(pp->cp_id);
1097 ncpus = pp->cp_ncpus;
1098 if (numcpus) {
1099 if (ncpus > *numcpus) {
1100 /*
1101 * Only copy as many cpus as were passed in, but
1102 * pass back the real number.
1103 */
1104 uint_t t = ncpus;
1105 ncpus = *numcpus;
1106 *numcpus = t;
1107 } else
1108 *numcpus = ncpus;
1109
1110 if (cpulist) {
1111 c = pp->cp_cpulist;
1112 for (i = 0; i < ncpus; i++) {
1113 ASSERT(c != NULL);
1114 cpulist[i] = c->cpu_id;
1115 c = c->cpu_next_part;
1116 }
1117 }
1118 }
1119 mutex_exit(&cpu_lock);
1120 return (0);
1121 }
1122
1123 /*
1124 * Reallocate kpreempt queues for each CPU partition. Called from
1125 * disp_setup when a new scheduling class is loaded that increases the
1126 * number of priorities in the system.
1127 */
1128 void
cpupart_kpqalloc(pri_t npri)1129 cpupart_kpqalloc(pri_t npri)
1130 {
1131 cpupart_t *cpp;
1132
1133 ASSERT(MUTEX_HELD(&cpu_lock));
1134 cpp = cp_list_head;
1135 do {
1136 disp_kp_alloc(&cpp->cp_kp_queue, npri);
1137 cpp = cpp->cp_next;
1138 } while (cpp != cp_list_head);
1139 }
1140
1141 int
cpupart_get_loadavg(psetid_t psid,int * buf,int nelem)1142 cpupart_get_loadavg(psetid_t psid, int *buf, int nelem)
1143 {
1144 cpupart_t *cp;
1145 int i;
1146
1147 ASSERT(nelem >= 0);
1148 ASSERT(nelem <= LOADAVG_NSTATS);
1149 ASSERT(MUTEX_HELD(&cpu_lock));
1150
1151 cp = cpupart_find(psid);
1152 if (cp == NULL)
1153 return (EINVAL);
1154 for (i = 0; i < nelem; i++)
1155 buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT);
1156
1157 return (0);
1158 }
1159
1160
1161 uint_t
cpupart_list(psetid_t * list,uint_t nelem,int flag)1162 cpupart_list(psetid_t *list, uint_t nelem, int flag)
1163 {
1164 uint_t numpart = 0;
1165 cpupart_t *cp;
1166
1167 ASSERT(MUTEX_HELD(&cpu_lock));
1168 ASSERT(flag == CP_ALL || flag == CP_NONEMPTY);
1169
1170 if (list != NULL) {
1171 cp = cp_list_head;
1172 do {
1173 if (((flag == CP_ALL) && (cp != &cp_default)) ||
1174 ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) {
1175 if (numpart == nelem)
1176 break;
1177 list[numpart++] = CPTOPS(cp->cp_id);
1178 }
1179 cp = cp->cp_next;
1180 } while (cp != cp_list_head);
1181 }
1182
1183 ASSERT(numpart < cp_numparts);
1184
1185 if (flag == CP_ALL)
1186 numpart = cp_numparts - 1; /* leave out default partition */
1187 else if (flag == CP_NONEMPTY)
1188 numpart = cp_numparts_nonempty;
1189
1190 return (numpart);
1191 }
1192
1193 int
cpupart_setattr(psetid_t psid,uint_t attr)1194 cpupart_setattr(psetid_t psid, uint_t attr)
1195 {
1196 cpupart_t *cp;
1197
1198 ASSERT(pool_lock_held());
1199
1200 mutex_enter(&cpu_lock);
1201 if ((cp = cpupart_find(psid)) == NULL) {
1202 mutex_exit(&cpu_lock);
1203 return (EINVAL);
1204 }
1205 /*
1206 * PSET_NOESCAPE attribute for default cpu partition is always set
1207 */
1208 if (cp == &cp_default && !(attr & PSET_NOESCAPE)) {
1209 mutex_exit(&cpu_lock);
1210 return (EINVAL);
1211 }
1212 cp->cp_attr = attr;
1213 mutex_exit(&cpu_lock);
1214 return (0);
1215 }
1216
1217 int
cpupart_getattr(psetid_t psid,uint_t * attrp)1218 cpupart_getattr(psetid_t psid, uint_t *attrp)
1219 {
1220 cpupart_t *cp;
1221
1222 mutex_enter(&cpu_lock);
1223 if ((cp = cpupart_find(psid)) == NULL) {
1224 mutex_exit(&cpu_lock);
1225 return (EINVAL);
1226 }
1227 *attrp = cp->cp_attr;
1228 mutex_exit(&cpu_lock);
1229 return (0);
1230 }
1231