1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/systm.h>
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/thread.h>
29 #include <sys/cpuvar.h>
30 #include <sys/cpupart.h>
31 #include <sys/kmem.h>
32 #include <sys/cmn_err.h>
33 #include <sys/kstat.h>
34 #include <sys/processor.h>
35 #include <sys/disp.h>
36 #include <sys/group.h>
37 #include <sys/pghw.h>
38 #include <sys/bitset.h>
39 #include <sys/lgrp.h>
40 #include <sys/cmt.h>
41 #include <sys/cpu_pm.h>
42
43 /*
44 * CMT scheduler / dispatcher support
45 *
46 * This file implements CMT scheduler support using Processor Groups.
47 * The CMT processor group class creates and maintains the CMT class
48 * specific processor group pg_cmt_t.
49 *
50 * ---------------------------- <-- pg_cmt_t *
51 * | pghw_t |
52 * ----------------------------
53 * | CMT class specific data |
54 * | - hierarchy linkage |
55 * | - CMT load balancing data|
56 * | - active CPU group/bitset|
57 * ----------------------------
58 *
59 * The scheduler/dispatcher leverages knowledge of the performance
60 * relevant CMT sharing relationships existing between cpus to implement
61 * optimized affinity, load balancing, and coalescence policies.
62 *
63 * Load balancing policy seeks to improve performance by minimizing
64 * contention over shared processor resources / facilities, Affinity
65 * policies seek to improve cache and TLB utilization. Coalescence
66 * policies improve resource utilization and ultimately power efficiency.
67 *
68 * The CMT PGs created by this class are already arranged into a
69 * hierarchy (which is done in the pghw layer). To implement the top-down
70 * CMT load balancing algorithm, the CMT PGs additionally maintain
71 * parent, child and sibling hierarchy relationships.
72 * Parent PGs always contain a superset of their children(s) resources,
73 * each PG can have at most one parent, and siblings are the group of PGs
74 * sharing the same parent.
75 *
76 * On UMA based systems, the CMT load balancing algorithm begins by balancing
77 * load across the group of top level PGs in the system hierarchy.
78 * On NUMA systems, the CMT load balancing algorithm balances load across the
79 * group of top level PGs in each leaf lgroup...but for root homed threads,
80 * is willing to balance against all the top level PGs in the system.
81 *
82 * Groups of top level PGs are maintained to implement the above, one for each
83 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
84 * root lgroup) that contains all the top level PGs in the system.
85 */
static cmt_lgrp_t	*cmt_lgrps = NULL;	/* cmt_lgrps list head */
static cmt_lgrp_t	*cpu0_lgrp = NULL;	/* boot CPU's initial lgrp */
						/* used for null_proc_lpa */
cmt_lgrp_t		*cmt_root = NULL;	/* Reference to root cmt pg */

/* Cleared after the boot CPU has been initialized (see pg_cmt_cpu_init()) */
static int		is_cpu0 = 1;	/* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted.
 * CMT scheduling optimizations won't be performed for blacklisted sharing
 * relationships.
 * Blacklisted PGs are still created, but receive CMT_NO_POLICY (see
 * pg_cmt_cpu_init()).
 */
static int		cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling
 * This must be done via kmdb -d, as /etc/system will be too late
 * Every class callback below checks this flag and returns early when set.
 */
int cmt_sched_disabled = 0;
105
/*
 * Status codes for CMT lineage validation
 * See pg_cmt_lineage_validate() below
 */
typedef enum cmt_lineage_validation {
	CMT_LINEAGE_VALID,		/* lineage is well formed */
	CMT_LINEAGE_NON_CONCENTRIC,	/* PGs in lineage aren't nested */
	CMT_LINEAGE_PG_SPANS_LGRPS,	/* a PG crosses lgroup boundaries */
	CMT_LINEAGE_NON_PROMOTABLE,	/* repair via promotion not possible */
	CMT_LINEAGE_REPAIRED,		/* a problem was found and fixed */
	CMT_LINEAGE_UNRECOVERABLE	/* lineage unusable; CMT disabled */
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 */
cmt_lineage_validation_t	cmt_lineage_status = CMT_LINEAGE_VALID;
124
/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs.
 */
#define	PG_CMT_HW_SUSPECT(hw)	PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class
 */
#define	IS_CMT_PG(pg)	(((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/* Class id assigned by pg_class_register() in pg_cmt_class_init() */
static pg_cid_t pg_cmt_class_id;		/* PG class id */
137
/*
 * Forward declarations: PG class callbacks (wired into pg_ops_cmt below),
 * CMT event callbacks, and internal helper routines.
 */
static pg_t		*pg_cmt_alloc();
static void		pg_cmt_free(pg_t *);
static void		pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void		pg_cmt_cpu_active(cpu_t *);
static void		pg_cmt_cpu_inactive(cpu_t *);
static void		pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void		pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char		*pg_cmt_policy_name(pg_t *);
static void		pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t		*pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int		pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int		pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t	*pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t	*pg_cmt_lgrp_create(lgrp_handle_t);
static void		cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
			    kthread_t *, kthread_t *);
static void		cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t	pg_cmt_lineage_validate(pg_cmt_t **, int *,
			    cpu_pg_t *);
160
/*
 * CMT PG ops
 *
 * Class callback table handed to the PG framework via pg_class_register().
 * The cpupart_out slot is intentionally unimplemented for this class.
 */
struct pg_ops pg_ops_cmt = {
	pg_cmt_alloc,		/* alloc */
	pg_cmt_free,		/* free */
	pg_cmt_cpu_init,	/* cpu_init */
	pg_cmt_cpu_fini,	/* cpu_fini */
	pg_cmt_cpu_active,	/* cpu_active */
	pg_cmt_cpu_inactive,	/* cpu_inactive */
	pg_cmt_cpupart_in,	/* cpupart_in */
	NULL,			/* cpupart_out */
	pg_cmt_cpupart_move,	/* cpupart_move */
	pg_cmt_cpu_belongs,	/* cpu_belongs */
	pg_cmt_policy_name,	/* policy_name */
};
177
178 /*
179 * Initialize the CMT PG class
180 */
181 void
pg_cmt_class_init(void)182 pg_cmt_class_init(void)
183 {
184 if (cmt_sched_disabled)
185 return;
186
187 pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt, PGR_PHYSICAL);
188 }
189
/*
 * Called to indicate a new CPU has started up so
 * that either t0 or the slave startup thread can
 * be accounted for.
 *
 * Synthesizes a context-switch event from the CPU's idle thread to its
 * current thread so the PG event accounting starts out consistent.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
	pg_ev_thread_swtch(cp, gethrtime_unscaled(), cp->cpu_idle_thread,
	    cp->cpu_thread);
}
201
202 /*
203 * Return non-zero if thread can migrate between "from" and "to"
204 * without a performance penalty
205 */
206 int
pg_cmt_can_migrate(cpu_t * from,cpu_t * to)207 pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
208 {
209 if (from->cpu_physid->cpu_cacheid ==
210 to->cpu_physid->cpu_cacheid)
211 return (1);
212 return (0);
213 }
214
/*
 * CMT class specific PG allocation
 *
 * KM_NOSLEEP: may return NULL under memory pressure; the PG framework
 * caller is responsible for handling allocation failure.
 */
static pg_t *
pg_cmt_alloc(void)
{
	return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}
223
/*
 * Class specific PG de-allocation
 *
 * Frees a pg_cmt_t previously allocated by pg_cmt_alloc(). The PG must
 * belong to the CMT class (checked under DEBUG).
 */
static void
pg_cmt_free(pg_t *pg)
{
	ASSERT(pg != NULL);
	ASSERT(IS_CMT_PG(pg));

	kmem_free((pg_cmt_t *)pg, sizeof (pg_cmt_t));
}
235
236 /*
237 * Given a hardware sharing relationship, return which dispatcher
238 * policies should be implemented to optimize performance and efficiency
239 */
240 static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)241 pg_cmt_policy(pghw_type_t hw)
242 {
243 pg_cmt_policy_t p;
244
245 /*
246 * Give the platform a chance to override the default
247 */
248 if ((p = pg_plat_cmt_policy(hw)) != CMT_NO_POLICY)
249 return (p);
250
251 switch (hw) {
252 case PGHW_IPIPE:
253 case PGHW_FPU:
254 case PGHW_PROCNODE:
255 case PGHW_CHIP:
256 return (CMT_BALANCE);
257 case PGHW_CACHE:
258 return (CMT_AFFINITY | CMT_BALANCE);
259 case PGHW_POW_ACTIVE:
260 case PGHW_POW_IDLE:
261 return (CMT_BALANCE);
262 default:
263 return (CMT_NO_POLICY);
264 }
265 }
266
267 /*
268 * Rank the importance of optimizing for the pg1 relationship vs.
269 * the pg2 relationship.
270 */
271 static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t * pg1,pg_cmt_t * pg2)272 pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
273 {
274 pghw_type_t hw1 = ((pghw_t *)pg1)->pghw_hw;
275 pghw_type_t hw2 = ((pghw_t *)pg2)->pghw_hw;
276
277 /*
278 * A power domain is only important if CPUPM is enabled.
279 */
280 if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
281 if (PGHW_IS_PM_DOMAIN(hw1) && !PGHW_IS_PM_DOMAIN(hw2))
282 return (pg2);
283 if (PGHW_IS_PM_DOMAIN(hw2) && !PGHW_IS_PM_DOMAIN(hw1))
284 return (pg1);
285 }
286
287 /*
288 * Otherwise, ask the platform
289 */
290 if (pg_plat_hw_rank(hw1, hw2) == hw1)
291 return (pg1);
292 else
293 return (pg2);
294 }
295
296 /*
297 * Initialize CMT callbacks for the given PG
298 */
299 static void
cmt_callback_init(pg_t * pg)300 cmt_callback_init(pg_t *pg)
301 {
302 /*
303 * Stick with the default callbacks if there isn't going to be
304 * any CMT thread placement optimizations implemented.
305 */
306 if (((pg_cmt_t *)pg)->cmt_policy == CMT_NO_POLICY)
307 return;
308
309 switch (((pghw_t *)pg)->pghw_hw) {
310 case PGHW_POW_ACTIVE:
311 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
312 pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
313 break;
314 default:
315 pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
316
317 }
318 }
319
/*
 * Promote PG above it's current parent.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
	pg_cmt_t	*parent;
	group_t		*children;
	cpu_t		*cpu;
	group_iter_t	iter;
	pg_cpu_itr_t	cpu_iter;
	int		r;
	int		err;
	int		nchildren;

	ASSERT(MUTEX_HELD(&cpu_lock));

	parent = pg->cmt_parent;
	if (parent == NULL) {
		/*
		 * Nothing to do
		 */
		return;
	}

	ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

	/*
	 * We're changing around the hierarchy, which is actively traversed
	 * by the dispatcher. Pause CPUS to ensure exclusivity.
	 */
	pause_cpus(NULL, NULL);

	/*
	 * If necessary, update the parent's sibling set, replacing parent
	 * with PG.
	 */
	if (parent->cmt_siblings) {
		if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * If the parent is at the top of the hierarchy, replace it's entry
	 * in the root lgroup's group of top level PGs.
	 */
	if (parent->cmt_parent == NULL &&
	    parent->cmt_siblings != &cmt_root->cl_pgs) {
		if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
		    != -1) {
			r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
			ASSERT(r != -1);
		}
	}

	/*
	 * We assume (and therefore assert) that the PG being promoted is an
	 * only child of it's parent. Update the parent's children set
	 * replacing PG's entry with the parent (since the parent is becoming
	 * the child). Then have PG and the parent swap children sets and
	 * children counts.
	 */
	ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
	if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
		r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
		ASSERT(r != -1);
	}

	/* Swap the children groups and counts between PG and parent */
	children = pg->cmt_children;
	pg->cmt_children = parent->cmt_children;
	parent->cmt_children = children;

	nchildren = pg->cmt_nchildren;
	pg->cmt_nchildren = parent->cmt_nchildren;
	parent->cmt_nchildren = nchildren;

	/*
	 * Update the sibling references for PG and it's parent
	 */
	pg->cmt_siblings = parent->cmt_siblings;
	parent->cmt_siblings = pg->cmt_children;

	/*
	 * Update any cached lineages in the per CPU pg data.
	 */
	PG_CPU_ITR_INIT(pg, cpu_iter);
	while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
		int		idx;
		int		sz;
		pg_cmt_t	*cpu_pg;
		cpu_pg_t	*pgd;	/* CPU's PG data */

		/*
		 * The CPU's whose lineage is under construction still
		 * references the bootstrap CPU PG data structure.
		 */
		if (pg_cpu_is_bootstrapped(cpu))
			pgd = pgdata;
		else
			pgd = cpu->cpu_pg;

		/*
		 * Iterate over the CPU's PGs updating the children
		 * of the PG being promoted, since they have a new parent.
		 */
		group_iter_init(&iter);
		while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
			if (cpu_pg->cmt_parent == pg) {
				cpu_pg->cmt_parent = parent;
			}
		}

		/*
		 * Update the CMT load balancing lineage
		 */
		if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
			/*
			 * Unless this is the CPU who's lineage is being
			 * constructed, the PG being promoted should be
			 * in the lineage.
			 */
			ASSERT(pg_cpu_is_bootstrapped(cpu));
			continue;
		}

		/* The parent must sit immediately above PG in the lineage */
		ASSERT(idx > 0);
		ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

		/*
		 * Have the child and the parent swap places in the CPU's
		 * lineage
		 */
		group_remove_at(&pgd->cmt_pgs, idx);
		group_remove_at(&pgd->cmt_pgs, idx - 1);
		err = group_add_at(&pgd->cmt_pgs, parent, idx);
		ASSERT(err == 0);
		err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
		ASSERT(err == 0);

		/*
		 * Ensure cmt_lineage references CPU's leaf PG.
		 * Since cmt_pgs is top-down ordered, the bottom is the last
		 * element.
		 */
		if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
			pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
	}

	/*
	 * Update the parent references for PG and it's parent
	 */
	pg->cmt_parent = parent->cmt_parent;
	parent->cmt_parent = pg;

	start_cpus();
}
486
/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
 * references a "bootstrap" structure. pg_cmt_cpu_init() and the routines it
 * calls must be careful to operate only on the "pgdata" argument, and not
 * cp->cpu_pg.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
	pg_cmt_t	*pg;
	group_t		*cmt_pgs;
	int		levels, level;
	pghw_type_t	hw;
	pg_t		*pg_cache = NULL;
	pg_cmt_t	*cpu_cmt_hier[PGHW_NUM_COMPONENTS];
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;
	cmt_lineage_validation_t	lineage_status;

	ASSERT(MUTEX_HELD(&cpu_lock));
	ASSERT(pg_cpu_is_bootstrapped(cp));

	if (cmt_sched_disabled)
		return;

	/*
	 * A new CPU is coming into the system.
	 * Interrogate the platform to see if the CPU
	 * has any performance or efficiency relevant
	 * sharing relationships
	 */
	cmt_pgs = &pgdata->cmt_pgs;
	pgdata->cmt_lineage = NULL;

	/*
	 * cpu_cmt_hier is zero-filled so that unused slots are NULL;
	 * "levels" counts the PGs that participate in load balancing.
	 */
	bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
	levels = 0;
	for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

		pg_cmt_policy_t	policy;

		/*
		 * We're only interested in the hw sharing relationships
		 * for which we know how to optimize.
		 */
		policy = pg_cmt_policy(hw);
		if (policy == CMT_NO_POLICY ||
		    pg_plat_hw_shared(cp, hw) == 0)
			continue;

		/*
		 * We will still create the PGs for hardware sharing
		 * relationships that have been blacklisted, but won't
		 * implement CMT thread placement optimizations against them.
		 */
		if (cmt_hw_blacklisted[hw] == 1)
			policy = CMT_NO_POLICY;

		/*
		 * Find (or create) the PG associated with
		 * the hw sharing relationship in which cp
		 * belongs.
		 *
		 * Determine if a suitable PG already
		 * exists, or if one needs to be created.
		 */
		pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
		if (pg == NULL) {
			/*
			 * Create a new one.
			 * Initialize the common...
			 */
			pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

			/* ... physical ... */
			pghw_init((pghw_t *)pg, cp, hw);

			/*
			 * ... and CMT specific portions of the
			 * structure.
			 */
			pg->cmt_policy = policy;

			/* CMT event callbacks */
			cmt_callback_init((pg_t *)pg);

			bitset_init(&pg->cmt_cpus_actv_set);
			group_create(&pg->cmt_cpus_actv);
		} else {
			ASSERT(IS_CMT_PG(pg));
		}

		/* Topology is changing; invalidate cached traversals */
		((pghw_t *)pg)->pghw_generation++;

		/* Add the CPU to the PG */
		pg_cpu_add((pg_t *)pg, cp, pgdata);

		/*
		 * Ensure capacity of the active CPU group/bitset
		 * (pre-sized here so the paused-CPU context in
		 * pg_cmt_cpu_active() can add with GRP_NORESIZE)
		 */
		group_expand(&pg->cmt_cpus_actv,
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		if (cp->cpu_seqid >=
		    bitset_capacity(&pg->cmt_cpus_actv_set)) {
			bitset_resize(&pg->cmt_cpus_actv_set,
			    cp->cpu_seqid + 1);
		}

		/*
		 * Build a lineage of CMT PGs for load balancing / coalescence
		 */
		if (policy & (CMT_BALANCE | CMT_COALESCE)) {
			cpu_cmt_hier[levels++] = pg;
		}

		/* Cache this for later */
		if (hw == PGHW_CACHE)
			pg_cache = (pg_t *)pg;
	}

	group_expand(cmt_pgs, levels);

	if (cmt_root == NULL)
		cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

	/*
	 * Find the lgrp that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
	if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
		lgrp = pg_cmt_lgrp_create(lgrp_handle);

	/*
	 * Ascendingly sort the PGs in the lineage by number of CPUs
	 */
	pg_cmt_hier_sort(cpu_cmt_hier, levels);

	/*
	 * Examine the lineage and validate it.
	 * This routine will also try to fix the lineage along with the
	 * rest of the PG hierarchy should it detect an issue.
	 *
	 * If it returns anything other than VALID or REPAIRED, an
	 * unrecoverable error has occurred, and we cannot proceed.
	 */
	lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
	if ((lineage_status != CMT_LINEAGE_VALID) &&
	    (lineage_status != CMT_LINEAGE_REPAIRED)) {
		/*
		 * In the case of an unrecoverable error where CMT scheduling
		 * has been disabled, assert that the under construction CPU's
		 * PG data has an empty CMT load balancing lineage.
		 */
		ASSERT((cmt_sched_disabled == 0) ||
		    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
		return;
	}

	/*
	 * For existing PGs in the lineage, verify that the parent is
	 * correct, as the generation in the lineage may have changed
	 * as a result of the sorting. Start the traversal at the top
	 * of the lineage, moving down.
	 *
	 * At the top level, cpu_cmt_hier[level + 1] is the NULL slot
	 * left by the earlier bzero, which terminates the promotion loop
	 * once the PG has no parent.
	 */
	for (level = levels - 1; level >= 0; ) {
		int reorg;

		reorg = 0;
		pg = cpu_cmt_hier[level];

		/*
		 * Promote PGs at an incorrect generation into place.
		 */
		while (pg->cmt_parent &&
		    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
			cmt_hier_promote(pg, pgdata);
			reorg++;
		}
		/* After any reorganization, restart the scan from the top */
		if (reorg > 0)
			level = levels - 1;
		else
			level--;
	}

	/*
	 * For each of the PGs in the CPU's lineage:
	 *	- Add an entry in the CPU sorted CMT PG group
	 *	  which is used for top down CMT load balancing
	 *	- Tie the PG into the CMT hierarchy by connecting
	 *	  it to it's parent and siblings.
	 */
	for (level = 0; level < levels; level++) {
		uint_t		children;
		int		err;

		pg = cpu_cmt_hier[level];
		err = group_add_at(cmt_pgs, pg, levels - level - 1);
		ASSERT(err == 0);

		/* Level 0 is the leaf (smallest) PG in the lineage */
		if (level == 0)
			pgdata->cmt_lineage = (pg_t *)pg;

		if (pg->cmt_siblings != NULL) {
			/* Already initialized */
			ASSERT(pg->cmt_parent == NULL ||
			    pg->cmt_parent == cpu_cmt_hier[level + 1]);
			ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
			    ((pg->cmt_parent != NULL) &&
			    pg->cmt_siblings == pg->cmt_parent->cmt_children));
			continue;
		}

		if ((level + 1) == levels) {
			/* Top level PG: siblings are the lgroup's top PGs */
			pg->cmt_parent = NULL;

			pg->cmt_siblings = &lgrp->cl_pgs;
			children = ++lgrp->cl_npgs;
			if (cmt_root != lgrp)
				cmt_root->cl_npgs++;
		} else {
			pg->cmt_parent = cpu_cmt_hier[level + 1];

			/*
			 * A good parent keeps track of their children.
			 * The parent's children group is also the PG's
			 * siblings.
			 */
			if (pg->cmt_parent->cmt_children == NULL) {
				pg->cmt_parent->cmt_children =
				    kmem_zalloc(sizeof (group_t), KM_SLEEP);
				group_create(pg->cmt_parent->cmt_children);
			}
			pg->cmt_siblings = pg->cmt_parent->cmt_children;
			children = ++pg->cmt_parent->cmt_nchildren;
		}

		group_expand(pg->cmt_siblings, children);
		group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
	}

	/*
	 * Cache the chip and core IDs in the cpu_t->cpu_physid structure
	 * for fast lookups later.
	 */
	if (cp->cpu_physid) {
		cp->cpu_physid->cpu_chipid =
		    pg_plat_hw_instance_id(cp, PGHW_CHIP);
		cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

		/*
		 * If this cpu has a PG representing shared cache, then set
		 * cpu_cacheid to that PG's logical id
		 */
		if (pg_cache)
			cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
	}

	/* CPU0 only initialization */
	if (is_cpu0) {
		is_cpu0 = 0;
		cpu0_lgrp = lgrp;
	}

}
757
/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * cp->cpu_pg is used by the dispatcher to access the CPU's PG data
 * references a "bootstrap" structure across this function's invocation.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs, *cmt_pgs;
	lgrp_handle_t	lgrp_handle;
	cmt_lgrp_t	*lgrp;

	if (cmt_sched_disabled)
		return;

	ASSERT(pg_cpu_is_bootstrapped(cp));

	pgs = &pgdata->pgs;
	cmt_pgs = &pgdata->cmt_pgs;

	/*
	 * Find the lgroup that encapsulates this CPU's CMT hierarchy
	 */
	lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

	lgrp = pg_cmt_find_lgrp(lgrp_handle);
	if (ncpus == 1 && lgrp != cpu0_lgrp) {
		/*
		 * One might wonder how we could be deconfiguring the
		 * only CPU in the system.
		 *
		 * On Starcat systems when null_proc_lpa is detected,
		 * the boot CPU (which is already configured into a leaf
		 * lgroup), is moved into the root lgroup. This is done by
		 * deconfiguring it from both lgroups and processor
		 * groups), and then later reconfiguring it back in. This
		 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
		 *
		 * This special case is detected by noting that the platform
		 * has changed the CPU's lgrp affiliation (since it now
		 * belongs in the root). In this case, use the cmt_lgrp_t
		 * cached for the boot CPU, since this is what needs to be
		 * torn down.
		 */
		lgrp = cpu0_lgrp;
	}

	ASSERT(lgrp != NULL);

	/*
	 * First, clean up anything load balancing specific for each of
	 * the CPU's PGs that participated in CMT load balancing
	 */
	pg = (pg_cmt_t *)pgdata->cmt_lineage;
	while (pg != NULL) {

		/* Topology is changing; invalidate cached traversals */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the PG from the CPU's load balancing lineage
		 */
		(void) group_remove(cmt_pgs, pg, GRP_RESIZE);

		/*
		 * If it's about to become empty, destroy it's children
		 * group, and remove it's reference from it's siblings.
		 * This is done here (rather than below) to avoid removing
		 * our reference from a PG that we just eliminated.
		 */
		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
			if (pg->cmt_children != NULL)
				group_destroy(pg->cmt_children);
			if (pg->cmt_siblings != NULL) {
				if (pg->cmt_siblings == &lgrp->cl_pgs)
					lgrp->cl_npgs--;
				else
					pg->cmt_parent->cmt_nchildren--;
			}
		}
		pg = pg->cmt_parent;
	}
	ASSERT(GROUP_SIZE(cmt_pgs) == 0);

	/*
	 * Now that the load balancing lineage updates have happened,
	 * remove the CPU from all it's PGs (destroying any that become
	 * empty).
	 */
	group_iter_init(&i);
	while ((pg = group_iterate(pgs, &i)) != NULL) {
		if (IS_CMT_PG(pg) == 0)
			continue;

		pg_cpu_delete((pg_t *)pg, cp, pgdata);
		/*
		 * Deleting the CPU from the PG changes the CPU's
		 * PG group over which we are actively iterating
		 * Re-initialize the iteration
		 */
		group_iter_init(&i);

		if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

			/*
			 * The PG has become zero sized, so destroy it.
			 */
			group_destroy(&pg->cmt_cpus_actv);
			bitset_fini(&pg->cmt_cpus_actv_set);
			pghw_fini((pghw_t *)pg);

			pg_destroy((pg_t *)pg);
		}
	}
}
879
880 /*
881 * Class callback when a CPU is entering a cpu partition
882 */
883 static void
pg_cmt_cpupart_in(cpu_t * cp,cpupart_t * pp)884 pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
885 {
886 group_t *pgs;
887 pg_t *pg;
888 group_iter_t i;
889
890 ASSERT(MUTEX_HELD(&cpu_lock));
891
892 if (cmt_sched_disabled)
893 return;
894
895 pgs = &cp->cpu_pg->pgs;
896
897 /*
898 * Ensure that the new partition's PG bitset
899 * is large enough for all CMT PG's to which cp
900 * belongs
901 */
902 group_iter_init(&i);
903 while ((pg = group_iterate(pgs, &i)) != NULL) {
904 if (IS_CMT_PG(pg) == 0)
905 continue;
906
907 if (bitset_capacity(&pp->cp_cmt_pgs) <= pg->pg_id)
908 bitset_resize(&pp->cp_cmt_pgs, pg->pg_id + 1);
909 }
910 }
911
/*
 * Class callback when a CPU is actually moving partitions
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
	cpu_t		*cpp;
	group_t		*pgs;
	pg_t		*pg;
	group_iter_t	pg_iter;
	pg_cpu_itr_t	cpu_iter;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&pg_iter);

	/*
	 * Iterate over the CPUs CMT PGs
	 */
	while ((pg = group_iterate(pgs, &pg_iter)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Add the PG to the bitset in the new partition.
		 */
		bitset_add(&newpp->cp_cmt_pgs, pg->pg_id);

		/*
		 * Remove the PG from the bitset in the old partition
		 * if the last of the PG's CPUs have left.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpp = pg_cpu_next(&cpu_iter)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == oldpp->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		/*
		 * NOTE(review): the deletion below targets
		 * cp->cpu_part->cp_cmt_pgs rather than oldpp->cp_cmt_pgs;
		 * presumably cp->cpu_part still references the old
		 * partition when this callback runs — confirm against the
		 * cpupart move path before changing.
		 */
		if (!found)
			bitset_del(&cp->cpu_part->cp_cmt_pgs, pg->pg_id);
	}
}
965
/*
 * Class callback when a CPU becomes active (online)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
	int		err;
	group_iter_t	i;
	pg_cmt_t	*pg;
	group_t		*pgs;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	/*
	 * Iterate over the CPU's PGs
	 */
	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * GRP_NORESIZE is safe here: capacity was reserved by
		 * group_expand() in pg_cmt_cpu_init().
		 */
		err = group_add(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		/*
		 * If this is the first active CPU in the PG, and it
		 * represents a hardware sharing relationship over which
		 * CMT load balancing is performed, add it as a candidate
		 * for balancing with it's siblings.
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 1 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_add(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			/*
			 * If this is a top level PG, add it as a balancing
			 * candidate when balancing within the root lgroup.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_add(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Notate the CPU in the PGs active CPU bitset.
		 * Also notate the PG as being active in it's associated
		 * partition
		 */
		bitset_add(&pg->cmt_cpus_actv_set, cp->cpu_seqid);
		bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)pg)->pg_id);
	}
}
1035
/*
 * Class callback when a CPU goes inactive (offline)
 *
 * This is called in a context where CPUs are paused
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
	int		err;
	group_t		*pgs;
	pg_cmt_t	*pg;
	cpu_t		*cpp;
	group_iter_t	i;
	pg_cpu_itr_t	cpu_itr;
	boolean_t	found;

	ASSERT(MUTEX_HELD(&cpu_lock));

	if (cmt_sched_disabled)
		return;

	pgs = &cp->cpu_pg->pgs;
	group_iter_init(&i);

	while ((pg = group_iterate(pgs, &i)) != NULL) {

		if (IS_CMT_PG(pg) == 0)
			continue;

		/*
		 * Move to the next generation since topology is changing
		 */
		((pghw_t *)pg)->pghw_generation++;

		/*
		 * Remove the CPU from the CMT PGs active CPU group
		 * bitmap
		 */
		err = group_remove(&pg->cmt_cpus_actv, cp, GRP_NORESIZE);
		ASSERT(err == 0);

		bitset_del(&pg->cmt_cpus_actv_set, cp->cpu_seqid);

		/*
		 * If there are no more active CPUs in this PG over which
		 * load was balanced, remove it as a balancing candidate.
		 * (Mirrors the additions made in pg_cmt_cpu_active().)
		 */
		if (GROUP_SIZE(&pg->cmt_cpus_actv) == 0 &&
		    (pg->cmt_policy & (CMT_BALANCE | CMT_COALESCE))) {
			err = group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
			ASSERT(err == 0);

			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				err = group_remove(&cmt_root->cl_pgs, pg,
				    GRP_NORESIZE);
				ASSERT(err == 0);
			}
		}

		/*
		 * Assert the number of active CPUs does not exceed
		 * the total number of CPUs in the PG
		 */
		ASSERT(GROUP_SIZE(&pg->cmt_cpus_actv) <=
		    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

		/*
		 * Update the PG bitset in the CPU's old partition:
		 * clear the PG's bit if no other active CPU from the
		 * same partition remains in the PG.
		 */
		found = B_FALSE;
		PG_CPU_ITR_INIT(pg, cpu_itr);
		while ((cpp = pg_cpu_next(&cpu_itr)) != NULL) {
			if (cpp == cp)
				continue;
			if (CPU_ACTIVE(cpp) &&
			    cpp->cpu_part->cp_id == cp->cpu_part->cp_id) {
				found = B_TRUE;
				break;
			}
		}
		if (!found) {
			bitset_del(&cp->cpu_part->cp_cmt_pgs,
			    ((pg_t *)pg)->pg_id);
		}
	}
}
1123
1124 /*
1125 * Return non-zero if the CPU belongs in the given PG
1126 */
1127 static int
pg_cmt_cpu_belongs(pg_t * pg,cpu_t * cp)1128 pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
1129 {
1130 cpu_t *pg_cpu;
1131
1132 pg_cpu = GROUP_ACCESS(&pg->pg_cpus, 0);
1133
1134 ASSERT(pg_cpu != NULL);
1135
1136 /*
1137 * The CPU belongs if, given the nature of the hardware sharing
1138 * relationship represented by the PG, the CPU has that
1139 * relationship with some other CPU already in the PG
1140 */
1141 if (pg_plat_cpus_share(cp, pg_cpu, ((pghw_t *)pg)->pghw_hw))
1142 return (1);
1143
1144 return (0);
1145 }
1146
/*
 * Sort the CPUs CMT hierarchy, where "size" is the number of levels.
 *
 * The lineage is ordered by ascending PG CPU count (smallest, most
 * specific grouping first). PGs with equal CPU counts are then ordered
 * by rank, as decided by the platform via pg_cmt_hier_rank().
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
	int	i, j, inc, sz;
	int	start, end;
	pg_t	*tmp;
	pg_t	**h = (pg_t **)hier;

	/*
	 * First sort by number of CPUs
	 *
	 * This is a Shell sort: the gap ("inc") starts at size / 2 and
	 * shrinks by a factor of 5/11 each pass, with a special case
	 * mapping 2 -> 1 so a final insertion-sort pass with inc == 1
	 * is guaranteed to run.
	 */
	inc = size / 2;
	while (inc > 0) {
		for (i = inc; i < size; i++) {
			j = i;
			tmp = h[i];
			while ((j >= inc) &&
			    (PG_NUM_CPUS(h[j - inc]) > PG_NUM_CPUS(tmp))) {
				h[j] = h[j - inc];
				j = j - inc;
			}
			h[j] = tmp;
		}
		if (inc == 2)
			inc = 1;
		else
			inc = (inc * 5) / 11;
	}

	/*
	 * Break ties by asking the platform.
	 * Determine if h[i] outranks h[i + 1] and if so, swap them.
	 */
	for (start = 0; start < size; start++) {

		/*
		 * Find various contiguous sets of elements,
		 * in the array, with the same number of cpus
		 */
		end = start;
		sz = PG_NUM_CPUS(h[start]);
		while ((end < size) && (sz == PG_NUM_CPUS(h[end])))
			end++;
		/*
		 * Sort each such set of the array by rank, using an
		 * insertion sort. pg_cmt_hier_rank() returns whichever
		 * of the two PGs it is passed outranks the other; an
		 * element is shifted right while the element before it
		 * outranks it.
		 */
		for (i = start + 1; i < end; i++) {
			j = i - 1;
			tmp = h[i];
			while (j >= start &&
			    pg_cmt_hier_rank(hier[j],
			    (pg_cmt_t *)tmp) == hier[j]) {
				h[j + 1] = h[j];
				j--;
			}
			h[j + 1] = tmp;
		}
	}
}
1209
1210 /*
1211 * Return a cmt_lgrp_t * given an lgroup handle.
1212 */
1213 static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)1214 pg_cmt_find_lgrp(lgrp_handle_t hand)
1215 {
1216 cmt_lgrp_t *lgrp;
1217
1218 ASSERT(MUTEX_HELD(&cpu_lock));
1219
1220 lgrp = cmt_lgrps;
1221 while (lgrp != NULL) {
1222 if (lgrp->cl_hand == hand)
1223 break;
1224 lgrp = lgrp->cl_next;
1225 }
1226 return (lgrp);
1227 }
1228
1229 /*
1230 * Create a cmt_lgrp_t with the specified handle.
1231 */
1232 static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)1233 pg_cmt_lgrp_create(lgrp_handle_t hand)
1234 {
1235 cmt_lgrp_t *lgrp;
1236
1237 ASSERT(MUTEX_HELD(&cpu_lock));
1238
1239 lgrp = kmem_zalloc(sizeof (cmt_lgrp_t), KM_SLEEP);
1240
1241 lgrp->cl_hand = hand;
1242 lgrp->cl_npgs = 0;
1243 lgrp->cl_next = cmt_lgrps;
1244 cmt_lgrps = lgrp;
1245 group_create(&lgrp->cl_pgs);
1246
1247 return (lgrp);
1248 }
1249
1250 /*
1251 * Interfaces to enable and disable power aware dispatching
1252 * The caller must be holding cpu_lock.
1253 *
1254 * Return 0 on success and -1 on failure.
1255 */
1256 int
cmt_pad_enable(pghw_type_t type)1257 cmt_pad_enable(pghw_type_t type)
1258 {
1259 group_t *hwset;
1260 group_iter_t iter;
1261 pg_cmt_t *pg;
1262
1263 ASSERT(PGHW_IS_PM_DOMAIN(type));
1264 ASSERT(MUTEX_HELD(&cpu_lock));
1265
1266 if (cmt_sched_disabled == 1)
1267 return (-1);
1268
1269 if ((hwset = pghw_set_lookup(type)) == NULL ||
1270 cmt_hw_blacklisted[type]) {
1271 /*
1272 * Unable to find any instances of the specified type
1273 * of power domain, or the power domains have been blacklisted.
1274 */
1275 return (-1);
1276 }
1277
1278 /*
1279 * Iterate over the power domains, setting the default dispatcher
1280 * policy for power/performance optimization.
1281 *
1282 * Simply setting the policy isn't enough in the case where the power
1283 * domain is an only child of another PG. Because the dispatcher walks
1284 * the PG hierarchy in a top down fashion, the higher up PG's policy
1285 * will dominate. So promote the power domain above it's parent if both
1286 * PG and it's parent have the same CPUs to ensure it's policy
1287 * dominates.
1288 */
1289 group_iter_init(&iter);
1290 while ((pg = group_iterate(hwset, &iter)) != NULL) {
1291 /*
1292 * If the power domain is an only child to a parent
1293 * not implementing the same policy, promote the child
1294 * above the parent to activate the policy.
1295 */
1296 pg->cmt_policy = pg_cmt_policy(((pghw_t *)pg)->pghw_hw);
1297 while ((pg->cmt_parent != NULL) &&
1298 (pg->cmt_parent->cmt_policy != pg->cmt_policy) &&
1299 (PG_NUM_CPUS((pg_t *)pg) ==
1300 PG_NUM_CPUS((pg_t *)pg->cmt_parent))) {
1301 cmt_hier_promote(pg, NULL);
1302 }
1303 }
1304
1305 return (0);
1306 }
1307
1308 int
cmt_pad_disable(pghw_type_t type)1309 cmt_pad_disable(pghw_type_t type)
1310 {
1311 group_t *hwset;
1312 group_iter_t iter;
1313 pg_cmt_t *pg;
1314 pg_cmt_t *child;
1315
1316 ASSERT(PGHW_IS_PM_DOMAIN(type));
1317 ASSERT(MUTEX_HELD(&cpu_lock));
1318
1319 if (cmt_sched_disabled == 1)
1320 return (-1);
1321
1322 if ((hwset = pghw_set_lookup(type)) == NULL) {
1323 /*
1324 * Unable to find any instances of the specified type of
1325 * power domain.
1326 */
1327 return (-1);
1328 }
1329 /*
1330 * Iterate over the power domains, setting the default dispatcher
1331 * policy for performance optimization (load balancing).
1332 */
1333 group_iter_init(&iter);
1334 while ((pg = group_iterate(hwset, &iter)) != NULL) {
1335
1336 /*
1337 * If the power domain has an only child that implements
1338 * policy other than load balancing, promote the child
1339 * above the power domain to ensure it's policy dominates.
1340 */
1341 if (pg->cmt_children != NULL &&
1342 GROUP_SIZE(pg->cmt_children) == 1) {
1343 child = GROUP_ACCESS(pg->cmt_children, 0);
1344 if ((child->cmt_policy & CMT_BALANCE) == 0) {
1345 cmt_hier_promote(child, NULL);
1346 }
1347 }
1348 pg->cmt_policy = CMT_BALANCE;
1349 }
1350 return (0);
1351 }
1352
1353 /* ARGSUSED */
1354 static void
cmt_ev_thread_swtch(pg_t * pg,cpu_t * cp,hrtime_t now,kthread_t * old,kthread_t * new)1355 cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
1356 kthread_t *new)
1357 {
1358 pg_cmt_t *cmt_pg = (pg_cmt_t *)pg;
1359
1360 if (old == cp->cpu_idle_thread) {
1361 atomic_inc_32(&cmt_pg->cmt_utilization);
1362 } else if (new == cp->cpu_idle_thread) {
1363 atomic_dec_32(&cmt_pg->cmt_utilization);
1364 }
1365 }
1366
/*
 * Macro to test whether a thread is currently runnable on a CPU in a PG.
 *
 * True when the thread is in the TS_RUN state and is queued on a dispatch
 * queue whose CPU is a member of the PG's active CPU set. Note that both
 * arguments may be evaluated more than once; avoid side effects.
 */
#define	THREAD_RUNNABLE_IN_PG(t, pg)					\
	((t)->t_state == TS_RUN &&					\
	    (t)->t_disp_queue->disp_cpu &&				\
	    bitset_in_set(&(pg)->cmt_cpus_actv_set,			\
	    (t)->t_disp_queue->disp_cpu->cpu_seqid))
1375
/*
 * Thread switch event handler for PGs that implement power aware
 * dispatching (CPU power domains).
 *
 * In addition to tracking the PG's utilization (the count of non-idle
 * CPUs), notify the CPU power manager on the domain's busy <-> idle
 * edge transitions, i.e. when the utilization count moves 0 -> 1 or
 * 1 -> 0.
 */
static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
	pg_cmt_t	*cmt = (pg_cmt_t *)pg;
	cpupm_domain_t	*dom;
	uint32_t	u;

	if (old == cp->cpu_idle_thread) {
		ASSERT(new != cp->cpu_idle_thread);
		/* Atomically bump utilization; u is the new value */
		u = atomic_inc_32_nv(&cmt->cmt_utilization);
		if (u == 1) {
			/*
			 * Notify the CPU power manager that the domain
			 * is non-idle.
			 */
			dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
			cpupm_utilization_event(cp, now, dom,
			    CPUPM_DOM_BUSY_FROM_IDLE);
		}
	} else if (new == cp->cpu_idle_thread) {
		ASSERT(old != cp->cpu_idle_thread);
		u = atomic_dec_32_nv(&cmt->cmt_utilization);
		if (u == 0) {
			/*
			 * The domain is idle, notify the CPU power
			 * manager.
			 *
			 * Avoid notifying if the thread is simply migrating
			 * between CPUs in the domain (i.e. it is still
			 * runnable on a CPU in this PG).
			 */
			if (!THREAD_RUNNABLE_IN_PG(old, cmt)) {
				dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
				cpupm_utilization_event(cp, now, dom,
				    CPUPM_DOM_IDLE_FROM_BUSY);
			}
		}
	}
}
1415
1416 /* ARGSUSED */
1417 static void
cmt_ev_thread_remain_pwr(pg_t * pg,cpu_t * cp,kthread_t * t)1418 cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
1419 {
1420 pg_cmt_t *cmt = (pg_cmt_t *)pg;
1421 cpupm_domain_t *dom;
1422
1423 dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
1424 cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
1425 }
1426
1427 /*
1428 * Return the name of the CMT scheduling policy
1429 * being implemented across this PG
1430 */
1431 static char *
pg_cmt_policy_name(pg_t * pg)1432 pg_cmt_policy_name(pg_t *pg)
1433 {
1434 pg_cmt_policy_t policy;
1435
1436 policy = ((pg_cmt_t *)pg)->cmt_policy;
1437
1438 if (policy & CMT_AFFINITY) {
1439 if (policy & CMT_BALANCE)
1440 return ("Load Balancing & Affinity");
1441 else if (policy & CMT_COALESCE)
1442 return ("Load Coalescence & Affinity");
1443 else
1444 return ("Affinity");
1445 } else {
1446 if (policy & CMT_BALANCE)
1447 return ("Load Balancing");
1448 else if (policy & CMT_COALESCE)
1449 return ("Load Coalescence");
1450 else
1451 return ("None");
1452 }
1453 }
1454
/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * "lineage" / "sz" describe the CPU's PG lineage being validated; the bad
 * PG is removed from it in place and *sz is decremented accordingly.
 *
 * Always returns 0 in the current implementation; callers test for 0 to
 * mean the prune succeeded.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	group_t		*hwset, *children;
	int		i, j, r, size = *sz;
	group_iter_t	hw_iter, child_iter;
	pg_cpu_itr_t	cpu_iter;
	pg_cmt_t	*pg, *child;
	cpu_t		*cpu;
	int		cap_needed;
	pghw_type_t	hw;

	ASSERT(MUTEX_HELD(&cpu_lock));

	/*
	 * Inform pghw layer that this PG is pruned.
	 */
	pghw_cmt_fini((pghw_t *)pg_bad);

	hw = ((pghw_t *)pg_bad)->pghw_hw;

	/* Warn about the loss of CPUPM functionality, where applicable */
	if (hw == PGHW_POW_ACTIVE) {
		cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
		    "Event Based CPUPM Unavailable");
	} else if (hw == PGHW_POW_IDLE) {
		cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
		    "Dispatcher assisted CPUPM disabled.");
	}

	/*
	 * Find and eliminate the PG from the lineage.
	 */
	for (i = 0; i < size; i++) {
		if (lineage[i] == pg_bad) {
			/* Shift the remaining entries down over it */
			for (j = i; j < size - 1; j++)
				lineage[j] = lineage[j + 1];
			*sz = size - 1;
			break;
		}
	}

	/*
	 * We'll prune all instances of the hardware sharing relationship
	 * represented by pg. But before we do that (and pause CPUs) we need
	 * to ensure the hierarchy's groups are properly sized.
	 */
	hwset = pghw_set_lookup(hw);

	/*
	 * Blacklist the hardware so future processor groups of this type won't
	 * participate in CMT thread placement.
	 *
	 * XXX
	 * For heterogeneous system configurations, this might be overkill.
	 * We may only need to blacklist the illegal PGs, and other instances
	 * of this hardware sharing relationship may be ok.
	 */
	cmt_hw_blacklisted[hw] = 1;

	/*
	 * For each of the PGs being pruned, ensure sufficient capacity in
	 * the siblings set for the PG's children
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
		/*
		 * PG is being pruned, but if it is bringing up more than
		 * one child, ask for more capacity in the siblings group.
		 */
		cap_needed = 0;
		if (pg->cmt_children &&
		    GROUP_SIZE(pg->cmt_children) > 1) {
			cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

			group_expand(pg->cmt_siblings,
			    GROUP_SIZE(pg->cmt_siblings) + cap_needed);

			/*
			 * If this is a top level group, also ensure the
			 * capacity in the root lgrp level CMT grouping.
			 */
			if (pg->cmt_parent == NULL &&
			    pg->cmt_siblings != &cmt_root->cl_pgs) {
				group_expand(&cmt_root->cl_pgs,
				    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
				cmt_root->cl_npgs += cap_needed;
			}
		}
	}

	/*
	 * We're operating on the PG hierarchy. Pause CPUs to ensure
	 * exclusivity with respect to the dispatcher.
	 */
	pause_cpus(NULL, NULL);

	/*
	 * Prune all PG instances of the hardware sharing relationship
	 * represented by pg.
	 */
	group_iter_init(&hw_iter);
	while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

		/*
		 * Remove PG from it's group of siblings, if it's there.
		 */
		if (pg->cmt_siblings) {
			(void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
		}
		if (pg->cmt_parent == NULL &&
		    pg->cmt_siblings != &cmt_root->cl_pgs) {
			(void) group_remove(&cmt_root->cl_pgs, pg,
			    GRP_NORESIZE);
		}

		/*
		 * Indicate that no CMT policy will be implemented across
		 * this PG.
		 */
		pg->cmt_policy = CMT_NO_POLICY;

		/*
		 * Move PG's children from it's children set to it's parent's
		 * children set. Note that the parent's children set, and PG's
		 * siblings set are the same thing.
		 *
		 * Because we are iterating over the same group that we are
		 * operating on (removing the children), first add all of PG's
		 * children to the parent's children set, and once we are done
		 * iterating, empty PG's children set.
		 */
		if (pg->cmt_children != NULL) {
			children = pg->cmt_children;

			group_iter_init(&child_iter);
			while ((child = group_iterate(children, &child_iter))
			    != NULL) {
				if (pg->cmt_siblings != NULL) {
					r = group_add(pg->cmt_siblings, child,
					    GRP_NORESIZE);
					ASSERT(r == 0);

					if (pg->cmt_parent == NULL &&
					    pg->cmt_siblings !=
					    &cmt_root->cl_pgs) {
						r = group_add(&cmt_root->cl_pgs,
						    child, GRP_NORESIZE);
						ASSERT(r == 0);
					}
				}
			}
			group_empty(pg->cmt_children);
		}

		/*
		 * Reset the callbacks to the defaults
		 */
		pg_callback_set_defaults((pg_t *)pg);

		/*
		 * Update all the CPU lineages in each of PG's CPUs
		 */
		PG_CPU_ITR_INIT(pg, cpu_iter);
		while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
			pg_cmt_t	*cpu_pg;
			group_iter_t	liter;	/* Iterator for the lineage */
			cpu_pg_t	*cpd;	/* CPU's PG data */

			/*
			 * The CPU's lineage is under construction still
			 * references the bootstrap CPU PG data structure.
			 */
			if (pg_cpu_is_bootstrapped(cpu))
				cpd = pgdata;
			else
				cpd = cpu->cpu_pg;

			/*
			 * Iterate over the CPU's PGs updating the children
			 * of the PG being promoted, since they have a new
			 * parent and siblings set.
			 */
			group_iter_init(&liter);
			while ((cpu_pg = group_iterate(&cpd->pgs,
			    &liter)) != NULL) {
				if (cpu_pg->cmt_parent == pg) {
					cpu_pg->cmt_parent = pg->cmt_parent;
					cpu_pg->cmt_siblings = pg->cmt_siblings;
				}
			}

			/*
			 * Update the CPU's lineages
			 *
			 * Remove the PG from the CPU's group used for CMT
			 * scheduling.
			 */
			(void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
		}
	}
	start_cpus();
	return (0);
}
1667
1668 /*
1669 * Disable CMT scheduling
1670 */
1671 static void
pg_cmt_disable(void)1672 pg_cmt_disable(void)
1673 {
1674 cpu_t *cpu;
1675
1676 ASSERT(MUTEX_HELD(&cpu_lock));
1677
1678 pause_cpus(NULL, NULL);
1679 cpu = cpu_list;
1680
1681 do {
1682 if (cpu->cpu_pg)
1683 group_empty(&cpu->cpu_pg->cmt_pgs);
1684 } while ((cpu = cpu->cpu_next) != cpu_list);
1685
1686 cmt_sched_disabled = 1;
1687 start_cpus();
1688 cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
1689 }
1690
/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type who's definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances PG's sharing relationship type) from the CMT
 * hierarchy. Further, future instances of that sharing relationship type won't
 * be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPUs PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPUs lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
	int		i, j, size;
	pg_cmt_t	*pg, *pg_next, *pg_bad, *pg_tmp, *parent;
	cpu_t		*cp;
	pg_cpu_itr_t	cpu_iter;
	lgrp_handle_t	lgrp;

	ASSERT(MUTEX_HELD(&cpu_lock));

revalidate:
	/* Re-read the size; pruning below may have shrunk the lineage */
	size = *sz;
	pg_bad = NULL;
	lgrp = LGRP_NULL_HANDLE;
	for (i = 0; i < size; i++) {

		pg = lineage[i];
		if (i < size - 1)
			pg_next = lineage[i + 1];
		else
			pg_next = NULL;

		/*
		 * We assume that the lineage has already been sorted
		 * by the number of CPUs. In fact, we depend on it.
		 */
		ASSERT(pg_next == NULL ||
		    (PG_NUM_CPUS((pg_t *)pg) <= PG_NUM_CPUS((pg_t *)pg_next)));

		/*
		 * The CPUs PG lineage was passed as the first argument to
		 * this routine and contains the sorted list of the CPU's
		 * PGs. Ultimately, the ordering of the PGs in that list, and
		 * the ordering as traversed by the cmt_parent list must be
		 * the same. PG promotion will be used as the mechanism to
		 * achieve this, but first we need to look for cases where
		 * promotion will be necessary, and validate that will be
		 * possible without violating the subset invariant described
		 * above.
		 *
		 * Since the PG topology is in the middle of being changed, we
		 * need to check whether the PG's existing parent (if any) is
		 * part of this CPU's lineage (and therefore should contain
		 * the new CPU). If not, it means that the addition of the
		 * new CPU should have made this PG have more CPUs than its
		 * parent (and other ancestors not in the same lineage) and
		 * will need to be promoted into place.
		 *
		 * We need to verify all of this to defend against a buggy
		 * BIOS giving bad power domain CPU groupings. Sigh.
		 */
		parent = pg->cmt_parent;
		while (parent != NULL) {
			/*
			 * Determine if the parent/ancestor is in this lineage
			 */
			pg_tmp = NULL;
			for (j = 0; (j < size) && (pg_tmp != parent); j++) {
				pg_tmp = lineage[j];
			}
			if (pg_tmp == parent) {
				/*
				 * It's in the lineage. The concentricity
				 * checks will handle the rest.
				 */
				break;
			}
			/*
			 * If it is not in the lineage, PG will eventually
			 * need to be promoted above it. Verify the ancestor
			 * is a proper subset. There is still an error if
			 * the ancestor has the same number of CPUs as PG,
			 * since that would imply it should be in the lineage,
			 * and we already know it isn't.
			 */
			if (PG_NUM_CPUS((pg_t *)parent) >=
			    PG_NUM_CPUS((pg_t *)pg)) {
				/*
				 * Not a proper subset if the parent/ancestor
				 * has the same or more CPUs than PG.
				 */
				cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
				goto handle_error;
			}
			parent = parent->cmt_parent;
		}

		/*
		 * Walk each of the CPUs in the PGs group and perform
		 * consistency checks along the way.
		 */
		PG_CPU_ITR_INIT((pg_t *)pg, cpu_iter);
		while ((cp = pg_cpu_next(&cpu_iter)) != NULL) {
			/*
			 * Verify that there aren't any CPUs contained in PG
			 * that the next PG in the lineage (which is larger
			 * or same size) doesn't also contain.
			 */
			if (pg_next != NULL &&
			    pg_cpu_find((pg_t *)pg_next, cp) == B_FALSE) {
				cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
				goto handle_error;
			}

			/*
			 * Verify that all the CPUs in the PG are in the same
			 * lgroup.
			 */
			if (lgrp == LGRP_NULL_HANDLE) {
				lgrp = lgrp_plat_cpu_to_hand(cp->cpu_id);
			} else if (lgrp_plat_cpu_to_hand(cp->cpu_id) != lgrp) {
				cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
				goto handle_error;
			}
		}
	}

handle_error:
	/*
	 * Some of these validation errors can result when the CPU grouping
	 * information is derived from buggy sources (for example, incorrect
	 * ACPI tables on x86 systems).
	 *
	 * We'll try to recover in such cases by pruning out the illegal
	 * groupings from the PG hierarchy, which means that we won't optimize
	 * for those levels, but we will for the remaining ones.
	 *
	 * Note that on error, "pg" (and "pg_next") retain the values they
	 * had when the offending goto was taken in the loop above; the
	 * error cases below rely on this.
	 */
	switch (cmt_lineage_status) {
	case CMT_LINEAGE_VALID:
	case CMT_LINEAGE_REPAIRED:
		break;
	case CMT_LINEAGE_PG_SPANS_LGRPS:
		/*
		 * We've detected a PG whose CPUs span lgroups.
		 *
		 * This isn't supported, as the dispatcher isn't allowed to
		 * to do CMT thread placement across lgroups, as this would
		 * conflict with policies implementing MPO thread affinity.
		 *
		 * If the PG is of a sharing relationship type known to
		 * legitimately span lgroups, specify that no CMT thread
		 * placement policy should be implemented, and prune the PG
		 * from the existing CMT PG hierarchy.
		 *
		 * Otherwise, fall through to the case below for handling.
		 */
		if (((pghw_t *)pg)->pghw_hw == PGHW_CHIP) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/* FALLTHROUGH */
		/*LINTED*/
	case CMT_LINEAGE_NON_PROMOTABLE:
		/*
		 * We've detected a PG that already exists in another CPU's
		 * lineage that cannot legally be promoted into place
		 * without breaking the invariants of the hierarchy.
		 */
		if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
			if (pg_cmt_prune(pg, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to prune out the bad level.
		 * Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	case CMT_LINEAGE_NON_CONCENTRIC:
		/*
		 * We've detected a non-concentric PG lineage, which means that
		 * there's a PG in the lineage that has CPUs that the next PG
		 * over in the lineage (which is the same size or larger)
		 * doesn't have.
		 *
		 * In this case, we examine the two PGs to see if either
		 * grouping is defined by potentially buggy sources.
		 *
		 * If one has less CPUs than the other, and contains CPUs
		 * not found in the parent, and it is an untrusted enumeration,
		 * then prune it. If both have the same number of CPUs, then
		 * prune the one that is untrusted.
		 *
		 * This process repeats until we have a concentric lineage,
		 * or we would have to prune out level derived from what we
		 * thought was a reliable source, in which case CMT scheduling
		 * is disabled altogether.
		 */
		if ((PG_NUM_CPUS((pg_t *)pg) < PG_NUM_CPUS((pg_t *)pg_next)) &&
		    (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw))) {
			pg_bad = pg;
		} else if (PG_NUM_CPUS((pg_t *)pg) ==
		    PG_NUM_CPUS((pg_t *)pg_next)) {
			if (PG_CMT_HW_SUSPECT(((pghw_t *)pg_next)->pghw_hw)) {
				pg_bad = pg_next;
			} else if (PG_CMT_HW_SUSPECT(((pghw_t *)pg)->pghw_hw)) {
				pg_bad = pg;
			}
		}
		if (pg_bad) {
			if (pg_cmt_prune(pg_bad, lineage, sz, pgdata) == 0) {
				cmt_lineage_status = CMT_LINEAGE_REPAIRED;
				goto revalidate;
			}
		}
		/*
		 * Something went wrong trying to identify and/or prune out
		 * the bad level. Disable CMT scheduling altogether.
		 */
		pg_cmt_disable();
		break;
	default:
		/*
		 * If we're here, we've encountered a validation error for
		 * which we don't know how to recover. In this case, disable
		 * CMT scheduling altogether.
		 */
		cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
		pg_cmt_disable();
	}
	return (cmt_lineage_status);
}
1966