1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 * Copyright 2019 Joyent, Inc.
25 */
26
27 /*
28 * Basic NUMA support in terms of locality groups
29 *
30 * Solaris needs to know which CPUs, memory, etc. are near each other to
31 * provide good performance on NUMA machines by optimizing for locality.
32 * In order to do this, a new abstraction called a "locality group (lgroup)"
33 * has been introduced to keep track of which CPU-like and memory-like hardware
34 * resources are close to each other. Currently, latency is the only measure
35 * used to determine how to group hardware resources into lgroups, but this
36 * does not limit the groupings to be based solely on latency. Other factors
37 * may be used to determine the groupings in the future.
38 *
 * Lgroups are organized into a hierarchy or topology that represents the
40 * latency topology of the machine. There is always at least a root lgroup in
41 * the system. It represents all the hardware resources in the machine at a
42 * latency big enough that any hardware resource can at least access any other
43 * hardware resource within that latency. A Uniform Memory Access (UMA)
44 * machine is represented with one lgroup (the root). In contrast, a NUMA
45 * machine is represented at least by the root lgroup and some number of leaf
46 * lgroups where the leaf lgroups contain the hardware resources within the
47 * least latency of each other and the root lgroup still contains all the
48 * resources in the machine. Some number of intermediate lgroups may exist
49 * which represent more levels of locality than just the local latency of the
50 * leaf lgroups and the system latency of the root lgroup. Non-leaf lgroups
51 * (eg. root and intermediate lgroups) contain the next nearest resources to
52 * its children lgroups. Thus, the lgroup hierarchy from a given leaf lgroup
53 * to the root lgroup shows the hardware resources from closest to farthest
54 * from the leaf lgroup such that each successive ancestor lgroup contains
55 * the next nearest resources at the next level of locality from the previous.
56 *
57 * The kernel uses the lgroup abstraction to know how to allocate resources
58 * near a given process/thread. At fork() and lwp/thread_create() time, a
59 * "home" lgroup is chosen for a thread. This is done by picking the lgroup
60 * with the lowest load average. Binding to a processor or processor set will
61 * change the home lgroup for a thread. The scheduler has been modified to try
62 * to dispatch a thread on a CPU in its home lgroup. Physical memory
63 * allocation is lgroup aware too, so memory will be allocated from the current
64 * thread's home lgroup if possible. If the desired resources are not
65 * available, the kernel traverses the lgroup hierarchy going to the parent
66 * lgroup to find resources at the next level of locality until it reaches the
67 * root lgroup.
68 */
69
70 #include <sys/lgrp.h>
71 #include <sys/lgrp_user.h>
72 #include <sys/types.h>
73 #include <sys/mman.h>
74 #include <sys/param.h>
75 #include <sys/var.h>
76 #include <sys/thread.h>
77 #include <sys/cpuvar.h>
78 #include <sys/cpupart.h>
79 #include <sys/kmem.h>
80 #include <vm/seg.h>
81 #include <vm/seg_kmem.h>
82 #include <vm/seg_spt.h>
83 #include <vm/seg_vn.h>
84 #include <vm/as.h>
85 #include <sys/atomic.h>
86 #include <sys/systm.h>
87 #include <sys/errno.h>
88 #include <sys/cmn_err.h>
89 #include <sys/kstat.h>
90 #include <sys/sysmacros.h>
91 #include <sys/pg.h>
92 #include <sys/promif.h>
93 #include <sys/sdt.h>
94 #include <sys/smt.h>
95
96 lgrp_gen_t lgrp_gen = 0; /* generation of lgroup hierarchy */
97 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
98 /* indexed by lgrp_id */
99 int nlgrps; /* number of lgroups in machine */
100 int lgrp_alloc_hint = -1; /* hint for where to try to allocate next */
101 int lgrp_alloc_max = 0; /* max lgroup ID allocated so far */
102
103 /*
104 * Kstat data for lgroups.
105 *
106 * Actual kstat data is collected in lgrp_stats array.
107 * The lgrp_kstat_data array of named kstats is used to extract data from
108 * lgrp_stats and present it to kstat framework. It is protected from partallel
109 * modifications by lgrp_kstat_mutex. This may cause some contention when
110 * several kstat commands run in parallel but this is not the
111 * performance-critical path.
112 */
113 extern struct lgrp_stats lgrp_stats[]; /* table of per-lgrp stats */
114
115 /*
116 * Declare kstat names statically for enums as defined in the header file.
117 */
118 LGRP_KSTAT_NAMES;
119
120 static void lgrp_kstat_init(void);
121 static int lgrp_kstat_extract(kstat_t *, int);
122 static void lgrp_kstat_reset(lgrp_id_t);
123
124 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
125 static kmutex_t lgrp_kstat_mutex;
126
127
128 /*
129 * max number of lgroups supported by the platform
130 */
131 int nlgrpsmax = 0;
132
133 /*
134 * The root lgroup. Represents the set of resources at the system wide
135 * level of locality.
136 */
137 lgrp_t *lgrp_root = NULL;
138
139 /*
140 * During system bootstrap cp_default does not contain the list of lgrp load
141 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
142 * on-line when cp_default is initialized by cpupart_initialize_default().
143 * Configuring CPU0 may create a two-level topology with root and one leaf node
144 * containing CPU0. This topology is initially constructed in a special
145 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
146 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
147 * for all lpl operations until cp_default is fully constructed.
148 *
149 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
150 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
151 * the first element of lpl_bootstrap_list.
152 *
153 * CPUs that are added to the system, but have not yet been assigned to an
154 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
155 * on some architectures (x86) it's possible for the slave CPU startup thread
156 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
157 */
158 #define LPL_BOOTSTRAP_SIZE 2
159 static lpl_t lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
160 lpl_t *lpl_bootstrap;
161 static lpl_t *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
162 static int lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
163
164 /*
165 * If cp still references the bootstrap lpl, it has not yet been added to
166 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
167 * a thread is trying to allocate memory close to a CPU that has no lgrp.
168 */
169 #define LGRP_CPU_HAS_NO_LGRP(cp) ((cp)->cpu_lpl == lpl_bootstrap)
170
171 static lgrp_t lroot;
172
173 /*
174 * Size, in bytes, beyond which random memory allocation policy is applied
175 * to non-shared memory. Default is the maximum size, so random memory
176 * allocation won't be used for non-shared memory by default.
177 */
178 size_t lgrp_privm_random_thresh = (size_t)(-1);
179
180 /* the maximum effect that a single thread can have on it's lgroup's load */
181 #define LGRP_LOADAVG_MAX_EFFECT(ncpu) \
182 ((lgrp_loadavg_max_effect) / (ncpu))
183 uint32_t lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
184
185
186 /*
187 * Size, in bytes, beyond which random memory allocation policy is applied to
188 * shared memory. Default is 8MB (2 ISM pages).
189 */
190 size_t lgrp_shm_random_thresh = 8*1024*1024;
191
192 /*
193 * Whether to do processor set aware memory allocation by default
194 */
195 int lgrp_mem_pset_aware = 0;
196
197 /*
198 * Set the default memory allocation policy for root lgroup
199 */
200 lgrp_mem_policy_t lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
201
202 /*
203 * Set the default memory allocation policy. For most platforms,
204 * next touch is sufficient, but some platforms may wish to override
205 * this.
206 */
207 lgrp_mem_policy_t lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
208
209
210 /*
211 * lgroup CPU event handlers
212 */
213 static void lgrp_cpu_init(struct cpu *);
214 static void lgrp_cpu_fini(struct cpu *, lgrp_id_t);
215 static lgrp_t *lgrp_cpu_to_lgrp(struct cpu *);
216
217 /*
218 * lgroup memory event handlers
219 */
220 static void lgrp_mem_init(int, lgrp_handle_t, boolean_t);
221 static void lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
222 static void lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
223
224 /*
225 * lgroup CPU partition event handlers
226 */
227 static void lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
228 static void lgrp_part_del_cpu(struct cpu *);
229
230 /*
231 * lgroup framework initialization
232 */
233 static void lgrp_main_init(void);
234 static void lgrp_main_mp_init(void);
235 static void lgrp_root_init(void);
236 static void lgrp_setup(void);
237
238 /*
239 * lpl topology
240 */
241 static void lpl_init(lpl_t *, lpl_t *, lgrp_t *);
242 static void lpl_clear(lpl_t *);
243 static void lpl_leaf_insert(lpl_t *, struct cpupart *);
244 static void lpl_leaf_remove(lpl_t *, struct cpupart *);
245 static void lpl_rset_add(lpl_t *, lpl_t *);
246 static void lpl_rset_del(lpl_t *, lpl_t *);
247 static int lpl_rset_contains(lpl_t *, lpl_t *);
248 static void lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
249 static void lpl_child_update(lpl_t *, struct cpupart *);
250 static int lpl_pick(lpl_t *, lpl_t *);
251 static void lpl_verify_wrapper(struct cpupart *);
252
253 /*
254 * defines for lpl topology verifier return codes
255 */
256
257 #define LPL_TOPO_CORRECT 0
258 #define LPL_TOPO_PART_HAS_NO_LPL -1
259 #define LPL_TOPO_CPUS_NOT_EMPTY -2
260 #define LPL_TOPO_LGRP_MISMATCH -3
261 #define LPL_TOPO_MISSING_PARENT -4
262 #define LPL_TOPO_PARENT_MISMATCH -5
263 #define LPL_TOPO_BAD_CPUCNT -6
264 #define LPL_TOPO_RSET_MISMATCH -7
265 #define LPL_TOPO_LPL_ORPHANED -8
266 #define LPL_TOPO_LPL_BAD_NCPU -9
267 #define LPL_TOPO_RSET_MSSNG_LF -10
268 #define LPL_TOPO_CPU_HAS_BAD_LPL -11
269 #define LPL_TOPO_NONLEAF_HAS_CPUS -12
270 #define LPL_TOPO_LGRP_NOT_LEAF -13
271 #define LPL_TOPO_BAD_RSETCNT -14
272
273 /*
274 * Return whether lgroup optimizations should be enabled on this system
275 */
276 int
lgrp_optimizations(void)277 lgrp_optimizations(void)
278 {
279 /*
280 * System must have more than 2 lgroups to enable lgroup optimizations
281 *
282 * XXX This assumes that a 2 lgroup system has an empty root lgroup
283 * with one child lgroup containing all the resources. A 2 lgroup
284 * system with a root lgroup directly containing CPUs or memory might
285 * need lgroup optimizations with its child lgroup, but there
286 * isn't such a machine for now....
287 */
288 if (nlgrps > 2)
289 return (1);
290
291 return (0);
292 }
293
294 /*
295 * Setup root lgroup
296 */
297 static void
lgrp_root_init(void)298 lgrp_root_init(void)
299 {
300 lgrp_handle_t hand;
301 int i;
302 lgrp_id_t id;
303
304 /*
305 * Create the "root" lgroup
306 */
307 ASSERT(nlgrps == 0);
308 id = nlgrps++;
309
310 lgrp_root = &lroot;
311
312 lgrp_root->lgrp_cpu = NULL;
313 lgrp_root->lgrp_mnodes = 0;
314 lgrp_root->lgrp_nmnodes = 0;
315 hand = lgrp_plat_root_hand();
316 lgrp_root->lgrp_plathand = hand;
317
318 lgrp_root->lgrp_id = id;
319 lgrp_root->lgrp_cpucnt = 0;
320 lgrp_root->lgrp_childcnt = 0;
321 klgrpset_clear(lgrp_root->lgrp_children);
322 klgrpset_clear(lgrp_root->lgrp_leaves);
323 lgrp_root->lgrp_parent = NULL;
324 lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
325
326 for (i = 0; i < LGRP_RSRC_COUNT; i++)
327 klgrpset_clear(lgrp_root->lgrp_set[i]);
328
329 lgrp_root->lgrp_kstat = NULL;
330
331 lgrp_table[id] = lgrp_root;
332
333 /*
334 * Setup initial lpl list for CPU0 and initial t0 home.
335 * The only lpl space we have so far is lpl_bootstrap. It is used for
336 * all topology operations until cp_default is initialized at which
337 * point t0.t_lpl will be updated.
338 */
339 lpl_bootstrap = lpl_bootstrap_list;
340 t0.t_lpl = lpl_bootstrap;
341 cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
342 lpl_bootstrap_list[1].lpl_lgrpid = 1;
343
344 /*
345 * Set up the bootstrap rset
346 * Since the bootstrap toplogy has just the root, and a leaf,
347 * the rset contains just the leaf, and both lpls can use the same rset
348 */
349 lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
350 lpl_bootstrap_list[0].lpl_rset_sz = 1;
351 lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
352 lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
353
354 lpl_bootstrap_list[1].lpl_rset_sz = 1;
355 lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
356 lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;
357
358 cp_default.cp_lgrploads = lpl_bootstrap;
359 }
360
361 /*
362 * Initialize the lgroup framework and allow the platform to do the same
363 *
364 * This happens in stages during boot and is all funnelled through this routine
365 * (see definition of lgrp_init_stages_t to see what happens at each stage and
366 * when)
367 */
368 void
lgrp_init(lgrp_init_stages_t stage)369 lgrp_init(lgrp_init_stages_t stage)
370 {
371 /*
372 * Initialize the platform
373 */
374 lgrp_plat_init(stage);
375
376 switch (stage) {
377 case LGRP_INIT_STAGE1:
378 /*
379 * Set max number of lgroups supported on this platform which
380 * must be less than the max number of lgroups supported by the
381 * common lgroup framework (eg. NLGRPS_MAX is max elements in
382 * lgrp_table[], etc.)
383 */
384 nlgrpsmax = lgrp_plat_max_lgrps();
385 ASSERT(nlgrpsmax <= NLGRPS_MAX);
386 break;
387
388 case LGRP_INIT_STAGE2:
389 lgrp_setup();
390 break;
391
392 case LGRP_INIT_STAGE4:
393 lgrp_main_init();
394 break;
395
396 case LGRP_INIT_STAGE5:
397 lgrp_main_mp_init();
398 break;
399
400 default:
401 break;
402 }
403 }
404
405 /*
406 * Create the root and cpu0's lgroup, and set t0's home.
407 */
408 static void
lgrp_setup(void)409 lgrp_setup(void)
410 {
411 /*
412 * Setup the root lgroup
413 */
414 lgrp_root_init();
415
416 /*
417 * Add cpu0 to an lgroup
418 */
419 lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
420 lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
421 }
422
423 /*
424 * true when lgrp initialization has been completed.
425 */
426 int lgrp_initialized = 0;
427
428 /*
429 * True when lgrp topology is constructed.
430 */
431 int lgrp_topo_initialized = 0;
432
433 /*
434 * Init routine called after startup(), /etc/system has been processed,
435 * and cpu0 has been added to an lgroup.
436 */
437 static void
lgrp_main_init(void)438 lgrp_main_init(void)
439 {
440 cpu_t *cp = CPU;
441 lgrp_id_t lgrpid;
442 int i;
443 extern void pg_cpu0_reinit();
444
445 /*
446 * Enforce a valid lgrp_mem_default_policy
447 */
448 if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
449 (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
450 (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
451 lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
452
453 /*
454 * See if mpo should be disabled.
455 * This may happen in the case of null proc LPA on Starcat.
456 * The platform won't be able to detect null proc LPA until after
457 * cpu0 and memory have already been added to lgroups.
458 * When and if it is detected, the Starcat platform will return
459 * a different platform handle for cpu0 which is what we check for
460 * here. If mpo should be disabled move cpu0 to it's rightful place
461 * (the root), and destroy the remaining lgroups. This effectively
462 * provides an UMA lgroup topology.
463 */
464 lgrpid = cp->cpu_lpl->lpl_lgrpid;
465 if (lgrp_table[lgrpid]->lgrp_plathand !=
466 lgrp_plat_cpu_to_hand(cp->cpu_id)) {
467 lgrp_part_del_cpu(cp);
468 lgrp_cpu_fini(cp, lgrpid);
469
470 lgrp_cpu_init(cp);
471 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
472
473 ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
474
475 /*
476 * Notify the PG subsystem that the CPU's lgrp
477 * association has changed
478 */
479 pg_cpu0_reinit();
480
481 /*
482 * Destroy all lgroups except for root
483 */
484 for (i = 0; i <= lgrp_alloc_max; i++) {
485 if (LGRP_EXISTS(lgrp_table[i]) &&
486 lgrp_table[i] != lgrp_root)
487 lgrp_destroy(lgrp_table[i]);
488 }
489
490 /*
491 * Fix up root to point at itself for leaves and resources
492 * and not have any children
493 */
494 lgrp_root->lgrp_childcnt = 0;
495 klgrpset_clear(lgrp_root->lgrp_children);
496 klgrpset_clear(lgrp_root->lgrp_leaves);
497 klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
498 klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
499 klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
500 }
501
502 /*
503 * Initialize kstats framework.
504 */
505 lgrp_kstat_init();
506 /*
507 * cpu0 is finally where it should be, so create it's lgroup's kstats
508 */
509 mutex_enter(&cpu_lock);
510 lgrp_kstat_create(cp);
511 mutex_exit(&cpu_lock);
512
513 lgrp_initialized = 1;
514 }
515
516 /*
517 * Finish lgrp initialization after all CPUS are brought on-line.
518 * This routine is called after start_other_cpus().
519 */
520 static void
lgrp_main_mp_init(void)521 lgrp_main_mp_init(void)
522 {
523 klgrpset_t changed;
524
525 smt_init();
526
527 /*
528 * Update lgroup topology (if necessary)
529 */
530 klgrpset_clear(changed);
531 (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
532 lgrp_topo_initialized = 1;
533 }
534
535 /*
536 * Change latency of lgroup with specified lgroup platform handle (if one is
537 * given) or change all lgroups with old latency to new latency
538 */
539 void
lgrp_latency_change(lgrp_handle_t hand,u_longlong_t oldtime,u_longlong_t newtime)540 lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
541 u_longlong_t newtime)
542 {
543 lgrp_t *lgrp;
544 int i;
545
546 for (i = 0; i <= lgrp_alloc_max; i++) {
547 lgrp = lgrp_table[i];
548
549 if (!LGRP_EXISTS(lgrp))
550 continue;
551
552 if ((hand == LGRP_NULL_HANDLE &&
553 lgrp->lgrp_latency == oldtime) ||
554 (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
555 lgrp->lgrp_latency = (int)newtime;
556 }
557 }
558
559 /*
560 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
561 */
562 void
lgrp_config(lgrp_config_flag_t event,uintptr_t resource,uintptr_t where)563 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
564 {
565 klgrpset_t changed;
566 cpu_t *cp;
567 lgrp_id_t id;
568 int rc;
569
570 switch (event) {
571 /*
572 * The following (re)configuration events are common code
573 * initiated. lgrp_plat_config() is called here to inform the
574 * platform of the reconfiguration event.
575 */
576 case LGRP_CONFIG_CPU_ADD:
577 cp = (cpu_t *)resource;
578
579 /*
580 * Initialize the new CPU's lgrp related next/prev
581 * links, and give it a bootstrap lpl so that it can
582 * survive should it need to enter the dispatcher.
583 */
584 cp->cpu_next_lpl = cp;
585 cp->cpu_prev_lpl = cp;
586 cp->cpu_next_lgrp = cp;
587 cp->cpu_prev_lgrp = cp;
588 cp->cpu_lpl = lpl_bootstrap;
589
590 lgrp_plat_config(event, resource);
591 atomic_inc_32(&lgrp_gen);
592
593 break;
594 case LGRP_CONFIG_CPU_DEL:
595 lgrp_plat_config(event, resource);
596 atomic_inc_32(&lgrp_gen);
597
598 break;
599 case LGRP_CONFIG_CPU_ONLINE:
600 cp = (cpu_t *)resource;
601 lgrp_cpu_init(cp);
602 lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
603 rc = lpl_topo_verify(cp->cpu_part);
604 if (rc != LPL_TOPO_CORRECT) {
605 panic("lpl_topo_verify failed: %d", rc);
606 }
607 lgrp_plat_config(event, resource);
608 atomic_inc_32(&lgrp_gen);
609
610 break;
611 case LGRP_CONFIG_CPU_OFFLINE:
612 cp = (cpu_t *)resource;
613 id = cp->cpu_lpl->lpl_lgrpid;
614 lgrp_part_del_cpu(cp);
615 lgrp_cpu_fini(cp, id);
616 rc = lpl_topo_verify(cp->cpu_part);
617 if (rc != LPL_TOPO_CORRECT) {
618 panic("lpl_topo_verify failed: %d", rc);
619 }
620 lgrp_plat_config(event, resource);
621 atomic_inc_32(&lgrp_gen);
622
623 break;
624 case LGRP_CONFIG_CPUPART_ADD:
625 cp = (cpu_t *)resource;
626 lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
627 rc = lpl_topo_verify(cp->cpu_part);
628 if (rc != LPL_TOPO_CORRECT) {
629 panic("lpl_topo_verify failed: %d", rc);
630 }
631 lgrp_plat_config(event, resource);
632
633 break;
634 case LGRP_CONFIG_CPUPART_DEL:
635 cp = (cpu_t *)resource;
636 lgrp_part_del_cpu((cpu_t *)resource);
637 rc = lpl_topo_verify(cp->cpu_part);
638 if (rc != LPL_TOPO_CORRECT) {
639 panic("lpl_topo_verify failed: %d", rc);
640 }
641 lgrp_plat_config(event, resource);
642
643 break;
644 /*
645 * The following events are initiated by the memnode
646 * subsystem.
647 */
648 case LGRP_CONFIG_MEM_ADD:
649 lgrp_mem_init((int)resource, where, B_FALSE);
650 atomic_inc_32(&lgrp_gen);
651
652 break;
653 case LGRP_CONFIG_MEM_DEL:
654 lgrp_mem_fini((int)resource, where, B_FALSE);
655 atomic_inc_32(&lgrp_gen);
656
657 break;
658 case LGRP_CONFIG_MEM_RENAME: {
659 lgrp_config_mem_rename_t *ren_arg =
660 (lgrp_config_mem_rename_t *)where;
661
662 lgrp_mem_rename((int)resource,
663 ren_arg->lmem_rename_from,
664 ren_arg->lmem_rename_to);
665 atomic_inc_32(&lgrp_gen);
666
667 break;
668 }
669 case LGRP_CONFIG_GEN_UPDATE:
670 atomic_inc_32(&lgrp_gen);
671
672 break;
673 case LGRP_CONFIG_FLATTEN:
674 if (where == 0)
675 lgrp_topo_levels = (int)resource;
676 else
677 (void) lgrp_topo_flatten(resource,
678 lgrp_table, lgrp_alloc_max, &changed);
679
680 break;
681 /*
682 * Update any lgroups with old latency to new latency
683 */
684 case LGRP_CONFIG_LAT_CHANGE_ALL:
685 lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
686 (u_longlong_t)where);
687
688 break;
689 /*
690 * Update lgroup with specified lgroup platform handle to have
691 * new latency
692 */
693 case LGRP_CONFIG_LAT_CHANGE:
694 lgrp_latency_change((lgrp_handle_t)resource, 0,
695 (u_longlong_t)where);
696
697 break;
698 case LGRP_CONFIG_NOP:
699
700 break;
701 default:
702 break;
703 }
704
705 }
706
707 /*
708 * Called to add lgrp info into cpu structure from cpu_add_unit;
709 * do not assume cpu is in cpu[] yet!
710 *
711 * CPUs are brought online with all other CPUs paused so we can't
712 * allocate memory or we could deadlock the system, so we rely on
713 * the platform to statically allocate as much space as we need
714 * for the lgrp structs and stats.
715 */
716 static void
lgrp_cpu_init(struct cpu * cp)717 lgrp_cpu_init(struct cpu *cp)
718 {
719 klgrpset_t changed;
720 int count;
721 lgrp_handle_t hand;
722 int first_cpu;
723 lgrp_t *my_lgrp;
724 lgrp_id_t lgrpid;
725 struct cpu *cptr;
726
727 /*
728 * This is the first time through if the resource set
729 * for the root lgroup is empty. After cpu0 has been
730 * initially added to an lgroup, the root's CPU resource
731 * set can never be empty, since the system's last CPU
732 * cannot be offlined.
733 */
734 if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
735 /*
736 * First time through.
737 */
738 first_cpu = 1;
739 } else {
740 /*
741 * If cpu0 needs to move lgroups, we may come
742 * through here again, at which time cpu_lock won't
743 * be held, and lgrp_initialized will be false.
744 */
745 ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
746 ASSERT(cp->cpu_part != NULL);
747 first_cpu = 0;
748 }
749
750 hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
751 my_lgrp = lgrp_hand_to_lgrp(hand);
752
753 if (my_lgrp == NULL) {
754 /*
755 * Create new lgrp and add it to lgroup topology
756 */
757 my_lgrp = lgrp_create();
758 my_lgrp->lgrp_plathand = hand;
759 my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
760 lgrpid = my_lgrp->lgrp_id;
761 klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
762 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
763
764 count = 0;
765 klgrpset_clear(changed);
766 count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
767 &changed);
768 /*
769 * May have added new intermediate lgroups, so need to add
770 * resources other than CPUs which are added below
771 */
772 (void) lgrp_mnode_update(changed, NULL);
773 } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
774 > 0) {
775 /*
776 * Leaf lgroup was created, but latency wasn't available
777 * then. So, set latency for it and fill in rest of lgroup
778 * topology now that we know how far it is from other leaf
779 * lgroups.
780 */
781 lgrpid = my_lgrp->lgrp_id;
782 klgrpset_clear(changed);
783 if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
784 lgrpid))
785 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
786 count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
787 &changed);
788
789 /*
790 * May have added new intermediate lgroups, so need to add
791 * resources other than CPUs which are added below
792 */
793 (void) lgrp_mnode_update(changed, NULL);
794 } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
795 my_lgrp->lgrp_id)) {
796 int i;
797
798 /*
799 * Update existing lgroup and lgroups containing it with CPU
800 * resource
801 */
802 lgrpid = my_lgrp->lgrp_id;
803 klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
804 for (i = 0; i <= lgrp_alloc_max; i++) {
805 lgrp_t *lgrp;
806
807 lgrp = lgrp_table[i];
808 if (!LGRP_EXISTS(lgrp) ||
809 !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
810 continue;
811
812 klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
813 }
814 }
815
816 lgrpid = my_lgrp->lgrp_id;
817 cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
818
819 /*
820 * For multi-lgroup systems, need to setup lpl for CPU0 or CPU0 will
821 * end up in lpl for lgroup 0 whether it is supposed to be in there or
822 * not since none of lgroup IDs in the lpl's have been set yet.
823 */
824 if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
825 cp->cpu_lpl->lpl_lgrpid = lgrpid;
826
827 /*
828 * link the CPU into the lgrp's CPU list
829 */
830 if (my_lgrp->lgrp_cpucnt == 0) {
831 my_lgrp->lgrp_cpu = cp;
832 cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
833 } else {
834 cptr = my_lgrp->lgrp_cpu;
835 cp->cpu_next_lgrp = cptr;
836 cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
837 cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
838 cptr->cpu_prev_lgrp = cp;
839 }
840 my_lgrp->lgrp_cpucnt++;
841 }
842
843 lgrp_t *
lgrp_create(void)844 lgrp_create(void)
845 {
846 lgrp_t *my_lgrp;
847 lgrp_id_t lgrpid;
848 int i;
849
850 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
851 lgrpid = 0;
852
853 /*
854 * Find an open slot in the lgroup table and recycle unused lgroup
855 * left there if any
856 */
857 my_lgrp = NULL;
858 if (lgrp_alloc_hint == -1)
859 /*
860 * Allocate from end when hint not set yet because no lgroups
861 * have been deleted yet
862 */
863 lgrpid = nlgrps++;
864 else {
865 /*
866 * Start looking for next open slot from hint and leave hint
867 * at slot allocated
868 */
869 for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
870 my_lgrp = lgrp_table[i];
871 if (!LGRP_EXISTS(my_lgrp)) {
872 lgrpid = i;
873 nlgrps++;
874 break;
875 }
876 }
877 lgrp_alloc_hint = lgrpid;
878 }
879
880 /*
881 * Keep track of max lgroup ID allocated so far to cut down on searches
882 */
883 if (lgrpid > lgrp_alloc_max)
884 lgrp_alloc_max = lgrpid;
885
886 /*
887 * Need to allocate new lgroup if next open slot didn't have one
888 * for recycling
889 */
890 if (my_lgrp == NULL)
891 my_lgrp = lgrp_plat_alloc(lgrpid);
892
893 if (nlgrps > nlgrpsmax || my_lgrp == NULL)
894 panic("Too many lgrps for platform (%d)", nlgrps);
895
896 my_lgrp->lgrp_id = lgrpid;
897 my_lgrp->lgrp_latency = 0;
898 my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
899 my_lgrp->lgrp_parent = NULL;
900 my_lgrp->lgrp_childcnt = 0;
901 my_lgrp->lgrp_mnodes = (mnodeset_t)0;
902 my_lgrp->lgrp_nmnodes = 0;
903 klgrpset_clear(my_lgrp->lgrp_children);
904 klgrpset_clear(my_lgrp->lgrp_leaves);
905 for (i = 0; i < LGRP_RSRC_COUNT; i++)
906 klgrpset_clear(my_lgrp->lgrp_set[i]);
907
908 my_lgrp->lgrp_cpu = NULL;
909 my_lgrp->lgrp_cpucnt = 0;
910
911 if (my_lgrp->lgrp_kstat != NULL)
912 lgrp_kstat_reset(lgrpid);
913
914 lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
915
916 return (my_lgrp);
917 }
918
919 void
lgrp_destroy(lgrp_t * lgrp)920 lgrp_destroy(lgrp_t *lgrp)
921 {
922 int i;
923
924 /*
925 * Unless this lgroup is being destroyed on behalf of
926 * the boot CPU, cpu_lock must be held
927 */
928 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
929
930 if (nlgrps == 1)
931 cmn_err(CE_PANIC, "Can't destroy only lgroup!");
932
933 if (!LGRP_EXISTS(lgrp))
934 return;
935
936 /*
937 * Set hint to lgroup being deleted and try to keep lower numbered
938 * hints to facilitate finding empty slots
939 */
940 if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
941 lgrp_alloc_hint = lgrp->lgrp_id;
942
943 /*
944 * Mark this lgroup to be recycled by setting its lgroup ID to
945 * LGRP_NONE and clear relevant fields
946 */
947 lgrp->lgrp_id = LGRP_NONE;
948 lgrp->lgrp_latency = 0;
949 lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
950 lgrp->lgrp_parent = NULL;
951 lgrp->lgrp_childcnt = 0;
952
953 klgrpset_clear(lgrp->lgrp_children);
954 klgrpset_clear(lgrp->lgrp_leaves);
955 for (i = 0; i < LGRP_RSRC_COUNT; i++)
956 klgrpset_clear(lgrp->lgrp_set[i]);
957
958 lgrp->lgrp_mnodes = (mnodeset_t)0;
959 lgrp->lgrp_nmnodes = 0;
960
961 lgrp->lgrp_cpu = NULL;
962 lgrp->lgrp_cpucnt = 0;
963
964 nlgrps--;
965 }
966
967 /*
968 * Initialize kstat data. Called from lgrp intialization code.
969 */
970 static void
lgrp_kstat_init(void)971 lgrp_kstat_init(void)
972 {
973 lgrp_stat_t stat;
974
975 mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
976
977 for (stat = 0; stat < LGRP_NUM_STATS; stat++)
978 kstat_named_init(&lgrp_kstat_data[stat],
979 lgrp_kstat_names[stat], KSTAT_DATA_INT64);
980 }
981
982 /*
983 * initialize an lgrp's kstats if needed
984 * called with cpu_lock held but not with cpus paused.
985 * we don't tear these down now because we don't know about
986 * memory leaving the lgrp yet...
987 */
988
989 void
lgrp_kstat_create(cpu_t * cp)990 lgrp_kstat_create(cpu_t *cp)
991 {
992 kstat_t *lgrp_kstat;
993 lgrp_id_t lgrpid;
994 lgrp_t *my_lgrp;
995
996 ASSERT(MUTEX_HELD(&cpu_lock));
997
998 lgrpid = cp->cpu_lpl->lpl_lgrpid;
999 my_lgrp = lgrp_table[lgrpid];
1000
1001 if (my_lgrp->lgrp_kstat != NULL)
1002 return; /* already initialized */
1003
1004 lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
1005 KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
1006 KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
1007
1008 if (lgrp_kstat != NULL) {
1009 lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
1010 lgrp_kstat->ks_private = my_lgrp;
1011 lgrp_kstat->ks_data = &lgrp_kstat_data;
1012 lgrp_kstat->ks_update = lgrp_kstat_extract;
1013 my_lgrp->lgrp_kstat = lgrp_kstat;
1014 kstat_install(lgrp_kstat);
1015 }
1016 }
1017
1018 /*
1019 * this will do something when we manage to remove now unused lgrps
1020 */
1021
1022 /* ARGSUSED */
1023 void
lgrp_kstat_destroy(cpu_t * cp)1024 lgrp_kstat_destroy(cpu_t *cp)
1025 {
1026 ASSERT(MUTEX_HELD(&cpu_lock));
1027 }
1028
/*
 * Called when a CPU is off-lined.
 *
 * Unlinks "cp" from its lgroup's CPU list and, if it was the last CPU in
 * the lgroup identified by "lgrpid", updates the lgroup topology to drop
 * the lgroup's CPU resources (and the lgroup itself if now empty).
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
	lgrp_t *my_lgrp;
	struct cpu *prev;
	struct cpu *next;

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	/*
	 * Unlink cp from the doubly linked per-lgroup CPU list
	 * (cpu_prev_lgrp/cpu_next_lgrp).
	 */
	prev = cp->cpu_prev_lgrp;
	next = cp->cpu_next_lgrp;

	prev->cpu_next_lgrp = next;
	next->cpu_prev_lgrp = prev;

	/*
	 * just because I'm paranoid doesn't mean...
	 * (clear cp's list pointers so stale links can't be followed)
	 */

	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

	my_lgrp = lgrp_table[lgrpid];
	my_lgrp->lgrp_cpucnt--;

	/*
	 * Removing last CPU in lgroup, so update lgroup topology
	 */
	if (my_lgrp->lgrp_cpucnt == 0) {
		klgrpset_t changed;
		int count;
		int i;

		my_lgrp->lgrp_cpu = NULL;

		/*
		 * Remove this lgroup from its lgroup CPU resources and remove
		 * lgroup from lgroup topology if it doesn't have any more
		 * resources in it now
		 */
		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
			/* NOTE(review): count is computed but unused here */
			count = 0;
			klgrpset_clear(changed);
			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
			    lgrp_alloc_max + 1, &changed);
			return;
		}

		/*
		 * This lgroup isn't empty, so just remove it from CPU
		 * resources of any lgroups that contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t *lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
		}
		return;
	}

	/*
	 * Lgroup still has CPUs; if cp was the list head, advance it.
	 */
	if (my_lgrp->lgrp_cpu == cp)
		my_lgrp->lgrp_cpu = next;

}
1102
/*
 * Update memory nodes in target lgroups and return ones that get changed
 *
 * For every existing lgroup in "target", rebuild its lgrp_mnodes /
 * lgrp_nmnodes from the memory nodes of each lgroup in its memory resource
 * set. Returns a count of the updates performed; if "changed" is non-NULL,
 * it is filled with the IDs of the lgroups that were updated.
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
	int count;
	int i;
	int j;
	lgrp_t *lgrp;
	lgrp_t *lgrp_rsrc;

	count = 0;
	if (changed)
		klgrpset_clear(*changed);

	if (klgrpset_isempty(target))
		return (0);

	/*
	 * Find each lgroup in target lgroups
	 */
	for (i = 0; i <= lgrp_alloc_max; i++) {
		/*
		 * Skip any lgroups that don't exist or aren't in target group
		 */
		lgrp = lgrp_table[i];
		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
			continue;
		}

		/*
		 * Initialize memnodes for intermediate lgroups to 0
		 * and update them from scratch since they may have completely
		 * changed
		 */
		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
			lgrp->lgrp_mnodes = (mnodeset_t)0;
			lgrp->lgrp_nmnodes = 0;
		}

		/*
		 * Update memory nodes of target lgroup with memory nodes
		 * from each lgroup in its lgroup memory resource set
		 */
		for (j = 0; j <= lgrp_alloc_max; j++) {
			int k;

			/*
			 * Skip any lgroups that don't exist or aren't in
			 * memory resources of target lgroup
			 */
			lgrp_rsrc = lgrp_table[j];
			if (!LGRP_EXISTS(lgrp_rsrc) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    j))
				continue;

			/*
			 * Update target lgroup's memnodes to include memnodes
			 * of this lgroup
			 */
			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
				mnodeset_t	mnode_mask;

				mnode_mask = (mnodeset_t)1 << k;
				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
				    !(lgrp->lgrp_mnodes & mnode_mask)) {
					lgrp->lgrp_mnodes |= mnode_mask;
					lgrp->lgrp_nmnodes++;
				}
			}
			/* counted once per contributing resource lgroup */
			count++;
			if (changed)
				klgrpset_add(*changed, lgrp->lgrp_id);
		}
	}

	return (count);
}
1183
/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * The lgrp_mem_rename() is called by the platform copy-rename code to update
 * the lgroup topology which is changing as memory moves from one lgroup to
 * another. It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() telling that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lgrp_root->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Remove the memory from the source node and add it to the destination
	 * node.
	 */
	lgrp_mem_fini(mnode, from, B_TRUE);
	lgrp_mem_init(mnode, to, B_TRUE);
}
1229
/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock, and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	/*
	 * Check if this mnode is already configured and return immediately if
	 * it is.
	 *
	 * NOTE: in special case of copy-rename of the only remaining mnode,
	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
	 * recognize this case and continue as usual, but skip the update to
	 * the lgrp_mnodes and the lgrp_nmnodes. This restores the inconsistency
	 * in topology, temporarily introduced by lgrp_mem_fini().
	 */
	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
	    lgrp_root->lgrp_mnodes & mnodes_mask) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * Update lgroup topology with new memory resources, keeping track of
	 * which lgroups change
	 */
	count = 0;
	klgrpset_clear(changed);
	my_lgrp = lgrp_hand_to_lgrp(hand);
	if (my_lgrp == NULL) {
		/* new lgrp */
		my_lgrp = lgrp_create();
		lgrpid = my_lgrp->lgrp_id;
		my_lgrp->lgrp_plathand = hand;
		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);

		/* CPUs must be paused while the leaf is spliced in */
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
	    > 0) {
		/*
		 * Leaf lgroup was created, but latency wasn't available
		 * then. So, set latency for it and fill in rest of lgroup
		 * topology now that we know how far it is from other leaf
		 * lgroups.
		 */
		klgrpset_clear(changed);
		lgrpid = my_lgrp->lgrp_id;
		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
		    lgrpid))
			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
		    &changed);
		if (need_synch)
			start_cpus();
	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
	    my_lgrp->lgrp_id)) {
		/*
		 * Add new lgroup memory resource to existing lgroup
		 */
		lgrpid = my_lgrp->lgrp_id;
		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		klgrpset_add(changed, lgrpid);
		count++;
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
				continue;

			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
			klgrpset_add(changed, lgrp->lgrp_id);
			count++;
		}
	} else {
		/* mnode's lgroup already has this memory resource: no-op */
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * Add memory node to lgroup and remove lgroup from ones that need
	 * to be updated
	 */
	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
		my_lgrp->lgrp_mnodes |= mnodes_mask;
		my_lgrp->lgrp_nmnodes++;
	}
	klgrpset_del(changed, lgrpid);

	/*
	 * Update memory node information for all lgroups that changed and
	 * contain new memory node as a resource
	 */
	if (count)
		(void) lgrp_mnode_update(changed, NULL);

	if (drop_lock)
		mutex_exit(&cpu_lock);
}
1376
/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
	klgrpset_t	changed;
	int		count;
	int		i;
	lgrp_t		*my_lgrp;
	lgrp_id_t	lgrpid;
	mnodeset_t	mnodes_mask;
	boolean_t	drop_lock = B_FALSE;
	boolean_t	need_synch = B_FALSE;

	/*
	 * Grab CPU lock (if we haven't already)
	 */
	if (!MUTEX_HELD(&cpu_lock)) {
		mutex_enter(&cpu_lock);
		drop_lock = B_TRUE;
	}

	/*
	 * This routine may be called from a context where we already
	 * hold cpu_lock and have already paused cpus.
	 */
	if (!cpus_paused())
		need_synch = B_TRUE;

	my_lgrp = lgrp_hand_to_lgrp(hand);

	/*
	 * The lgrp *must* be pre-existing
	 */
	ASSERT(my_lgrp != NULL);

	/*
	 * Delete memory node from lgroups which contain it
	 */
	mnodes_mask = ((mnodeset_t)1 << mnode);
	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_t *lgrp = lgrp_table[i];
		/*
		 * Skip any non-existent lgroups and any lgroups that don't
		 * contain leaf lgroup of memory as a memory resource
		 */
		if (!LGRP_EXISTS(lgrp) ||
		    !(lgrp->lgrp_mnodes & mnodes_mask))
			continue;

		/*
		 * Avoid removing the last mnode from the root in the DR
		 * copy-rename case. See lgrp_mem_rename() for details.
		 */
		if (is_copy_rename &&
		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
			continue;

		/*
		 * Remove memory node from lgroup.
		 */
		lgrp->lgrp_mnodes &= ~mnodes_mask;
		ASSERT(lgrp->lgrp_nmnodes > 0);
		lgrp->lgrp_nmnodes--;
	}
	ASSERT(lgrp_root->lgrp_nmnodes > 0);

	/*
	 * Don't need to update lgroup topology if this lgroup still has memory.
	 *
	 * In the special case of DR copy-rename with the only mnode being
	 * removed, the lgrp_mnodes for the root is always non-zero, but we
	 * still need to update the lgroup topology.
	 */
	if ((my_lgrp->lgrp_nmnodes > 0) &&
	    !(is_copy_rename && (my_lgrp == lgrp_root) &&
	    (my_lgrp->lgrp_mnodes == mnodes_mask))) {
		if (drop_lock)
			mutex_exit(&cpu_lock);
		return;
	}

	/*
	 * This lgroup does not contain any memory now
	 */
	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

	/*
	 * Remove this lgroup from lgroup topology if it does not contain any
	 * resources now
	 */
	lgrpid = my_lgrp->lgrp_id;
	count = 0;
	klgrpset_clear(changed);
	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
		/*
		 * Delete lgroup when no more resources
		 */
		if (need_synch)
			pause_cpus(NULL, NULL);
		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
		    lgrp_alloc_max + 1, &changed);
		ASSERT(count > 0);
		if (need_synch)
			start_cpus();
	} else {
		/*
		 * Remove lgroup from memory resources of any lgroups that
		 * contain it as such
		 */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_t		*lgrp;

			lgrp = lgrp_table[i];
			if (!LGRP_EXISTS(lgrp) ||
			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
			    lgrpid))
				continue;

			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
		}
	}
	if (drop_lock)
		mutex_exit(&cpu_lock);
}
1515
1516 /*
1517 * Return lgroup with given platform handle
1518 */
1519 lgrp_t *
lgrp_hand_to_lgrp(lgrp_handle_t hand)1520 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1521 {
1522 int i;
1523 lgrp_t *lgrp;
1524
1525 if (hand == LGRP_NULL_HANDLE)
1526 return (NULL);
1527
1528 for (i = 0; i <= lgrp_alloc_max; i++) {
1529 lgrp = lgrp_table[i];
1530 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1531 return (lgrp);
1532 }
1533 return (NULL);
1534 }
1535
1536 /*
1537 * Return the home lgroup of the current thread.
1538 * We must do this with kernel preemption disabled, since we don't want our
1539 * thread to be re-homed while we're poking around with its lpl, and the lpl
1540 * should never be NULL.
1541 *
1542 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1543 * is enabled because of DR. Callers can use disable kernel preemption
1544 * around this call to guarantee that the lgroup will be valid beyond this
1545 * routine, since kernel preemption can be recursive.
1546 */
1547 lgrp_t *
lgrp_home_lgrp(void)1548 lgrp_home_lgrp(void)
1549 {
1550 lgrp_t *lgrp;
1551 lpl_t *lpl;
1552
1553 kpreempt_disable();
1554
1555 lpl = curthread->t_lpl;
1556 ASSERT(lpl != NULL);
1557 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1558 ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1559 lgrp = lgrp_table[lpl->lpl_lgrpid];
1560
1561 kpreempt_enable();
1562
1563 return (lgrp);
1564 }
1565
1566 /*
1567 * Return ID of home lgroup for given thread
1568 * (See comments for lgrp_home_lgrp() for special care and handling
1569 * instructions)
1570 */
1571 lgrp_id_t
lgrp_home_id(kthread_t * t)1572 lgrp_home_id(kthread_t *t)
1573 {
1574 lgrp_id_t lgrp;
1575 lpl_t *lpl;
1576
1577 ASSERT(t != NULL);
1578 /*
1579 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1580 * cannot since the HAT layer can call into this routine to
1581 * determine the locality for its data structures in the context
1582 * of a page fault.
1583 */
1584
1585 kpreempt_disable();
1586
1587 lpl = t->t_lpl;
1588 ASSERT(lpl != NULL);
1589 ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1590 lgrp = lpl->lpl_lgrpid;
1591
1592 kpreempt_enable();
1593
1594 return (lgrp);
1595 }
1596
1597 /*
1598 * Return lgroup containing the physical memory for the given page frame number
1599 */
1600 lgrp_t *
lgrp_pfn_to_lgrp(pfn_t pfn)1601 lgrp_pfn_to_lgrp(pfn_t pfn)
1602 {
1603 lgrp_handle_t hand;
1604 int i;
1605 lgrp_t *lgrp;
1606
1607 hand = lgrp_plat_pfn_to_hand(pfn);
1608 if (hand != LGRP_NULL_HANDLE)
1609 for (i = 0; i <= lgrp_alloc_max; i++) {
1610 lgrp = lgrp_table[i];
1611 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1612 return (lgrp);
1613 }
1614 return (NULL);
1615 }
1616
1617 /*
1618 * Return lgroup containing the physical memory for the given page frame number
1619 */
1620 lgrp_t *
lgrp_phys_to_lgrp(u_longlong_t physaddr)1621 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1622 {
1623 lgrp_handle_t hand;
1624 int i;
1625 lgrp_t *lgrp;
1626 pfn_t pfn;
1627
1628 pfn = btop(physaddr);
1629 hand = lgrp_plat_pfn_to_hand(pfn);
1630 if (hand != LGRP_NULL_HANDLE)
1631 for (i = 0; i <= lgrp_alloc_max; i++) {
1632 lgrp = lgrp_table[i];
1633 if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1634 return (lgrp);
1635 }
1636 return (NULL);
1637 }
1638
1639 /*
1640 * Return the leaf lgroup containing the given CPU
1641 *
1642 * The caller needs to take precautions necessary to prevent
1643 * "cpu", and it's lpl from going away across a call to this function.
1644 * hint: kpreempt_disable()/kpreempt_enable()
1645 */
1646 static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t * cpu)1647 lgrp_cpu_to_lgrp(cpu_t *cpu)
1648 {
1649 return (cpu->cpu_lpl->lpl_lgrp);
1650 }
1651
1652 /*
1653 * Return the sum of the partition loads in an lgrp divided by
1654 * the number of CPUs in the lgrp. This is our best approximation
1655 * of an 'lgroup load average' for a useful per-lgroup kstat.
1656 */
1657 static uint64_t
lgrp_sum_loadavgs(lgrp_t * lgrp)1658 lgrp_sum_loadavgs(lgrp_t *lgrp)
1659 {
1660 cpu_t *cpu;
1661 int ncpu;
1662 uint64_t loads = 0;
1663
1664 mutex_enter(&cpu_lock);
1665
1666 cpu = lgrp->lgrp_cpu;
1667 ncpu = lgrp->lgrp_cpucnt;
1668
1669 if (cpu == NULL || ncpu == 0) {
1670 mutex_exit(&cpu_lock);
1671 return (0ull);
1672 }
1673
1674 do {
1675 loads += cpu->cpu_lpl->lpl_loadavg;
1676 cpu = cpu->cpu_next_lgrp;
1677 } while (cpu != lgrp->lgrp_cpu);
1678
1679 mutex_exit(&cpu_lock);
1680
1681 return (loads / ncpu);
1682 }
1683
1684 void
lgrp_stat_add(lgrp_id_t lgrpid,lgrp_stat_t stat,int64_t val)1685 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1686 {
1687 struct lgrp_stats *pstats;
1688
1689 /*
1690 * Verify that the caller isn't trying to add to
1691 * a statistic for an lgroup that has gone away
1692 */
1693 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1694 return;
1695
1696 pstats = &lgrp_stats[lgrpid];
1697 atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1698 }
1699
1700 int64_t
lgrp_stat_read(lgrp_id_t lgrpid,lgrp_stat_t stat)1701 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1702 {
1703 uint64_t val;
1704 struct lgrp_stats *pstats;
1705
1706 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1707 return ((int64_t)0);
1708
1709 pstats = &lgrp_stats[lgrpid];
1710 LGRP_STAT_READ(pstats, stat, val);
1711 return (val);
1712 }
1713
1714 /*
1715 * Reset all kstats for lgrp specified by its lgrpid.
1716 */
1717 static void
lgrp_kstat_reset(lgrp_id_t lgrpid)1718 lgrp_kstat_reset(lgrp_id_t lgrpid)
1719 {
1720 lgrp_stat_t stat;
1721
1722 if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1723 return;
1724
1725 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1726 LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1727 }
1728 }
1729
1730 /*
1731 * Collect all per-lgrp statistics for the lgrp associated with this
1732 * kstat, and store them in the ks_data array.
1733 *
1734 * The superuser can reset all the running counter statistics for an
1735 * lgrp by writing to any of the lgrp's stats.
1736 */
1737 static int
lgrp_kstat_extract(kstat_t * ksp,int rw)1738 lgrp_kstat_extract(kstat_t *ksp, int rw)
1739 {
1740 lgrp_stat_t stat;
1741 struct kstat_named *ksd;
1742 lgrp_t *lgrp;
1743 lgrp_id_t lgrpid;
1744
1745 lgrp = (lgrp_t *)ksp->ks_private;
1746
1747 ksd = (struct kstat_named *)ksp->ks_data;
1748 ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1749
1750 lgrpid = lgrp->lgrp_id;
1751
1752 if (lgrpid == LGRP_NONE) {
1753 /*
1754 * Return all zeroes as stats for freed lgrp.
1755 */
1756 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1757 ksd[stat].value.i64 = 0;
1758 }
1759 ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1760 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1761 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1762 ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1763 ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1764 } else if (rw != KSTAT_WRITE) {
1765 /*
1766 * Handle counter stats
1767 */
1768 for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1769 ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1770 }
1771
1772 /*
1773 * Handle kernel data snapshot stats
1774 */
1775 ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1776 ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1777 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1778 ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1779 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1780 ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1781 lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1782 ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1783 ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
1784 lgrp_loadavg_max_effect;
1785 } else {
1786 lgrp_kstat_reset(lgrpid);
1787 }
1788
1789 return (0);
1790 }
1791
1792 int
lgrp_query_cpu(processorid_t id,lgrp_id_t * lp)1793 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1794 {
1795 cpu_t *cp;
1796
1797 mutex_enter(&cpu_lock);
1798
1799 if ((cp = cpu_get(id)) == NULL) {
1800 mutex_exit(&cpu_lock);
1801 return (EINVAL);
1802 }
1803
1804 if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1805 mutex_exit(&cpu_lock);
1806 return (EINVAL);
1807 }
1808
1809 ASSERT(cp->cpu_lpl != NULL);
1810
1811 *lp = cp->cpu_lpl->lpl_lgrpid;
1812
1813 mutex_exit(&cpu_lock);
1814
1815 return (0);
1816 }
1817
1818 int
lgrp_query_load(processorid_t id,lgrp_load_t * lp)1819 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1820 {
1821 cpu_t *cp;
1822
1823 mutex_enter(&cpu_lock);
1824
1825 if ((cp = cpu_get(id)) == NULL) {
1826 mutex_exit(&cpu_lock);
1827 return (EINVAL);
1828 }
1829
1830 ASSERT(cp->cpu_lpl != NULL);
1831
1832 *lp = cp->cpu_lpl->lpl_loadavg;
1833
1834 mutex_exit(&cpu_lock);
1835
1836 return (0);
1837 }
1838
/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. It is adjusted here, as this is presently the only place that we
 * can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL. (This list is required to be NULL
 * terminated, too). This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order. We hope this
 * will improve performance a bit. (Hopefully, less read-to-own traffic...)
 *
 * The rset is kept sorted by lpl_lgrpid, and lpl_id2rset maps each lgrp id
 * back to its slot in the rset array; both are maintained together below.
 */

void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int		i;
	int		entry_slot = 0;

	/* return if leaf is already present */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf) {
			return;
		}

		/* stop at the first entry that sorts after the new leaf */
		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
		    lpl_leaf->lpl_lgrpid) {
			break;
		}
	}

	/* insert leaf, update counts */
	entry_slot = i;
	i = lpl_target->lpl_nrset++;

	/*
	 * Start at the end of the rset array and work backwards towards the
	 * slot into which the new lpl will be inserted. This effectively
	 * preserves the current ordering by scooting everybody over one entry,
	 * and placing the new entry into the space created.
	 */
	while (i-- > entry_slot) {
		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
		lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
		    i + 1;
	}

	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;

	/* target's CPU count grows by everything the new leaf brings in */
	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}
1892
1893 /*
1894 * Update each of lpl_parent's children with a reference to their parent.
1895 * The lgrp topology is used as the reference since it is fully
1896 * consistent and correct at this point.
1897 * This should be called after any potential change in lpl_parent's
1898 * rset.
1899 */
1900 static void
lpl_child_update(lpl_t * lpl_parent,struct cpupart * cp)1901 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1902 {
1903 klgrpset_t children;
1904 int i;
1905
1906 children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1907 if (klgrpset_isempty(children))
1908 return; /* nothing to do */
1909
1910 for (i = 0; i <= lgrp_alloc_max; i++) {
1911 if (klgrpset_ismember(children, i)) {
1912 /*
1913 * (Re)set the parent. It may be incorrect if
1914 * lpl_parent is new in the topology.
1915 */
1916 cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1917 }
1918 }
1919 }
1920
/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we can
 * be certain a resource was successfully deleted.
 *
 * The rset array is NULL terminated, so rset[nrset] is a valid slot; the
 * compress loop below relies on that terminator to shift entries (and the
 * trailing NULL) down by one. lpl_id2rset is kept in sync throughout.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
	int i;
	lpl_t *leaf;

	if (lpl_target->lpl_nrset == 0)
		return;

	/* find leaf in intermediate node */
	for (i = 0; i < lpl_target->lpl_nrset; i++) {
		if (lpl_target->lpl_rset[i] == lpl_leaf)
			break;
	}

	/* return if leaf not found (rset[nrset] is the NULL terminator) */
	if (lpl_target->lpl_rset[i] != lpl_leaf)
		return;

	/* prune leaf, compress array */
	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
	lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
	lpl_target->lpl_ncpu--;
	do {
		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
		/*
		 * Update the lgrp id <=> rset mapping
		 */
		if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
			lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
		}
	} while (i++ < lpl_target->lpl_nrset);
}
1961
1962 /*
1963 * Check to see if the resource set of the target lpl contains the
1964 * supplied leaf lpl. This returns 1 if the lpl is found, 0 if it is not.
1965 */
1966
1967 int
lpl_rset_contains(lpl_t * lpl_target,lpl_t * lpl_leaf)1968 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1969 {
1970 int i;
1971
1972 for (i = 0; i < lpl_target->lpl_nrset; i++) {
1973 if (lpl_target->lpl_rset[i] == lpl_leaf)
1974 return (1);
1975 }
1976
1977 return (0);
1978 }
1979
1980 /*
1981 * Called when we change cpu lpl membership. This increments or decrements the
1982 * per-cpu counter in every lpl in which our leaf appears.
1983 */
1984 void
lpl_cpu_adjcnt(lpl_act_t act,cpu_t * cp)1985 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1986 {
1987 cpupart_t *cpupart;
1988 lgrp_t *lgrp_leaf;
1989 lgrp_t *lgrp_cur;
1990 lpl_t *lpl_leaf;
1991 lpl_t *lpl_cur;
1992 int i;
1993
1994 ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
1995
1996 cpupart = cp->cpu_part;
1997 lpl_leaf = cp->cpu_lpl;
1998 lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
1999
2000 for (i = 0; i <= lgrp_alloc_max; i++) {
2001 lgrp_cur = lgrp_table[i];
2002
2003 /*
2004 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2005 * for the cpu in question, or if the current lgrp and leaf
2006 * don't share the same resources.
2007 */
2008
2009 if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2010 !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2011 lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2012 continue;
2013
2014
2015 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2016
2017 if (lpl_cur->lpl_nrset > 0) {
2018 if (act == LPL_INCREMENT) {
2019 lpl_cur->lpl_ncpu++;
2020 } else if (act == LPL_DECREMENT) {
2021 lpl_cur->lpl_ncpu--;
2022 }
2023 }
2024 }
2025 }
2026
2027 /*
2028 * Initialize lpl with given resources and specified lgrp
2029 */
2030 void
lpl_init(lpl_t * lpl,lpl_t * lpl_leaf,lgrp_t * lgrp)2031 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2032 {
2033 lpl->lpl_lgrpid = lgrp->lgrp_id;
2034 lpl->lpl_loadavg = 0;
2035 if (lpl == lpl_leaf)
2036 lpl->lpl_ncpu = 1;
2037 else
2038 lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2039 lpl->lpl_nrset = 1;
2040 lpl->lpl_rset[0] = lpl_leaf;
2041 lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
2042 lpl->lpl_lgrp = lgrp;
2043 lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2044 lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2045 }
2046
2047 /*
2048 * Clear an unused lpl
2049 */
2050 void
lpl_clear(lpl_t * lpl)2051 lpl_clear(lpl_t *lpl)
2052 {
2053 /*
2054 * Clear out all fields in the lpl except:
2055 * lpl_lgrpid - to facilitate debugging
2056 * lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
2057 *
2058 * Note that the lpl's rset and id2rset mapping are cleared as well.
2059 */
2060 lpl->lpl_loadavg = 0;
2061 lpl->lpl_ncpu = 0;
2062 lpl->lpl_lgrp = NULL;
2063 lpl->lpl_parent = NULL;
2064 lpl->lpl_cpus = NULL;
2065 lpl->lpl_nrset = 0;
2066 lpl->lpl_homed_time = 0;
2067 bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
2068 bzero(lpl->lpl_id2rset,
2069 sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
2070 }
2071
2072 /*
2073 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2074 * is in sync with the lgroup toplogy in the system. The lpl topology may not
2075 * make full use of all of the lgroup topology, but this checks to make sure
2076 * that for the parts that it does use, it has correctly understood the
2077 * relationships that exist. This function returns
2078 * 0 if the topology is correct, and a non-zero error code, for non-debug
2079 * kernels if incorrect. Asserts are spread throughout the code to aid in
2080 * debugging on a DEBUG kernel.
2081 */
int
lpl_topo_verify(cpupart_t *cpupart)
{
	lgrp_t		*lgrp;		/* lgrp currently being cross-checked */
	lpl_t		*lpl;		/* partition's lpl for that lgrp */
	klgrpset_t	rset;		/* lpl rset vs. lgrp CPU resources */
	klgrpset_t	cset;		/* lpl rset vs. partition membership */
	cpu_t		*cpu;
	cpu_t		*cp_start;	/* head of lpl's circular cpu list */
	int		i;
	int		j;
	int		sum;

	/* topology can't be incorrect if it doesn't exist */
	if (!lgrp_topo_initialized || !lgrp_initialized)
		return (LPL_TOPO_CORRECT);

	ASSERT(cpupart != NULL);

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp = lgrp_table[i];
		lpl = NULL;
		/* make sure lpls are allocated */
		ASSERT(cpupart->cp_lgrploads);
		if (!cpupart->cp_lgrploads)
			return (LPL_TOPO_PART_HAS_NO_LPL);

		/* lpls are indexed identically to the lgrp table */
		lpl = &cpupart->cp_lgrploads[i];
		/* make sure our index is good */
		ASSERT(i < cpupart->cp_nlgrploads);

		/* if lgroup doesn't exist, make sure lpl is empty */
		if (!LGRP_EXISTS(lgrp)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			} else {
				continue;
			}
		}

		/* verify that lgroup and lpl are identically numbered */
		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);

		/* if lgroup isn't in our partition, make sure lpl is empty */
		if (!klgrpset_intersects(lgrp->lgrp_leaves,
		    cpupart->cp_lgrpset)) {
			ASSERT(lpl->lpl_ncpu == 0);
			if (lpl->lpl_ncpu > 0) {
				return (LPL_TOPO_CPUS_NOT_EMPTY);
			}
			/*
			 * lpl is empty, and lgroup isn't in partition. verify
			 * that lpl doesn't show up in anyone else's rsets (in
			 * this partition, anyway)
			 */
			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
				lpl_t *i_lpl; /* lpl we're iterating over */

				i_lpl = &cpupart->cp_lgrploads[j];

				ASSERT(!lpl_rset_contains(i_lpl, lpl));
				if (lpl_rset_contains(i_lpl, lpl)) {
					return (LPL_TOPO_LPL_ORPHANED);
				}
			}
			/* lgroup is empty, and everything is ok. continue */
			continue;
		}


		/* lgroup is in this partition, now check it against lpl */

		/* do both have matching lgrps? */
		ASSERT(lgrp == lpl->lpl_lgrp);
		if (lgrp != lpl->lpl_lgrp) {
			return (LPL_TOPO_LGRP_MISMATCH);
		}

		/* do the parent lgroups exist and do they match? */
		if (lgrp->lgrp_parent) {
			ASSERT(lpl->lpl_parent != NULL &&
			    lgrp->lgrp_parent->lgrp_id ==
			    lpl->lpl_parent->lpl_lgrpid);

			if (!lpl->lpl_parent) {
				return (LPL_TOPO_MISSING_PARENT);
			} else if (lgrp->lgrp_parent->lgrp_id !=
			    lpl->lpl_parent->lpl_lgrpid) {
				return (LPL_TOPO_PARENT_MISMATCH);
			}
		}

		/*
		 * only leaf lgroups keep a cpucnt, only check leaves. An lpl
		 * is a leaf iff its rset contains exactly itself.
		 */
		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {

			/* verify that lgrp is also a leaf */
			ASSERT((lgrp->lgrp_childcnt == 0) &&
			    (klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid)));

			if ((lgrp->lgrp_childcnt > 0) ||
			    (!klgrpset_ismember(lgrp->lgrp_leaves,
			    lpl->lpl_lgrpid))) {
				return (LPL_TOPO_LGRP_NOT_LEAF);
			}

			/*
			 * The lgrp counts cpus across all partitions, so it
			 * must be at least as large as this partition's count,
			 * which in turn must be non-zero here.
			 */
			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
			    (lpl->lpl_ncpu > 0));
			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
			    (lpl->lpl_ncpu <= 0)) {
				return (LPL_TOPO_BAD_CPUCNT);
			}

			/*
			 * Check that lpl_ncpu also matches the number of
			 * cpus in the lpl's linked list. This only exists in
			 * leaves, but they should always match.
			 */
			j = 0;
			cpu = cp_start = lpl->lpl_cpus;
			while (cpu != NULL) {
				j++;

				/* check to make sure cpu's lpl is leaf lpl */
				ASSERT(cpu->cpu_lpl == lpl);
				if (cpu->cpu_lpl != lpl) {
					return (LPL_TOPO_CPU_HAS_BAD_LPL);
				}

				/*
				 * check next cpu; the list is circular, so
				 * stop once we come back around to the start
				 */
				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
					continue;
				} else {
					cpu = NULL;
				}
			}

			ASSERT(j == lpl->lpl_ncpu);
			if (j != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}

			/*
			 * Also, check that leaf lpl is contained in all
			 * intermediate lpls that name the leaf as a descendant
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				klgrpset_t intersect;
				lgrp_t *lgrp_cand;
				lpl_t *lpl_cand;

				lgrp_cand = lgrp_table[j];
				intersect = klgrpset_intersects(
				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
				    cpupart->cp_lgrpset);

				/* skip candidates not in this partition */
				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
				    cpupart->cp_lgrpset) ||
				    (intersect == 0))
					continue;

				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];

				if (klgrpset_ismember(intersect,
				    lgrp->lgrp_id)) {
					ASSERT(lpl_rset_contains(lpl_cand,
					    lpl));

					if (!lpl_rset_contains(lpl_cand, lpl)) {
						return (LPL_TOPO_RSET_MSSNG_LF);
					}
				}
			}

		} else { /* non-leaf specific checks */

			/*
			 * Non-leaf lpls should have lpl_cpus == NULL
			 * verify that this is so
			 */
			ASSERT(lpl->lpl_cpus == NULL);
			if (lpl->lpl_cpus != NULL) {
				return (LPL_TOPO_NONLEAF_HAS_CPUS);
			}

			/*
			 * verify that the sum of the cpus in the leaf resources
			 * is equal to the total ncpu in the intermediate
			 */
			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
				sum += lpl->lpl_rset[j]->lpl_ncpu;
			}

			ASSERT(sum == lpl->lpl_ncpu);
			if (sum != lpl->lpl_ncpu) {
				return (LPL_TOPO_LPL_BAD_NCPU);
			}
		}

		/*
		 * Check the rset of the lpl in question. Make sure that each
		 * rset contains a subset of the resources in
		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset. This also makes
		 * sure that each rset doesn't include resources that are
		 * outside of that set. (Which would be resources somehow not
		 * accounted for).
		 */
		klgrpset_clear(rset);
		for (j = 0; j < lpl->lpl_nrset; j++) {
			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
		}
		klgrpset_copy(cset, rset);
		/* make sure lpl rset matches lgrp rset */
		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
		/* make sure rset is contained with in partition, too */
		klgrpset_diff(cset, cpupart->cp_lgrpset);

		/* both differences must be empty for the rset to be valid */
		ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
		if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
			return (LPL_TOPO_RSET_MISMATCH);
		}

		/*
		 * check to make sure lpl_nrset matches the number of rsets
		 * contained in the lpl
		 */
		for (j = 0; j < lpl->lpl_nrset; j++) {
			if (lpl->lpl_rset[j] == NULL)
				break;
		}

		ASSERT(j == lpl->lpl_nrset);
		if (j != lpl->lpl_nrset) {
			return (LPL_TOPO_BAD_RSETCNT);
		}

	}
	return (LPL_TOPO_CORRECT);
}
2324
2325 /*
2326 * Flatten lpl topology to given number of levels. This is presently only
2327 * implemented for a flatten to 2 levels, which will prune out the intermediates
2328 * and home the leaf lpls to the root lpl.
2329 */
int
lpl_topo_flatten(int levels)
{
	int	i;
	uint_t	sum;
	lgrp_t	*lgrp_cur;
	lpl_t	*lpl_cur;
	lpl_t	*lpl_root;
	cpupart_t	*cp;

	/* only a flatten to 2 levels (root + leaves) is implemented */
	if (levels != 2)
		return (0);

	/* called w/ cpus paused - grab no locks! */
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    !lgrp_initialized);

	/* walk every cpu partition's lpl list (circular list) */
	cp = cp_list_head;
	do {
		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));

		for (i = 0; i <= lgrp_alloc_max; i++) {
			lgrp_cur = lgrp_table[i];
			lpl_cur = &cp->cp_lgrploads[i];

			/* skip the root itself and fully-empty slots */
			if ((lgrp_cur == lgrp_root) ||
			    (!LGRP_EXISTS(lgrp_cur) &&
			    (lpl_cur->lpl_ncpu == 0)))
				continue;

			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
				/*
				 * this should be a deleted intermediate, so
				 * clear it
				 */
				lpl_clear(lpl_cur);
			} else if ((lpl_cur->lpl_nrset == 1) &&
			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
				/*
				 * this is a leaf whose parent was deleted, or
				 * whose parent had their lgrp deleted. (And
				 * whose parent will soon be deleted). Point
				 * this guy back to the root lpl.
				 */
				lpl_cur->lpl_parent = lpl_root;
				lpl_rset_add(lpl_root, lpl_cur);
			}

		}

		/*
		 * Now that we're done, make sure the count on the root lpl is
		 * correct, and update the hints of the children for the sake of
		 * thoroughness
		 */
		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
		}
		lpl_root->lpl_ncpu = sum;
		lpl_child_update(lpl_root, cp);

		cp = cp->cp_next;
	} while (cp != cp_list_head);

	/* returns the number of levels flattened to (i.e. 2) on success */
	return (levels);
}
2399
2400 /*
2401 * Insert a lpl into the resource hierarchy and create any additional lpls that
2402 * are necessary to represent the varying states of locality for the cpu
 * resources newly added to the partition.
2404 *
2405 * This routine is clever enough that it can correctly add resources from the
2406 * new leaf into both direct and indirect resource sets in the hierarchy. (Ie,
2407 * those for which the lpl is a leaf as opposed to simply a named equally local
2408 * resource). The one special case that needs additional processing is when a
2409 * new intermediate lpl is introduced. Since the main loop only traverses
2410 * looking to add the leaf resource where it does not yet exist, additional work
2411 * is necessary to add other leaf resources that may need to exist in the newly
2412 * created intermediate. This is performed by the second inner loop, and is
2413 * only done when the check for more than one overlapping resource succeeds.
2414 */
2415
void
lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
	int		i;
	int		j;
	int		rset_num_intersect;
	lgrp_t		*lgrp_cur;
	lpl_t		*lpl_cur;
	lpl_t		*lpl_parent;
	lgrp_id_t	parent_id;
	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */

	for (i = 0; i <= lgrp_alloc_max; i++) {
		lgrp_cur = lgrp_table[i];

		/*
		 * Don't insert if the lgrp isn't there, if the leaf isn't
		 * contained within the current lgrp, or if the current lgrp has
		 * no leaves in this partition
		 */

		if (!LGRP_EXISTS(lgrp_cur) ||
		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    lpl_leaf->lpl_lgrpid) ||
		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
		    cpupart->cp_lgrpset))
			continue;

		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
		if (lgrp_cur->lgrp_parent != NULL) {
			/* if lgrp has a parent, assign it properly */
			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
			lpl_parent = &cpupart->cp_lgrploads[parent_id];
		} else {
			/* if not, make sure parent ptr gets set to null */
			lpl_parent = NULL;
		}

		if (lpl_cur == lpl_leaf) {
			/*
			 * Almost all leaf state was initialized elsewhere. The
			 * only thing left to do is to set the parent.
			 */
			lpl_cur->lpl_parent = lpl_parent;
			continue;
		}

		/*
		 * Rebuild this (non-leaf) lpl from scratch: clear stale
		 * state, then re-initialize it with the new leaf as its
		 * first rset member.
		 */
		lpl_clear(lpl_cur);
		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);

		lpl_cur->lpl_parent = lpl_parent;

		/* does new lpl need to be populated with other resources? */
		rset_intersect =
		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
		    cpupart->cp_lgrpset);
		klgrpset_nlgrps(rset_intersect, rset_num_intersect);

		if (rset_num_intersect > 1) {
			/*
			 * If so, figure out what lpls have resources that
			 * intersect this one, and add them.
			 */
			for (j = 0; j <= lgrp_alloc_max; j++) {
				lgrp_t	*lgrp_cand;	/* candidate lgrp */
				lpl_t	*lpl_cand;	/* candidate lpl */

				lgrp_cand = lgrp_table[j];
				if (!LGRP_EXISTS(lgrp_cand) ||
				    !klgrpset_ismember(rset_intersect,
				    lgrp_cand->lgrp_id))
					continue;
				lpl_cand =
				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
				lpl_rset_add(lpl_cur, lpl_cand);
			}
		}
		/*
		 * This lpl's rset has changed. Update the hint in it's
		 * children.
		 */
		lpl_child_update(lpl_cur, cpupart);
	}
}
2500
2501 /*
2502 * remove a lpl from the hierarchy of resources, clearing its state when
2503 * finished. If the lpls at the intermediate levels of the hierarchy have no
2504 * remaining resources, or no longer name a leaf resource in the cpu-partition,
2505 * delete them as well.
2506 */
2507
2508 void
lpl_leaf_remove(lpl_t * lpl_leaf,cpupart_t * cpupart)2509 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2510 {
2511 int i;
2512 lgrp_t *lgrp_cur;
2513 lpl_t *lpl_cur;
2514 klgrpset_t leaf_intersect; /* intersection of leaves */
2515
2516 for (i = 0; i <= lgrp_alloc_max; i++) {
2517 lgrp_cur = lgrp_table[i];
2518
2519 /*
2520 * Don't attempt to remove from lgrps that aren't there, that
2521 * don't contain our leaf, or from the leaf itself. (We do that
2522 * later)
2523 */
2524
2525 if (!LGRP_EXISTS(lgrp_cur))
2526 continue;
2527
2528 lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2529
2530 if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2531 lpl_leaf->lpl_lgrpid) ||
2532 (lpl_cur == lpl_leaf)) {
2533 continue;
2534 }
2535
2536 /*
2537 * This is a slightly sleazy simplification in that we have
2538 * already marked the cp_lgrpset as no longer containing the
2539 * leaf we've deleted. Any lpls that pass the above checks
2540 * based upon lgrp membership but not necessarily cpu-part
2541 * membership also get cleared by the checks below. Currently
2542 * this is harmless, as the lpls should be empty anyway.
2543 *
2544 * In particular, we want to preserve lpls that have additional
2545 * leaf resources, even though we don't yet have a processor
2546 * architecture that represents resources this way.
2547 */
2548
2549 leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2550 cpupart->cp_lgrpset);
2551
2552 lpl_rset_del(lpl_cur, lpl_leaf);
2553 if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2554 lpl_clear(lpl_cur);
2555 } else {
2556 /*
2557 * Update this lpl's children
2558 */
2559 lpl_child_update(lpl_cur, cpupart);
2560 }
2561 }
2562 lpl_clear(lpl_leaf);
2563 }
2564
2565 /*
 * add a cpu to a partition in terms of lgrp load avg bookkeeping
2567 *
2568 * The lpl (cpu partition load average information) is now arranged in a
2569 * hierarchical fashion whereby resources that are closest, ie. most local, to
2570 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two general cases for cpu addition:
2572 *
2573 * 1. A lpl structure that contains resources already in the hierarchy tree.
2574 * In this case, all of the associated lpl relationships have been defined, and
2575 * all that is necessary is that we link the new cpu into the per-lpl list of
2576 * cpus, and increment the ncpu count of all places where this cpu resource will
2577 * be accounted for. lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2578 * pushing is accomplished by this routine.
2579 *
2580 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2581 * not exist yet. In this case, it is necessary to build the leaf lpl, and
 * construct the hierarchy of state necessary to name its more distant
2583 * resources, if they should exist. The leaf structure is initialized by this
2584 * routine, as is the cpu-partition state for the lgrp membership. This routine
2585 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
 * and builds all of the "ancestral" state necessary to identify resources at
2587 * differing levels of locality.
2588 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
	cpupart_t	*cpupart;
	lgrp_t		*lgrp_leaf;
	lpl_t		*lpl_leaf;

	/* called sometimes w/ cpus paused - grab no locks */
	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	cpupart = cp->cpu_part;
	lgrp_leaf = lgrp_table[lgrpid];

	/* don't add non-existent lgrp */
	ASSERT(LGRP_EXISTS(lgrp_leaf));
	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
	cp->cpu_lpl = lpl_leaf;

	/* only leaf lpls contain cpus */

	if (lpl_leaf->lpl_ncpu++ == 0) {
		/*
		 * First cpu in this lgroup for this partition: build the
		 * leaf lpl, mark the lgroup as belonging to the partition,
		 * and splice the new leaf into the lpl hierarchy.
		 */
		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
		lpl_leaf_insert(lpl_leaf, cpupart);
	} else {
		/*
		 * the lpl should already exist in the parent, so just update
		 * the count of available CPUs
		 */
		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
	}

	/* link cpu into list of cpus in lpl */

	if (lpl_leaf->lpl_cpus) {
		/* insert at the tail of the circular doubly-linked list */
		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
	} else {
		/*
		 * We increment ncpu immediately after we create a new leaf
		 * lpl, so assert that ncpu == 1 for the case where we don't
		 * have any cpu pointers yet.
		 */
		ASSERT(lpl_leaf->lpl_ncpu == 1);
		/* first cpu: a one-element circular list points at itself */
		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
	}

}
2639
2640
2641 /*
 * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2643 *
2644 * The lpl (cpu partition load average information) is now arranged in a
2645 * hierarchical fashion whereby resources that are closest, ie. most local, to
2646 * the cpu in question are considered to be leaves in a tree of resources.
2647 * There are two removal cases in question:
2648 *
2649 * 1. Removal of the resource in the leaf leaves other resources remaining in
2650 * that leaf. (Another cpu still exists at this level of locality). In this
 * case, the count of available cpus is decremented in all associated lpls by
 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
2653 * from the per-cpu lpl list.
2654 *
2655 * 2. Removal of the resource results in the lpl containing no resources. (It's
2656 * empty) In this case, all of what has occurred for the first step must take
2657 * place; however, additionally we must remove the lpl structure itself, prune
2658 * out any stranded lpls that do not directly name a leaf resource, and mark the
2659 * cpu partition in question as no longer containing resources from the lgrp of
 * the lpl that has been deleted. Cpu-partition changes are handled by this
2661 * method, but the lpl_leaf_remove function deals with the details of pruning
2662 * out the empty lpl and any of its orphaned direct ancestors.
2663 */
void
lgrp_part_del_cpu(cpu_t *cp)
{
	lpl_t		*lpl;
	lpl_t		*leaf_lpl;
	lgrp_t		*lgrp_leaf;

	/* called sometimes w/ cpus paused - grab no locks */

	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

	lpl = leaf_lpl = cp->cpu_lpl;
	lgrp_leaf = leaf_lpl->lpl_lgrp;

	/* don't delete a leaf that isn't there */
	ASSERT(LGRP_EXISTS(lgrp_leaf));

	/* no double-deletes */
	ASSERT(lpl->lpl_ncpu);
	if (--lpl->lpl_ncpu == 0) {
		/*
		 * This was the last cpu in this lgroup for this partition,
		 * clear its bit in the partition's lgroup bitmask
		 */
		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);

		/* eliminate remaining lpl link pointers in cpu, lpl */
		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;

		/* prune the now-empty leaf (and any stranded ancestors) */
		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
	} else {

		/* unlink cpu from lists of cpus in lpl */
		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
		/* if cp was the list head, advance the head past it */
		if (lpl->lpl_cpus == cp) {
			lpl->lpl_cpus = cp->cpu_next_lpl;
		}

		/*
		 * Update the cpu count in the lpls associated with parent
		 * lgroups.
		 */
		lpl_cpu_adjcnt(LPL_DECREMENT, cp);

	}
	/* clear cpu's lpl ptr when we're all done */
	cp->cpu_lpl = NULL;
}
2713
2714 /*
2715 * Recompute load average for the specified partition/lgrp fragment.
2716 *
2717 * We rely on the fact that this routine is called from the clock thread
2718 * at a point before the clock thread can block (i.e. before its first
2719 * lock request). Since the clock thread can not be preempted (since it
2720 * runs at highest priority), we know that cpu partitions can not change
2721 * (since doing so would require either the repartition requester or the
2722 * cpu_pause thread to run on this cpu), so we can update the cpu's load
2723 * without grabbing cpu_lock.
2724 */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
	uint_t	ncpu;
	int64_t	old, new, f;

	/*
	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
	 * Decay factor table, indexed by the number of cpus in the lpl.
	 */
	static short expval[] = {
	    0, 3196, 1618, 1083,
	    814, 652, 543, 466,
	    408, 363, 326, 297,
	    272, 251, 233, 218,
	    204, 192, 181, 172,
	    163, 155, 148, 142,
	    136, 130, 125, 121,
	    116, 112, 109, 105
	};

	/* ASSERT (called from clock level) */

	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
	    ((ncpu = lpl->lpl_ncpu) == 0)) {
		return;
	}

	/* walk from this lpl up through its ancestors to the root */
	for (;;) {

		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
			f = expval[1]/ncpu; /* good approx. for large ncpu */
		else
			f = expval[ncpu];

		/*
		 * Modify the load average atomically to avoid losing
		 * anticipatory load updates (see lgrp_move_thread()).
		 */
		if (ageflag) {
			/*
			 * We're supposed to both update and age the load.
			 * This happens 10 times/sec. per cpu.  We do a
			 * little hoop-jumping to avoid integer overflow.
			 */
			int64_t		q, r;

			do {
				old = new = lpl->lpl_loadavg;
				/* split into high/low halves, pre-scaled */
				q = (old >> 16) << 7;
				r = (old & 0xffff) << 7;
				new += ((long long)(nrcpus - q) * f -
				    ((r * f) >> 16)) >> 7;

				/*
				 * Check for overflow
				 */
				if (new > LGRP_LOADAVG_MAX)
					new = LGRP_LOADAVG_MAX;
				else if (new < 0)
					new = 0;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		} else {
			/*
			 * We're supposed to update the load, but not age it.
			 * This option is used to update the load (which either
			 * has already been aged in this 1/10 sec. interval or
			 * soon will be) to account for a remotely executing
			 * thread.
			 */
			do {
				old = new = lpl->lpl_loadavg;
				new += f;
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = LGRP_LOADAVG_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);
		}

		/*
		 * Do the same for this lpl's parent
		 */
		if ((lpl = lpl->lpl_parent) == NULL)
			break;
		ncpu = lpl->lpl_ncpu;
	}
}
2816
2817 /*
2818 * Initialize lpl topology in the target based on topology currently present in
2819 * lpl_bootstrap.
2820 *
2821 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2822 * initialize cp_default list of lpls. Up to this point all topology operations
2823 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2824 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2825 * `target' points to the list of lpls in cp_default and `size' is the size of
2826 * this list.
2827 *
 * This function walks the lpl topology in lpl_bootstrap and does four things:
2829 *
2830 * 1) Copies all fields from lpl_bootstrap to the target.
2831 *
2832 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2833 *
2834 * 3) Updates lpl_parent pointers to point to the lpls in the target list
2835 * instead of lpl_bootstrap.
2836 *
2837 * 4) Updates pointers in the resource list of the target to point to the lpls
2838 * in the target list instead of lpl_bootstrap.
2839 *
2840 * After lpl_topo_bootstrap() completes, target contains the same information
2841 * that would be present there if it were used during boot instead of
 * lpl_bootstrap. The information in lpl_bootstrap is no longer needed after
 * this, so it is bzeroed.
2844 */
void
lpl_topo_bootstrap(lpl_t *target, int size)
{
	lpl_t	*lpl = lpl_bootstrap;
	lpl_t	*target_lpl = target;
	lpl_t	**rset;		/* target's own rset storage, preserved */
	int	*id2rset;	/* target's own id->rset map, preserved */
	int	sz;		/* target's own rset storage size, preserved */
	int	howmany;
	int	id;
	int	i;

	/*
	 * The only target that should be passed here is cp_default lpl list.
	 */
	ASSERT(target == cp_default.cp_lgrploads);
	ASSERT(size == cp_default.cp_nlgrploads);
	ASSERT(!lgrp_topo_initialized);
	ASSERT(ncpus == 1);

	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
		/*
		 * Copy all fields from lpl, except for the rset,
		 * lgrp id <=> rset mapping storage,
		 * and amount of storage
		 */
		rset = target_lpl->lpl_rset;
		id2rset = target_lpl->lpl_id2rset;
		sz = target_lpl->lpl_rset_sz;

		/* struct copy; restore target's own storage below */
		*target_lpl = *lpl;

		target_lpl->lpl_rset_sz = sz;
		target_lpl->lpl_rset = rset;
		target_lpl->lpl_id2rset = id2rset;

		/*
		 * Substitute CPU0 lpl pointer with one relative to target.
		 */
		if (lpl->lpl_cpus == CPU) {
			ASSERT(CPU->cpu_lpl == lpl);
			CPU->cpu_lpl = target_lpl;
		}

		/*
		 * Substitute parent information with parent relative to
		 * target.  The parent's offset within lpl_bootstrap is the
		 * same as its offset within target, so relocate by that
		 * offset.
		 */
		if (lpl->lpl_parent != NULL)
			target_lpl->lpl_parent = (lpl_t *)
			    (((uintptr_t)lpl->lpl_parent -
			    (uintptr_t)lpl_bootstrap) +
			    (uintptr_t)target);

		/*
		 * Walk over resource set substituting pointers relative to
		 * lpl_bootstrap's rset to pointers relative to target's
		 */
		ASSERT(lpl->lpl_nrset <= 1);

		for (id = 0; id < lpl->lpl_nrset; id++) {
			if (lpl->lpl_rset[id] != NULL) {
				/* same offset-based relocation as above */
				target_lpl->lpl_rset[id] = (lpl_t *)
				    (((uintptr_t)lpl->lpl_rset[id] -
				    (uintptr_t)lpl_bootstrap) +
				    (uintptr_t)target);
			}
			target_lpl->lpl_id2rset[id] =
			    lpl->lpl_id2rset[id];
		}
	}

	/*
	 * Clean up the bootstrap lpls since we have switched over to the
	 * actual lpl array in the default cpu partition.
	 *
	 * We still need to keep one empty lpl around for newly starting
	 * slave CPUs to reference should they need to make it through the
	 * dispatcher prior to their lgrp/lpl initialization.
	 *
	 * The lpl related dispatcher code has been designed to work properly
	 * (and without extra checks) for this special case of a zero'ed
	 * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
	 * with lgrpid 0 and an empty resource set. Iteration over the rset
	 * array by the dispatcher is also NULL terminated for this reason.
	 *
	 * This provides the desired behaviour for an uninitialized CPU.
	 * It shouldn't see any other CPU to either dispatch to or steal
	 * from until it is properly initialized.
	 */
	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
	bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
	bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));

	/* re-attach the (zeroed) rset storage to the placeholder lpl */
	lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
	lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
}
2942
2943 /*
2944 * If the lowest load among the lgroups a process' threads are currently
2945 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2946 * expanding the process to a new lgroup.
2947 */
2948 #define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2949 lgrp_load_t lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2950
2951 #define LGRP_EXPAND_PROC_THRESH(ncpu) \
2952 ((lgrp_expand_proc_thresh) / (ncpu))
2953
2954 /*
2955 * A process will be expanded to a new lgroup only if the difference between
2956 * the lowest load on the lgroups the process' thread's are currently spread
2957 * across and the lowest load on the other lgroups in the process' partition
2958 * is greater than lgrp_expand_proc_diff.
2959 */
2960 #define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2961 lgrp_load_t lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2962
2963 #define LGRP_EXPAND_PROC_DIFF(ncpu) \
2964 ((lgrp_expand_proc_diff) / (ncpu))
2965
2966 /*
2967 * The loadavg tolerance accounts for "noise" inherent in the load, which may
2968 * be present due to impreciseness of the load average decay algorithm.
2969 *
2970 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2971 * tolerance is scaled by the number of cpus in the lgroup just like
2972 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2973 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2974 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2975 */
2976 uint32_t lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2977 #define LGRP_LOADAVG_TOLERANCE(ncpu) \
2978 ((lgrp_loadavg_tolerance) / ncpu)
2979
2980 /*
2981 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2982 * average is above this threshold
2983 */
2984 uint32_t lgrp_load_thresh = UINT32_MAX;
2985
2986 /*
2987 * lgrp_choose() will try to skip any lgroups with less memory
2988 * than this free when choosing a home lgroup
2989 */
2990 pgcnt_t lgrp_mem_free_thresh = 0;
2991
2992 /*
2993 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
2994 * one based on one of the following policies:
2995 * - Random selection
2996 * - Pseudo round robin placement
2997 * - Longest time since a thread was last placed
2998 */
2999 #define LGRP_CHOOSE_RANDOM 1
3000 #define LGRP_CHOOSE_RR 2
3001 #define LGRP_CHOOSE_TIME 3
3002
3003 int lgrp_choose_policy = LGRP_CHOOSE_TIME;
3004
3005 /*
3006 * Choose a suitable leaf lgroup for a kthread. The kthread is assumed not to
3007 * be bound to a CPU or processor set.
3008 *
3009 * Arguments:
3010 * t The thread
3011 * cpupart The partition the thread belongs to.
3012 *
3013 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3014 * disabled, or thread_lock held (at splhigh) to protect against the CPU
3015 * partitions changing out from under us and assumes that given thread is
3016 * protected. Also, called sometimes w/ cpus paused or kernel preemption
3017 * disabled, so don't grab any locks because we should never block under
3018 * those conditions.
3019 */
3020 lpl_t *
lgrp_choose(kthread_t * t,cpupart_t * cpupart)3021 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3022 {
3023 lgrp_load_t bestload, bestrload;
3024 int lgrpid_offset, lgrp_count;
3025 lgrp_id_t lgrpid, lgrpid_start;
3026 lpl_t *lpl, *bestlpl, *bestrlpl;
3027 klgrpset_t lgrpset;
3028 proc_t *p;
3029
3030 ASSERT(t != NULL);
3031 ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3032 THREAD_LOCK_HELD(t));
3033 ASSERT(cpupart != NULL);
3034
3035 p = t->t_procp;
3036
3037 /* A process should always be in an active partition */
3038 ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3039
3040 bestlpl = bestrlpl = NULL;
3041 bestload = bestrload = LGRP_LOADAVG_MAX;
3042 lgrpset = cpupart->cp_lgrpset;
3043
3044 switch (lgrp_choose_policy) {
3045 case LGRP_CHOOSE_RR:
3046 lgrpid = cpupart->cp_lgrp_hint;
3047 do {
3048 if (++lgrpid > lgrp_alloc_max)
3049 lgrpid = 0;
3050 } while (!klgrpset_ismember(lgrpset, lgrpid));
3051
3052 break;
3053 default:
3054 case LGRP_CHOOSE_TIME:
3055 case LGRP_CHOOSE_RANDOM:
3056 klgrpset_nlgrps(lgrpset, lgrp_count);
3057 lgrpid_offset =
3058 (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3059 for (lgrpid = 0; ; lgrpid++) {
3060 if (klgrpset_ismember(lgrpset, lgrpid)) {
3061 if (--lgrpid_offset == 0)
3062 break;
3063 }
3064 }
3065 break;
3066 }
3067
3068 lgrpid_start = lgrpid;
3069
3070 DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3071 lgrp_id_t, cpupart->cp_lgrp_hint);
3072
3073 /*
3074 * Use lgroup affinities (if any) to choose best lgroup
3075 *
3076 * NOTE: Assumes that thread is protected from going away and its
3077 * lgroup affinities won't change (ie. p_lock, or
3078 * thread_lock() being held and/or CPUs paused)
3079 */
3080 if (t->t_lgrp_affinity) {
3081 lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
3082 if (lpl != NULL)
3083 return (lpl);
3084 }
3085
3086 ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3087
3088 do {
3089 pgcnt_t npgs;
3090
3091 /*
3092 * Skip any lgroups outside of thread's pset
3093 */
3094 if (!klgrpset_ismember(lgrpset, lgrpid)) {
3095 if (++lgrpid > lgrp_alloc_max)
3096 lgrpid = 0; /* wrap the search */
3097 continue;
3098 }
3099
3100 /*
3101 * Skip any non-leaf lgroups
3102 */
3103 if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3104 continue;
3105
3106 /*
3107 * Skip any lgroups without enough free memory
3108 * (when threshold set to nonzero positive value)
3109 */
3110 if (lgrp_mem_free_thresh > 0) {
3111 npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3112 if (npgs < lgrp_mem_free_thresh) {
3113 if (++lgrpid > lgrp_alloc_max)
3114 lgrpid = 0; /* wrap the search */
3115 continue;
3116 }
3117 }
3118
3119 lpl = &cpupart->cp_lgrploads[lgrpid];
3120 if (klgrpset_isempty(p->p_lgrpset) ||
3121 klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3122 /*
3123 * Either this is a new process or the process already
3124 * has threads on this lgrp, so this is a preferred
3125 * lgroup for the thread.
3126 */
3127 if (bestlpl == NULL ||
3128 lpl_pick(lpl, bestlpl)) {
3129 bestload = lpl->lpl_loadavg;
3130 bestlpl = lpl;
3131 }
3132 } else {
3133 /*
3134 * The process doesn't have any threads on this lgrp,
3135 * but we're willing to consider this lgrp if the load
3136 * difference is big enough to justify splitting up
3137 * the process' threads.
3138 */
3139 if (bestrlpl == NULL ||
3140 lpl_pick(lpl, bestrlpl)) {
3141 bestrload = lpl->lpl_loadavg;
3142 bestrlpl = lpl;
3143 }
3144 }
3145 if (++lgrpid > lgrp_alloc_max)
3146 lgrpid = 0; /* wrap the search */
3147 } while (lgrpid != lgrpid_start);
3148
3149 /*
3150 * Return root lgroup if threshold isn't set to maximum value and
3151 * lowest lgroup load average more than a certain threshold
3152 */
3153 if (lgrp_load_thresh != UINT32_MAX &&
3154 bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3155 return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3156
3157 /*
3158 * If all the lgroups over which the thread's process is spread are
3159 * heavily loaded, or otherwise undesirable, we'll consider placing
3160 * the thread on one of the other leaf lgroups in the thread's
3161 * partition.
3162 */
3163 if ((bestlpl == NULL) ||
3164 ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3165 (bestrload < bestload) && /* paranoid about wraparound */
3166 (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3167 bestload))) {
3168 bestlpl = bestrlpl;
3169 }
3170
3171 if (bestlpl == NULL) {
3172 /*
3173 * No lgroup looked particularly good, but we still
3174 * have to pick something. Go with the randomly selected
3175 * legal lgroup we started with above.
3176 */
3177 bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3178 }
3179
3180 cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3181 bestlpl->lpl_homed_time = gethrtime_unscaled();
3182
3183 ASSERT(bestlpl->lpl_ncpu > 0);
3184 return (bestlpl);
3185 }
3186
3187 /*
3188 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
3189 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
3190 */
3191 static int
lpl_pick(lpl_t * lpl1,lpl_t * lpl2)3192 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3193 {
3194 lgrp_load_t l1, l2;
3195 lgrp_load_t tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3196
3197 l1 = lpl1->lpl_loadavg;
3198 l2 = lpl2->lpl_loadavg;
3199
3200 if ((l1 + tolerance < l2) && (l1 < l2)) {
3201 /* lpl1 is significantly less loaded than lpl2 */
3202 return (1);
3203 }
3204
3205 if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3206 l1 + tolerance >= l2 && l1 < l2 &&
3207 lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3208 /*
3209 * lpl1's load is within the tolerance of lpl2. We're
3210 * willing to consider it be to better however if
3211 * it has been longer since we last homed a thread there
3212 */
3213 return (1);
3214 }
3215
3216 return (0);
3217 }
3218
3219 /*
3220 * lgrp_trthr_moves counts the number of times main thread (t_tid = 1) of a
3221 * process that uses text replication changed home lgrp. This info is used by
 * segvn asynchronous thread to detect if it needs to recheck what lgrps
3223 * should be used for text replication.
3224 */
3225 static uint64_t lgrp_trthr_moves = 0;
3226
3227 uint64_t
lgrp_get_trthr_migrations(void)3228 lgrp_get_trthr_migrations(void)
3229 {
3230 return (lgrp_trthr_moves);
3231 }
3232
void
lgrp_update_trthr_migrations(uint64_t incr)
{
	/*
	 * Atomic increment: multiple threads may report migrations
	 * concurrently without any other synchronization.
	 */
	atomic_add_64(&lgrp_trthr_moves, incr);
}
3238
3239 /*
3240 * An LWP is expected to be assigned to an lgroup for at least this long
3241 * for its anticipatory load to be justified. NOTE that this value should
3242 * not be set extremely huge (say, larger than 100 years), to avoid problems
3243 * with overflow in the calculation that uses it.
3244 */
3245 #define LGRP_MIN_NSEC (NANOSEC / 10) /* 1/10 of a second */
3246 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3247
3248 /*
3249 * Routine to change a thread's lgroup affiliation. This routine updates
3250 * the thread's kthread_t struct and its process' proc_t struct to note the
3251 * thread's new lgroup affiliation, and its lgroup affinities.
3252 *
3253 * Note that this is the only routine that modifies a thread's t_lpl field,
3254 * and that adds in or removes anticipatory load.
3255 *
3256 * If the thread is exiting, newlpl is NULL.
3257 *
3258 * Locking:
3259 * The following lock must be held on entry:
3260 * cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3261 * doesn't get removed from t's partition
3262 *
3263 * This routine is not allowed to grab any locks, since it may be called
3264 * with cpus paused (such as from cpu_offline).
3265 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
	proc_t *p;
	lpl_t *lpl, *oldlpl;
	lgrp_id_t oldid;
	kthread_t *tp;
	uint_t ncpu;
	lgrp_load_t old, new;

	ASSERT(t);
	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
	    THREAD_LOCK_HELD(t));

	/*
	 * If not changing lpls, just return
	 */
	if ((oldlpl = t->t_lpl) == newlpl)
		return;

	/*
	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
	 * associated with process 0 rather than with its original process).
	 * In that case only the t_lpl pointer is updated; no load or
	 * p_lgrpset accounting is done.
	 */
	if (t->t_proc_flag & TP_LWPEXIT) {
		if (newlpl != NULL) {
			t->t_lpl = newlpl;
		}
		return;
	}

	p = ttoproc(t);

	/*
	 * If the thread had a previous lgroup, update its process' p_lgrpset
	 * to account for it being moved from its old lgroup.
	 */
	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
	    (p->p_tlist != NULL)) {
		oldid = oldlpl->lpl_lgrpid;

		/* Count a migration only when moving, not when exiting */
		if (newlpl != NULL)
			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

		if ((do_lgrpset_delete) &&
		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
			/* Walk the circular thread list starting after t */
			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
				/*
				 * Check if a thread other than the thread
				 * that's moving is assigned to the same
				 * lgroup as the thread that's moving.  Note
				 * that we have to compare lgroup IDs, rather
				 * than simply comparing t_lpl's, since the
				 * threads may belong to different partitions
				 * but be assigned to the same lgroup.
				 */
				ASSERT(tp->t_lpl != NULL);

				if ((tp != t) &&
				    (tp->t_lpl->lpl_lgrpid == oldid)) {
					/*
					 * Another thread is assigned to the
					 * same lgroup as the thread that's
					 * moving, p_lgrpset doesn't change.
					 */
					break;
				} else if (tp == p->p_tlist) {
					/*
					 * No other thread is assigned to the
					 * same lgroup as the exiting thread,
					 * clear the lgroup's bit in p_lgrpset.
					 */
					klgrpset_del(p->p_lgrpset, oldid);
					break;
				}
			}
		}

		/*
		 * If this thread was assigned to its old lgroup for such a
		 * short amount of time that the anticipatory load that was
		 * added on its behalf has aged very little, remove that
		 * anticipatory load.
		 */
		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
			lpl = oldlpl;
			/*
			 * Back the anticipatory load out of the old leaf
			 * lpl and each of its ancestors, lock-free via CAS.
			 */
			for (;;) {
				do {
					old = new = lpl->lpl_loadavg;
					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
					if (new > old) {
						/*
						 * this can happen if the load
						 * average was aged since we
						 * added in the anticipatory
						 * load
						 */
						new = 0;
					}
				} while (atomic_cas_32(
				    (lgrp_load_t *)&lpl->lpl_loadavg, old,
				    new) != old);

				lpl = lpl->lpl_parent;
				if (lpl == NULL)
					break;

				ncpu = lpl->lpl_ncpu;
				ASSERT(ncpu > 0);
			}
		}
	}
	/*
	 * If the thread has a new lgroup (i.e. it's not exiting), update its
	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
	 * to its new lgroup to account for its move to its new lgroup.
	 */
	if (newlpl != NULL) {
		/*
		 * This thread is moving to a new lgroup
		 */
		t->t_lpl = newlpl;
		if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
			p->p_t1_lgrpid = newlpl->lpl_lgrpid;
			/*
			 * Store barrier: presumably orders the p_t1_lgrpid
			 * store ahead of readers (segvn's async thread)
			 * observing the migration count -- TODO confirm.
			 */
			membar_producer();
			if (p->p_tr_lgrpid != LGRP_NONE &&
			    p->p_tr_lgrpid != p->p_t1_lgrpid) {
				lgrp_update_trthr_migrations(1);
			}
		}

		/*
		 * Reflect move in load average of new lgroup
		 * unless it is root lgroup.
		 * NOTE(review): this early return also skips the p_lgrpset
		 * update and t_anttime stamp below for root-homed threads.
		 */
		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
			return;

		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
		}

		/*
		 * It'll take some time for the load on the new lgroup
		 * to reflect this thread's placement on it.  We'd
		 * like not, however, to have all threads between now
		 * and then also piling on to this lgroup.  To avoid
		 * this pileup, we anticipate the load this thread
		 * will generate on its new lgroup.  The goal is to
		 * make the lgroup's load appear as though the thread
		 * had been there all along.  We're very conservative
		 * in calculating this anticipatory load, we assume
		 * the worst case (100% CPU-bound thread).  This
		 * may be modified in the future to be more accurate.
		 */
		lpl = newlpl;
		for (;;) {
			ncpu = lpl->lpl_ncpu;
			ASSERT(ncpu > 0);
			do {
				old = new = lpl->lpl_loadavg;
				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
				/*
				 * Check for overflow
				 * Underflow not possible here
				 */
				if (new < old)
					new = UINT32_MAX;
			} while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
			    old, new) != old);

			lpl = lpl->lpl_parent;
			if (lpl == NULL)
				break;
		}
		/* Remember when the anticipatory load was applied */
		t->t_anttime = gethrtime();
	}
}
3445
3446 /*
3447 * Return lgroup memory allocation policy given advice from madvise(3C)
3448 */
3449 lgrp_mem_policy_t
lgrp_madv_to_policy(uchar_t advice,size_t size,int type)3450 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3451 {
3452 switch (advice) {
3453 case MADV_ACCESS_LWP:
3454 return (LGRP_MEM_POLICY_NEXT);
3455 case MADV_ACCESS_MANY:
3456 return (LGRP_MEM_POLICY_RANDOM);
3457 default:
3458 return (lgrp_mem_policy_default(size, type));
3459 }
3460 }
3461
3462 /*
3463 * Figure out default policy
3464 */
3465 lgrp_mem_policy_t
lgrp_mem_policy_default(size_t size,int type)3466 lgrp_mem_policy_default(size_t size, int type)
3467 {
3468 cpupart_t *cp;
3469 lgrp_mem_policy_t policy;
3470 size_t pset_mem_size;
3471
3472 /*
3473 * Randomly allocate memory across lgroups for shared memory
3474 * beyond a certain threshold
3475 */
3476 if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3477 (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3478 /*
3479 * Get total memory size of current thread's pset
3480 */
3481 kpreempt_disable();
3482 cp = curthread->t_cpupart;
3483 klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3484 kpreempt_enable();
3485
3486 /*
3487 * Choose policy to randomly allocate memory across
3488 * lgroups in pset if it will fit and is not default
3489 * partition. Otherwise, allocate memory randomly
3490 * across machine.
3491 */
3492 if (lgrp_mem_pset_aware && size < pset_mem_size)
3493 policy = LGRP_MEM_POLICY_RANDOM_PSET;
3494 else
3495 policy = LGRP_MEM_POLICY_RANDOM;
3496 } else
3497 /*
3498 * Apply default policy for private memory and
3499 * shared memory under the respective random
3500 * threshold.
3501 */
3502 policy = lgrp_mem_default_policy;
3503
3504 return (policy);
3505 }
3506
3507 /*
3508 * Get memory allocation policy for this segment
3509 */
3510 lgrp_mem_policy_info_t *
lgrp_mem_policy_get(struct seg * seg,caddr_t vaddr)3511 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3512 {
3513 lgrp_mem_policy_info_t *policy_info;
3514 extern struct seg_ops segspt_ops;
3515 extern struct seg_ops segspt_shmops;
3516
3517 /*
3518 * This is for binary compatibility to protect against third party
3519 * segment drivers which haven't recompiled to allow for
3520 * SEGOP_GETPOLICY()
3521 */
3522 if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3523 seg->s_ops != &segspt_shmops)
3524 return (NULL);
3525
3526 policy_info = NULL;
3527 if (seg->s_ops->getpolicy != NULL)
3528 policy_info = SEGOP_GETPOLICY(seg, vaddr);
3529
3530 return (policy_info);
3531 }
3532
3533 /*
3534 * Set policy for allocating private memory given desired policy, policy info,
3535 * size in bytes of memory that policy is being applied.
3536 * Return 0 if policy wasn't set already and 1 if policy was set already
3537 */
3538 int
lgrp_privm_policy_set(lgrp_mem_policy_t policy,lgrp_mem_policy_info_t * policy_info,size_t size)3539 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3540 lgrp_mem_policy_info_t *policy_info, size_t size)
3541 {
3542
3543 ASSERT(policy_info != NULL);
3544
3545 if (policy == LGRP_MEM_POLICY_DEFAULT)
3546 policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3547
3548 /*
3549 * Policy set already?
3550 */
3551 if (policy == policy_info->mem_policy)
3552 return (1);
3553
3554 /*
3555 * Set policy
3556 */
3557 policy_info->mem_policy = policy;
3558 policy_info->mem_lgrpid = LGRP_NONE;
3559
3560 return (0);
3561 }
3562
3563
3564 /*
3565 * Get shared memory allocation policy with given tree and offset
3566 */
3567 lgrp_mem_policy_info_t *
lgrp_shm_policy_get(struct anon_map * amp,ulong_t anon_index,vnode_t * vp,u_offset_t vn_off)3568 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3569 u_offset_t vn_off)
3570 {
3571 u_offset_t off;
3572 lgrp_mem_policy_info_t *policy_info;
3573 lgrp_shm_policy_seg_t *policy_seg;
3574 lgrp_shm_locality_t *shm_locality;
3575 avl_tree_t *tree;
3576 avl_index_t where;
3577
3578 shm_locality = NULL;
3579 tree = NULL;
3580 /*
3581 * Get policy segment tree from anon_map or vnode and use specified
3582 * anon index or vnode offset as offset
3583 *
3584 * Assume that no lock needs to be held on anon_map or vnode, since
3585 * they should be protected by their reference count which must be
3586 * nonzero for an existing segment
3587 */
3588 if (amp) {
3589 ASSERT(amp->refcnt != 0);
3590 shm_locality = amp->locality;
3591 if (shm_locality == NULL)
3592 return (NULL);
3593 tree = shm_locality->loc_tree;
3594 off = ptob(anon_index);
3595 } else if (vp) {
3596 shm_locality = vp->v_locality;
3597 if (shm_locality == NULL)
3598 return (NULL);
3599 ASSERT(shm_locality->loc_count != 0);
3600 tree = shm_locality->loc_tree;
3601 off = vn_off;
3602 }
3603
3604 if (tree == NULL)
3605 return (NULL);
3606
3607 /*
3608 * Lookup policy segment for offset into shared object and return
3609 * policy info
3610 */
3611 rw_enter(&shm_locality->loc_lock, RW_READER);
3612 policy_info = NULL;
3613 policy_seg = avl_find(tree, &off, &where);
3614 if (policy_seg)
3615 policy_info = &policy_seg->shm_policy;
3616 rw_exit(&shm_locality->loc_lock);
3617
3618 return (policy_info);
3619 }
3620
3621 /*
3622 * Default memory allocation policy for kernel segmap pages
3623 */
3624 lgrp_mem_policy_t lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;
3625
3626 /*
3627 * Return lgroup to use for allocating memory
3628 * given the segment and address
3629 *
3630 * There isn't any mutual exclusion that exists between calls
3631 * to this routine and DR, so this routine and whomever calls it
3632 * should be mindful of the possibility that the lgrp returned
3633 * may be deleted. If this happens, dereferences of the lgrp
3634 * pointer will still be safe, but the resources in the lgrp will
3635 * be gone, and LGRP_EXISTS() will no longer be true.
3636 */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
	int	i;
	lgrp_t	*lgrp;
	klgrpset_t	lgrpset;
	int	lgrps_spanned;
	unsigned long	off;
	lgrp_mem_policy_t	policy;
	lgrp_mem_policy_info_t	*policy_info;
	ushort_t	random;
	int	stat = 0;
	extern struct seg *segkmap;

	/*
	 * Just return the root lgroup if the lgrp framework hasn't
	 * finished initializing or if this is a UMA machine.
	 */
	if (nlgrps == 1 || !lgrp_initialized)
		return (lgrp_root);

	/*
	 * Get memory allocation policy for this segment
	 */
	policy = lgrp_mem_default_policy;
	if (seg != NULL) {
		if (seg->s_as == &kas) {
			/* Kernel segment: only segkmap has its own policy */
			if (seg == segkmap)
				policy = lgrp_segmap_default_policy;
			/*
			 * Process- and pset-relative randomization make no
			 * sense for kernel memory; degrade to plain RANDOM.
			 */
			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
				policy = LGRP_MEM_POLICY_RANDOM;
		} else {
			policy_info = lgrp_mem_policy_get(seg, vaddr);
			if (policy_info != NULL) {
				policy = policy_info->mem_policy;
				if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
					/*
					 * Allocate from the lgroup cached on
					 * the policy info, if it still exists;
					 * otherwise fall back to next-touch.
					 */
					lgrp_id_t id = policy_info->mem_lgrpid;
					ASSERT(id != LGRP_NONE);
					ASSERT(id < NLGRPS_MAX);
					lgrp = lgrp_table[id];
					if (!LGRP_EXISTS(lgrp)) {
						policy = LGRP_MEM_POLICY_NEXT;
					} else {
						lgrp_stat_add(id,
						    LGRP_NUM_NEXT_SEG, 1);
						return (lgrp);
					}
				}
			}
		}
	}
	lgrpset = 0;

	/*
	 * Initialize lgroup to home by default
	 */
	lgrp = lgrp_home_lgrp();

	/*
	 * When homing threads on root lgrp, override default memory
	 * allocation policies with root lgroup memory allocation policy
	 */
	if (lgrp == lgrp_root)
		policy = lgrp_mem_policy_root;

	/*
	 * Implement policy
	 */
	switch (policy) {
	case LGRP_MEM_POLICY_NEXT_CPU:

		/*
		 * Return lgroup of current CPU which faulted on memory
		 * If the CPU isn't currently in an lgrp, then opt to
		 * allocate from the root.
		 *
		 * Kernel preemption needs to be disabled here to prevent
		 * the current CPU from going away before lgrp is found.
		 * NOTE(review): the LGRP_CPU_HAS_NO_LGRP(CPU) test itself
		 * runs before kpreempt_disable() -- confirm intended.
		 */
		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
			lgrp = lgrp_root;
		} else {
			kpreempt_disable();
			lgrp = lgrp_cpu_to_lgrp(CPU);
			kpreempt_enable();
		}
		break;

	case LGRP_MEM_POLICY_NEXT:
	case LGRP_MEM_POLICY_DEFAULT:
	default:

		/*
		 * Just return current thread's home lgroup
		 * for default policy (next touch)
		 * If the thread is homed to the root,
		 * then the default policy is random across lgroups.
		 * Fallthrough to the random case.
		 */
		if (lgrp != lgrp_root) {
			if (policy == LGRP_MEM_POLICY_NEXT)
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
			else
				lgrp_stat_add(lgrp->lgrp_id,
				    LGRP_NUM_DEFAULT, 1);
			break;
		}
		/* FALLTHROUGH */
	case LGRP_MEM_POLICY_RANDOM:

		/*
		 * Return a random leaf lgroup with memory
		 */
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);

		/*
		 * There may be no memnodes in the root lgroup during DR copy
		 * rename on a system with only two boards (memnodes)
		 * configured. In this case just return the root lgrp.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		/* Walk the set to the off'th member lgroup */
		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_RANDOM_PROC:

		/*
		 * Grab copy of bitmask of lgroups spanned by
		 * this process
		 */
		klgrpset_copy(lgrpset, curproc->p_lgrpset);
		stat = LGRP_NUM_RANDOM_PROC;

		/* FALLTHROUGH */
	case LGRP_MEM_POLICY_RANDOM_PSET:

		if (!stat)
			stat = LGRP_NUM_RANDOM_PSET;

		if (klgrpset_isempty(lgrpset)) {
			/*
			 * Grab copy of bitmask of lgroups spanned by
			 * this processor set
			 */
			kpreempt_disable();
			klgrpset_copy(lgrpset,
			    curthread->t_cpupart->cp_lgrpset);
			kpreempt_enable();
		}

		/*
		 * Count how many lgroups are spanned
		 */
		klgrpset_nlgrps(lgrpset, lgrps_spanned);
		ASSERT(lgrps_spanned <= nlgrps);

		/*
		 * Probably lgrps_spanned should be always non-zero, but to be
		 * on the safe side we return lgrp_root if it is empty.
		 */
		if (lgrps_spanned == 0) {
			lgrp = lgrp_root;
			break;
		}

		/*
		 * Pick a random offset within lgroups spanned
		 * and return lgroup at that offset
		 */
		random = (ushort_t)gethrtime() >> 4;
		off = random % lgrps_spanned;
		ASSERT(off <= lgrp_alloc_max);

		for (i = 0; i <= lgrp_alloc_max; i++) {
			if (!klgrpset_ismember(lgrpset, i))
				continue;
			if (off)
				off--;
			else {
				lgrp = lgrp_table[i];
				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
				    1);
				break;
			}
		}
		break;

	case LGRP_MEM_POLICY_ROUNDROBIN:

		/*
		 * Use offset within segment to determine
		 * offset from home lgroup to choose for
		 * next lgroup to allocate memory from
		 */
		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
		    (lgrp_alloc_max + 1);

		kpreempt_disable();
		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
		i = lgrp->lgrp_id;
		kpreempt_enable();

		/* Advance off member lgroups past home, wrapping by id */
		while (off > 0) {
			i = (i + 1) % (lgrp_alloc_max + 1);
			lgrp = lgrp_table[i];
			if (klgrpset_ismember(lgrpset, i))
				off--;
		}
		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

		break;
	}

	ASSERT(lgrp != NULL);
	return (lgrp);
}
3881
3882 /*
3883 * Return the number of pages in an lgroup
3884 *
3885 * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3886 * could cause tests that rely on the numat driver to fail....
3887 */
3888 pgcnt_t
lgrp_mem_size(lgrp_id_t lgrpid,lgrp_mem_query_t query)3889 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3890 {
3891 lgrp_t *lgrp;
3892
3893 lgrp = lgrp_table[lgrpid];
3894 if (!LGRP_EXISTS(lgrp) ||
3895 klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3896 !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3897 return (0);
3898
3899 return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3900 }
3901
3902 /*
3903 * Initialize lgroup shared memory allocation policy support
3904 */
3905 void
lgrp_shm_policy_init(struct anon_map * amp,vnode_t * vp)3906 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3907 {
3908 lgrp_shm_locality_t *shm_locality;
3909
3910 /*
3911 * Initialize locality field in anon_map
3912 * Don't need any locks because this is called when anon_map is
3913 * allocated, but not used anywhere yet.
3914 */
3915 if (amp) {
3916 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
3917 if (amp->locality == NULL) {
3918 /*
3919 * Allocate and initialize shared memory locality info
3920 * and set anon_map locality pointer to it
3921 * Drop lock across kmem_alloc(KM_SLEEP)
3922 */
3923 ANON_LOCK_EXIT(&->a_rwlock);
3924 shm_locality = kmem_alloc(sizeof (*shm_locality),
3925 KM_SLEEP);
3926 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3927 NULL);
3928 shm_locality->loc_count = 1; /* not used for amp */
3929 shm_locality->loc_tree = NULL;
3930
3931 /*
3932 * Reacquire lock and check to see whether anyone beat
3933 * us to initializing the locality info
3934 */
3935 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
3936 if (amp->locality != NULL) {
3937 rw_destroy(&shm_locality->loc_lock);
3938 kmem_free(shm_locality,
3939 sizeof (*shm_locality));
3940 } else
3941 amp->locality = shm_locality;
3942 }
3943 ANON_LOCK_EXIT(&->a_rwlock);
3944 return;
3945 }
3946
3947 /*
3948 * Allocate shared vnode policy info if vnode is not locality aware yet
3949 */
3950 mutex_enter(&vp->v_lock);
3951 if ((vp->v_flag & V_LOCALITY) == 0) {
3952 /*
3953 * Allocate and initialize shared memory locality info
3954 */
3955 mutex_exit(&vp->v_lock);
3956 shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3957 rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3958 shm_locality->loc_count = 1;
3959 shm_locality->loc_tree = NULL;
3960
3961 /*
3962 * Point vnode locality field at shared vnode policy info
3963 * and set locality aware flag in vnode
3964 */
3965 mutex_enter(&vp->v_lock);
3966 if ((vp->v_flag & V_LOCALITY) == 0) {
3967 vp->v_locality = shm_locality;
3968 vp->v_flag |= V_LOCALITY;
3969 } else {
3970 /*
3971 * Lost race so free locality info and increment count.
3972 */
3973 rw_destroy(&shm_locality->loc_lock);
3974 kmem_free(shm_locality, sizeof (*shm_locality));
3975 shm_locality = vp->v_locality;
3976 shm_locality->loc_count++;
3977 }
3978 mutex_exit(&vp->v_lock);
3979
3980 return;
3981 }
3982
3983 /*
3984 * Increment reference count of number of segments mapping this vnode
3985 * shared
3986 */
3987 shm_locality = vp->v_locality;
3988 shm_locality->loc_count++;
3989 mutex_exit(&vp->v_lock);
3990 }
3991
3992 /*
3993 * Destroy the given shared memory policy segment tree
3994 */
3995 void
lgrp_shm_policy_tree_destroy(avl_tree_t * tree)3996 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3997 {
3998 lgrp_shm_policy_seg_t *cur;
3999 lgrp_shm_policy_seg_t *next;
4000
4001 if (tree == NULL)
4002 return;
4003
4004 cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
4005 while (cur != NULL) {
4006 next = AVL_NEXT(tree, cur);
4007 avl_remove(tree, cur);
4008 kmem_free(cur, sizeof (*cur));
4009 cur = next;
4010 }
4011 kmem_free(tree, sizeof (avl_tree_t));
4012 }
4013
4014 /*
4015 * Uninitialize lgroup shared memory allocation policy support
4016 */
4017 void
lgrp_shm_policy_fini(struct anon_map * amp,vnode_t * vp)4018 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
4019 {
4020 lgrp_shm_locality_t *shm_locality;
4021
4022 /*
4023 * For anon_map, deallocate shared memory policy tree and
4024 * zero locality field
4025 * Don't need any locks because anon_map is being freed
4026 */
4027 if (amp) {
4028 if (amp->locality == NULL)
4029 return;
4030 shm_locality = amp->locality;
4031 shm_locality->loc_count = 0; /* not really used for amp */
4032 rw_destroy(&shm_locality->loc_lock);
4033 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4034 kmem_free(shm_locality, sizeof (*shm_locality));
4035 amp->locality = 0;
4036 return;
4037 }
4038
4039 /*
4040 * For vnode, decrement reference count of segments mapping this vnode
4041 * shared and delete locality info if reference count drops to 0
4042 */
4043 mutex_enter(&vp->v_lock);
4044 shm_locality = vp->v_locality;
4045 shm_locality->loc_count--;
4046
4047 if (shm_locality->loc_count == 0) {
4048 rw_destroy(&shm_locality->loc_lock);
4049 lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4050 kmem_free(shm_locality, sizeof (*shm_locality));
4051 vp->v_locality = 0;
4052 vp->v_flag &= ~V_LOCALITY;
4053 }
4054 mutex_exit(&vp->v_lock);
4055 }
4056
4057 /*
4058 * Compare two shared memory policy segments
4059 * Used by AVL tree code for searching
4060 */
4061 int
lgrp_shm_policy_compar(const void * x,const void * y)4062 lgrp_shm_policy_compar(const void *x, const void *y)
4063 {
4064 lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4065 lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4066
4067 if (a->shm_off < b->shm_off)
4068 return (-1);
4069 if (a->shm_off >= b->shm_off + b->shm_size)
4070 return (1);
4071 return (0);
4072 }
4073
4074 /*
4075 * Concatenate seg1 with seg2 and remove seg2
4076 */
4077 static int
lgrp_shm_policy_concat(avl_tree_t * tree,lgrp_shm_policy_seg_t * seg1,lgrp_shm_policy_seg_t * seg2)4078 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4079 lgrp_shm_policy_seg_t *seg2)
4080 {
4081 if (!seg1 || !seg2 ||
4082 seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4083 seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4084 return (-1);
4085
4086 seg1->shm_size += seg2->shm_size;
4087 avl_remove(tree, seg2);
4088 kmem_free(seg2, sizeof (*seg2));
4089 return (0);
4090 }
4091
4092 /*
4093 * Split segment at given offset and return rightmost (uppermost) segment
4094 * Assumes that there are no overlapping segments
4095 */
4096 static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t * tree,lgrp_shm_policy_seg_t * seg,u_offset_t off)4097 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4098 u_offset_t off)
4099 {
4100 lgrp_shm_policy_seg_t *newseg;
4101 avl_index_t where;
4102
4103 ASSERT(seg != NULL && (off >= seg->shm_off &&
4104 off <= seg->shm_off + seg->shm_size));
4105
4106 if (!seg || off < seg->shm_off ||
4107 off > seg->shm_off + seg->shm_size) {
4108 return (NULL);
4109 }
4110
4111 if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4112 return (seg);
4113
4114 /*
4115 * Adjust size of left segment and allocate new (right) segment
4116 */
4117 newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4118 newseg->shm_policy = seg->shm_policy;
4119 newseg->shm_off = off;
4120 newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4121 seg->shm_size = off - seg->shm_off;
4122
4123 /*
4124 * Find where to insert new segment in AVL tree and insert it
4125 */
4126 (void) avl_find(tree, &off, &where);
4127 avl_insert(tree, newseg, where);
4128
4129 return (newseg);
4130 }
4131
/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if can't set policy.
 *
 * The policy for a shared object is tracked as a set of non-overlapping
 * segments (lgrp_shm_policy_seg_t), ordered by offset in a per-object AVL
 * tree.  This routine walks the requested [off, off + len) range, creating,
 * splitting, and re-concatenating segments so that afterwards the entire
 * range carries the requested policy.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
	u_offset_t eoff;	/* end of requested range (off + len) */
	lgrp_shm_policy_seg_t *next;
	lgrp_shm_policy_seg_t *newseg;
	u_offset_t off;		/* current offset into shared object */
	u_offset_t oldeoff;	/* end of existing segment being examined */
	lgrp_shm_policy_seg_t *prev;
	int retval;
	lgrp_shm_policy_seg_t *seg;
	lgrp_shm_locality_t *shm_locality;
	avl_tree_t *tree;
	avl_index_t where;

	ASSERT(amp || vp);
	ASSERT((len & PAGEOFFSET) == 0);	/* length must be page aligned */

	if (len == 0)
		return (-1);

	retval = 0;	/* assume no part of the range had a policy yet */

	/*
	 * Get locality info and starting offset into shared object
	 * Try anon map first and then vnode
	 * Assume that no locks need to be held on anon_map or vnode, since
	 * it should be protected by its reference count which must be nonzero
	 * for an existing segment.
	 */
	if (amp) {
		/*
		 * Get policy info from anon_map
		 *
		 */
		ASSERT(amp->refcnt != 0);
		if (amp->locality == NULL)
			lgrp_shm_policy_init(amp, NULL);
		shm_locality = amp->locality;
		off = ptob(anon_index);		/* anon index -> byte offset */
	} else if (vp) {
		/*
		 * Get policy info from vnode
		 */
		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
			lgrp_shm_policy_init(NULL, vp);
		shm_locality = vp->v_locality;
		ASSERT(shm_locality->loc_count != 0);
		off = vn_off;
	} else
		return (-1);

	ASSERT((off & PAGEOFFSET) == 0);

	/*
	 * Figure out default policy
	 */
	if (policy == LGRP_MEM_POLICY_DEFAULT)
		policy = lgrp_mem_policy_default(len, MAP_SHARED);

	/*
	 * Create AVL tree if there isn't one yet
	 * and set locality field to point at it
	 */
	rw_enter(&shm_locality->loc_lock, RW_WRITER);
	tree = shm_locality->loc_tree;
	if (!tree) {
		/*
		 * Drop the lock across the allocation since KM_SLEEP may
		 * block, then recheck under the lock whether another thread
		 * created the tree in the meantime.
		 */
		rw_exit(&shm_locality->loc_lock);

		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

		rw_enter(&shm_locality->loc_lock, RW_WRITER);
		if (shm_locality->loc_tree == NULL) {
			avl_create(tree, lgrp_shm_policy_compar,
			    sizeof (lgrp_shm_policy_seg_t),
			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
			shm_locality->loc_tree = tree;
		} else {
			/*
			 * Another thread managed to set up the tree
			 * before we could. Free the tree we allocated
			 * and use the one that's already there.
			 */
			kmem_free(tree, sizeof (*tree));
			tree = shm_locality->loc_tree;
		}
	}

	/*
	 * Set policy
	 *
	 * Need to maintain hold on writer's lock to keep tree from
	 * changing out from under us
	 */
	while (len != 0) {
		/*
		 * Find policy segment for specified offset into shared object
		 */
		seg = avl_find(tree, &off, &where);

		/*
		 * Didn't find any existing segment that contains specified
		 * offset, so allocate new segment, insert it, and concatenate
		 * with adjacent segments if possible
		 */
		if (seg == NULL) {
			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
			    KM_SLEEP);
			newseg->shm_policy.mem_policy = policy;
			newseg->shm_policy.mem_lgrpid = LGRP_NONE;
			newseg->shm_off = off;
			avl_insert(tree, newseg, where);

			/*
			 * Check to see whether new segment overlaps with next
			 * one, set length of new segment accordingly, and
			 * calculate remaining length and next offset
			 */
			seg = AVL_NEXT(tree, newseg);
			if (seg == NULL || off + len <= seg->shm_off) {
				newseg->shm_size = len;
				len = 0;	/* whole range now covered */
			} else {
				/* clip at the next segment's start */
				newseg->shm_size = seg->shm_off - off;
				off = seg->shm_off;
				len -= newseg->shm_size;
			}

			/*
			 * Try to concatenate new segment with next and
			 * previous ones, since they might have the same policy
			 * now. Grab previous and next segments first because
			 * they will change on concatenation.
			 */
			prev = AVL_PREV(tree, newseg);
			next = AVL_NEXT(tree, newseg);
			(void) lgrp_shm_policy_concat(tree, newseg, next);
			(void) lgrp_shm_policy_concat(tree, prev, newseg);

			continue;
		}

		eoff = off + len;
		oldeoff = seg->shm_off + seg->shm_size;

		/*
		 * Policy set already?
		 */
		if (policy == seg->shm_policy.mem_policy) {
			/*
			 * Nothing left to do if offset and length
			 * fall within this segment
			 */
			if (eoff <= oldeoff) {
				retval = 1;	/* policy was already set */
				break;
			} else {
				/* advance past this segment and keep going */
				len = eoff - oldeoff;
				off = oldeoff;
				continue;
			}
		}

		/*
		 * Specified offset and length match existing segment exactly
		 */
		if (off == seg->shm_off && len == seg->shm_size) {
			/*
			 * Set policy and update current length
			 */
			seg->shm_policy.mem_policy = policy;
			seg->shm_policy.mem_lgrpid = LGRP_NONE;
			len = 0;

			/*
			 * Try concatenating new segment with previous and next
			 * segments, since they might have the same policy now.
			 * Grab previous and next segments first because they
			 * will change on concatenation.
			 */
			prev = AVL_PREV(tree, seg);
			next = AVL_NEXT(tree, seg);
			(void) lgrp_shm_policy_concat(tree, seg, next);
			(void) lgrp_shm_policy_concat(tree, prev, seg);
		} else {
			/*
			 * Specified offset and length only apply to part of
			 * existing segment
			 */

			/*
			 * New segment starts in middle of old one, so split
			 * new one off near beginning of old one
			 */
			newseg = NULL;
			if (off > seg->shm_off) {
				newseg = lgrp_shm_policy_split(tree, seg, off);

				/*
				 * New segment ends where old one did, so try
				 * to concatenate with next segment
				 */
				if (eoff == oldeoff) {
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
					(void) lgrp_shm_policy_concat(tree,
					    newseg, AVL_NEXT(tree, newseg));
					break;
				}
			}

			/*
			 * New segment ends before old one, so split off end of
			 * old one
			 */
			if (eoff < oldeoff) {
				if (newseg) {
					(void) lgrp_shm_policy_split(tree,
					    newseg, eoff);
					newseg->shm_policy.mem_policy = policy;
					newseg->shm_policy.mem_lgrpid =
					    LGRP_NONE;
				} else {
					(void) lgrp_shm_policy_split(tree, seg,
					    eoff);
					seg->shm_policy.mem_policy = policy;
					seg->shm_policy.mem_lgrpid = LGRP_NONE;
				}

				/* merge with predecessor if range began here */
				if (off == seg->shm_off)
					(void) lgrp_shm_policy_concat(tree,
					    AVL_PREV(tree, seg), seg);
				break;
			}

			/*
			 * Calculate remaining length and next offset
			 */
			len = eoff - oldeoff;
			off = oldeoff;
		}
	}

	rw_exit(&shm_locality->loc_lock);
	return (retval);
}
4387
4388 /*
4389 * Return the best memnode from which to allocate memory given
4390 * an lgroup.
4391 *
4392 * "c" is for cookie, which is good enough for me.
4393 * It references a cookie struct that should be zero'ed to initialize.
4394 * The cookie should live on the caller's stack.
4395 *
4396 * The routine returns -1 when:
4397 * - traverse is 0, and all the memnodes in "lgrp" have been returned.
4398 * - traverse is 1, and all the memnodes in the system have been
4399 * returned.
4400 */
4401 int
lgrp_memnode_choose(lgrp_mnode_cookie_t * c)4402 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4403 {
4404 lgrp_t *lp = c->lmc_lgrp;
4405 mnodeset_t nodes = c->lmc_nodes;
4406 int cnt = c->lmc_cnt;
4407 int offset, mnode;
4408
4409 extern int max_mem_nodes;
4410
4411 /*
4412 * If the set is empty, and the caller is willing, traverse
4413 * up the hierarchy until we find a non-empty set.
4414 */
4415 while (nodes == (mnodeset_t)0 || cnt <= 0) {
4416 if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4417 ((lp = lp->lgrp_parent) == NULL))
4418 return (-1);
4419
4420 nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4421 cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4422 }
4423
4424 /*
4425 * Select a memnode by picking one at a "random" offset.
4426 * Because of DR, memnodes can come and go at any time.
4427 * This code must be able to cope with the possibility
4428 * that the nodes count "cnt" is inconsistent with respect
4429 * to the number of elements actually in "nodes", and
4430 * therefore that the offset chosen could be greater than
4431 * the number of elements in the set (some memnodes may
4432 * have dissapeared just before cnt was read).
4433 * If this happens, the search simply wraps back to the
4434 * beginning of the set.
4435 */
4436 ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4437 offset = c->lmc_rand % cnt;
4438 do {
4439 for (mnode = 0; mnode < max_mem_nodes; mnode++)
4440 if (nodes & ((mnodeset_t)1 << mnode))
4441 if (!offset--)
4442 break;
4443 } while (mnode >= max_mem_nodes);
4444
4445 /* Found a node. Store state before returning. */
4446 c->lmc_lgrp = lp;
4447 c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4448 c->lmc_cnt = cnt - 1;
4449 c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4450 c->lmc_ntried++;
4451
4452 return (mnode);
4453 }
4454