1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * These functions implement the process of commitment for a pool 30 * configuration. This process can be described as taking instructions 31 * from a static configuration file and using the information about 32 * the target system contained in the dynamic configuration to make 33 * decisions about how best to allocate resources to meet the 34 * constraints specified in the static configuration file. 35 * 36 * Mechanically, this process relies upon ordering the individual 37 * components of the file and stepping through the lists of components 38 * and taking actions depending on their type and which file they are 39 * part of. 40 * 41 * Configuration components can be broken down into different types 42 * which are then treated according to the following table: 43 * 44 * Element Type Action 45 * system || pool || 46 * res_comp || res_agg If the element is a required element, then create or 47 * update it (don't destroy required elements in the 48 * static configuration) otherwise manipulate the 49 * dynamic configuration to create, destroy or update 50 * the element on the system. 51 * comp Create, destroy or update the static configuration 52 * component. 53 * 54 * The treatment of the different elements reflects the fact that all 55 * elements other than comp are configurable and thus libpool can 56 * create, destroy and modify these elements at will. comp elements 57 * reflect the disposition of the system, these elements can be moved 58 * around but they can't be created or destroyed in the dynamic 59 * configuration in the commit process. comp elements can be created 60 * and destroyed in the static configuration file as a result of a 61 * commit operation, since it's possible for a comp to not appear in 62 * the dynamic configuration. For instance, if the static 63 * configuration file was created on a different machine or after a DR 64 * operation which has removed or added components. 65 * 66 */ 67 #include <assert.h> 68 #include <stdio.h> 69 #include <stdlib.h> 70 #include <sys/types.h> 71 #include <errno.h> 72 #include <string.h> 73 #include <limits.h> 74 #include <unistd.h> 75 76 #include <pool.h> 77 #include "pool_internal.h" 78 #include "pool_impl.h" 79 80 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 81 #define MAX(x, y) ((x) > (y) ? (x) : (y)) 82 #define POA_IMPORTANCE_NUM 0 83 #define POA_SURPLUS_TO_DEFAULT_NUM 1 84 85 /* 86 * This resource specific structure is used to determine allocation of resources 87 * during resource set allocation. Each set will receive its min, plus 88 * some number of dealt resources based on the global allocation policy. 89 */ 90 typedef struct res_info { 91 pool_resource_t *ri_res; /* Resource set */ 92 uint64_t ri_min; /* Resource set's low watermark */ 93 uint64_t ri_max; /* Resource set's high watermark */ 94 uint64_t ri_oldsize; /* Size of resource set at the start */ 95 uint64_t ri_newsize; /* New resource set size allocated */ 96 uint64_t ri_pinned; /* Count of pinned resources in set */ 97 uint64_t ri_dealt; /* Count of resources dealt to set */ 98 int64_t ri_transfer; /* oldsize - newsize */ 99 /* The signed quantity of resources */ 100 /* to tranfer into or out of this */ 101 /* resource set */ 102 /* + transfer: tranfer resources out */ 103 /* - transfer: tranfer resources in */ 104 } res_info_t; 105 106 /* 107 * diff_and_fix operations 108 */ 109 static int commit_create(pool_conf_t *, pool_elem_t **); 110 static int commit_delete(pool_elem_t *); 111 static int commit_update(pool_elem_t *, pool_elem_t *, int); 112 113 /* 114 * configuration commit processing 115 */ 116 static int diff_and_fix(pool_conf_t *, pool_conf_t *); 117 static int process_elem_lt(pool_elem_t *, pool_conf_t *); 118 static int process_elem_gt(pool_elem_t *, pool_conf_t *, 119 pool_conf_t *); 120 static int process_lists(int, pool_conf_t *, 121 pool_conf_t *, int); 122 static pool_elem_t **get_elem_list(const pool_conf_t *, int, uint_t *); 123 static int share_resources(pool_conf_t *); 124 static int resource_allocate(const char *, pool_resource_t **, 125 uint_t); 126 static int resource_allocate_default(pool_resource_t **, uint_t); 127 static int pset_allocate_imp(pool_resource_t **, uint_t); 128 static int resource_compare_by_descending_importance(const void *, 129 const void *); 130 static int compute_size_to_transfer(const void *, const void *); 131 static int set_importance_cb(pool_conf_t *, pool_t *, void *); 132 static int unset_importance_cb(pool_conf_t *, pool_t *, void *); 133 static int add_importance_props(pool_conf_t *); 134 static int remove_importance_props(pool_conf_t *); 135 static int clone_element(pool_conf_t *, pool_elem_t *, 136 const char *, pool_value_t *, void *); 137 static int clean_element(pool_conf_t *, pool_elem_t *, 138 const char *, pool_value_t *, void *); 139 140 /* 141 * commit_create() is used to create a configuration element upon the 142 * system. Since only pools and resource actually need to perform any 143 * action, other elements are ignored as a no-op. 144 */ 145 static int 146 commit_create(pool_conf_t *conf, pool_elem_t **e1) 147 { 148 pool_resource_t *res; 149 pool_t *pool; 150 const char *res_type; 151 pool_elem_t *src = *e1; 152 uint64_t smin, smax, dmax; 153 pool_value_t val = POOL_VALUE_INITIALIZER; 154 char *name; 155 156 switch (pool_elem_class(src)) { 157 case PEC_SYSTEM: /* NO-OP */ 158 break; 159 case PEC_POOL: 160 name = elem_get_name(src); 161 if ((pool = pool_create(conf, name)) == NULL) { 162 free(name); 163 return (PO_FAIL); 164 } 165 free(name); 166 /* 167 * Now copy the properties from the original pool to the 168 * new one 169 */ 170 if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(pool), 171 clone_element) != PO_SUCCESS) 172 return (PO_FAIL); 173 /* 174 * Add a pointer to the src element which can be 175 * updated with a sys_id when the sys_id is allocated 176 * to the created element. 177 */ 178 pool_set_pair(TO_ELEM(pool), src); 179 *e1 = TO_ELEM(pool); 180 break; 181 case PEC_RES_COMP: 182 case PEC_RES_AGG: 183 name = elem_get_name(src); 184 res_type = pool_elem_class_string(src); 185 if ((res = pool_resource_create(conf, res_type, name)) == 186 NULL) { 187 free(name); 188 return (PO_FAIL); 189 } 190 free(name); 191 /* 192 * Need to do some ordering of property updates. 193 * Compare the values of source min/max and 194 * destination min/max. If smin < dmax then update the 195 * smin first, else update the max first. 196 */ 197 if (resource_get_min(pool_elem_res(src), &smin) != PO_SUCCESS || 198 resource_get_max(pool_elem_res(src), &smax) != PO_SUCCESS || 199 resource_get_max(res, &dmax) != PO_SUCCESS) 200 return (PO_FAIL); 201 if (smin < dmax) { 202 pool_value_set_uint64(&val, smin); 203 if (pool_put_ns_property(TO_ELEM(res), c_min_prop, 204 &val) != PO_SUCCESS) 205 return (PO_FAIL); 206 } else { 207 pool_value_set_uint64(&val, smax); 208 if (pool_put_ns_property(TO_ELEM(res), c_max_prop, 209 &val) != PO_SUCCESS) 210 return (PO_FAIL); 211 } 212 /* 213 * Now copy the properties from the original resource 214 * to the new one 215 */ 216 if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(res), 217 clone_element) != PO_SUCCESS) 218 return (PO_FAIL); 219 /* 220 * Add a pointer to the src element which can be 221 * updated with a sys_id when the sys_id is allocated 222 * to the created element. 223 */ 224 pool_set_pair(TO_ELEM(res), src); 225 *e1 = TO_ELEM(res); 226 break; 227 case PEC_COMP: /* NO-OP */ 228 break; 229 default: 230 return (PO_FAIL); 231 } 232 return (PO_SUCCESS); 233 } 234 235 236 /* 237 * commit_delete() is used to delete a configuration element upon the 238 * system. Since only pools and resources actually need to perform 239 * any action, other elements are ignored as a no-op. 240 */ 241 static int 242 commit_delete(pool_elem_t *pe) 243 { 244 pool_resource_t *res; 245 pool_t *pool; 246 int ret = 0; 247 248 if (elem_is_tmp(pe)) 249 return (PO_SUCCESS); 250 251 switch (pool_elem_class(pe)) { 252 case PEC_SYSTEM: /* NO-OP */ 253 break; 254 case PEC_POOL: 255 pool = pool_elem_pool(pe); 256 ret = pool_destroy(TO_CONF(pe), pool); 257 break; 258 case PEC_RES_COMP: 259 case PEC_RES_AGG: 260 res = pool_elem_res(pe); 261 ret = pool_resource_destroy(TO_CONF(pe), res); 262 break; 263 case PEC_COMP: /* NO-OP */ 264 break; 265 default: 266 return (PO_FAIL); 267 } 268 return (ret); 269 } 270 271 /* 272 * commit_update() is used to update a configuration element upon the 273 * system or in a static configuration file. The pass parameter 274 * governs whether properties are being updated or associations. In 275 * pass 0, properties are updated. If the element is of class 276 * PEC_COMP, then make sure that the element in the static 277 * configuration file is correctly located before proceeding with the 278 * update. Then, the element in the dynamic configuration file is 279 * updated. In pass 1, ie. pass != 0, any pool components have their 280 * associations updated in the dynamic configuration. 281 */ 282 static int 283 commit_update(pool_elem_t *e1, pool_elem_t *e2, int pass) 284 { 285 if (pass == 0) { 286 pool_resource_t *res1; 287 pool_resource_t *res2; 288 if (pool_elem_class(e1) == PEC_COMP) { 289 res1 = pool_get_owning_resource(TO_CONF(e1), 290 pool_elem_comp(e1)); 291 res2 = pool_get_owning_resource(TO_CONF(e2), 292 pool_elem_comp(e2)); 293 if (pool_elem_compare_name(TO_ELEM(res1), 294 TO_ELEM(res2)) != 0) { 295 char *name; 296 const pool_resource_t *newres; 297 pool_component_t *comps[2] = { NULL }; 298 299 comps[0] = pool_elem_comp(e2); 300 name = elem_get_name(TO_ELEM(res1)); 301 newres = pool_get_resource(TO_CONF(e2), 302 pool_elem_class_string(TO_ELEM(res1)), 303 name); 304 free(name); 305 assert(newres); 306 #ifdef DEBUG 307 dprintf("transferring: res, comp\n"); 308 pool_elem_dprintf(TO_ELEM(newres)); 309 pool_elem_dprintf(e2); 310 #endif /* DEBUG */ 311 (void) pool_resource_xtransfer(TO_CONF(e2), 312 res2, (pool_resource_t *)newres, comps); 313 } 314 } 315 if (pool_walk_properties(TO_CONF(e2), e2, NULL, 316 clean_element) != PO_SUCCESS) { 317 return (PO_FAIL); 318 } 319 /* 320 * Need to do some ordering of property updates if the 321 * element to be updated is a resource. Compare the 322 * values of source min/max and destination 323 * min/max. If smin < dmax then update the smin first, 324 * else update the max first. 325 */ 326 if (pool_elem_class(e1) == PEC_RES_COMP || 327 pool_elem_class(e1) == PEC_RES_AGG) { 328 uint64_t smin, smax, dmax; 329 pool_value_t val = POOL_VALUE_INITIALIZER; 330 331 if (resource_get_min(pool_elem_res(e1), &smin) != 332 PO_SUCCESS || 333 resource_get_max(pool_elem_res(e1), &smax) != 334 PO_SUCCESS || 335 resource_get_max(pool_elem_res(e2), &dmax) != 336 PO_SUCCESS) 337 return (PO_FAIL); 338 if (smin < dmax) { 339 pool_value_set_uint64(&val, smin); 340 if (pool_put_ns_property(e2, c_min_prop, 341 &val) != PO_SUCCESS) 342 return (PO_FAIL); 343 } else { 344 pool_value_set_uint64(&val, smax); 345 if (pool_put_ns_property(e2, c_max_prop, 346 &val) != PO_SUCCESS) 347 return (PO_FAIL); 348 } 349 } 350 /* 351 * This next couple of steps needs some 352 * explanation. The first walk, copies all the 353 * properties that are writeable from the static 354 * configuration to the dynamic configuration. The 355 * second walk copies all properties (writeable or 356 * not) from the dynamic configuration element back to 357 * the static configuration element. This ensures that 358 * updates from the static configuration element are 359 * correctly applied to the dynamic configuration and 360 * then the static configuration element is updated 361 * with the latest values of the read-only xproperties 362 * from the dynamic configuration element. The 363 * enforcing of permisssions is performed in 364 * clone_element by its choice of property 365 * manipulation function. 366 */ 367 if (pool_walk_properties(TO_CONF(e1), e1, e2, clone_element) != 368 PO_SUCCESS) { 369 return (PO_FAIL); 370 } 371 if (pool_walk_properties(TO_CONF(e2), e2, e1, clone_element) != 372 PO_SUCCESS) { 373 return (PO_FAIL); 374 } 375 } else { 376 if (pool_elem_class(e1) == PEC_POOL) { 377 pool_resource_t **rs; 378 uint_t nelem; 379 int i; 380 pool_value_t val = POOL_VALUE_INITIALIZER; 381 pool_value_t *pvals[] = { NULL, NULL }; 382 383 pvals[0] = &val; 384 if (pool_value_set_string(&val, "pset") != PO_SUCCESS || 385 pool_value_set_name(&val, c_type) != PO_SUCCESS) 386 return (PO_FAIL); 387 if ((rs = pool_query_pool_resources(TO_CONF(e1), 388 pool_elem_pool(e1), &nelem, pvals)) != NULL) { 389 for (i = 0; i < nelem; i++) { 390 const pool_resource_t *tgt_res; 391 char *res_name = 392 elem_get_name(TO_ELEM(rs[i])); 393 394 if ((tgt_res = pool_get_resource( 395 TO_CONF(e2), pool_elem_class_string( 396 TO_ELEM(rs[i])), res_name)) == 397 NULL) { 398 tgt_res = get_default_resource( 399 rs[i]); 400 } 401 free(res_name); 402 if (pool_associate(TO_CONF(e2), 403 pool_elem_pool(e2), tgt_res) != 404 PO_SUCCESS) { 405 free(rs); 406 return (PO_FAIL); 407 } 408 } 409 free(rs); 410 } 411 } 412 } 413 return (PO_SUCCESS); 414 } 415 416 /* 417 * diff_and_fix() works out the differences between two configurations 418 * and modifies the state of the system to match the operations 419 * required to bring the two configurations into sync. 420 * 421 * Returns PO_SUCCESS/PO_FAIL. 422 */ 423 static int 424 diff_and_fix(pool_conf_t *stc, pool_conf_t *dyn) 425 { 426 /* 427 * The ordering of the operations is significant, we must 428 * process the system element, then the pools elements, then 429 * the resource elements, then the pools elements again and 430 * finally the resource components. 431 * 432 * TODO 433 * PEC_RES_COMP are the only type of resources 434 * currently. When PEC_RES_AGG resources are added they must 435 * also be processed. 436 */ 437 if (process_lists(PEC_SYSTEM, stc, dyn, 0) != PO_SUCCESS) { 438 return (PO_FAIL); 439 } 440 if (process_lists(PEC_POOL, stc, dyn, 0) != PO_SUCCESS) { 441 return (PO_FAIL); 442 } 443 if (process_lists(PEC_RES_COMP, stc, dyn, 0) != PO_SUCCESS) { 444 return (PO_FAIL); 445 } 446 if (process_lists(PEC_COMP, stc, dyn, 0) != PO_SUCCESS) { 447 return (PO_FAIL); 448 } 449 if (process_lists(PEC_POOL, stc, dyn, 1) != PO_SUCCESS) { 450 return (PO_FAIL); 451 } 452 /* 453 * Share the resources. It has to be called for both 454 * configurations to ensure that the configurations still look 455 * the same. 456 */ 457 if (share_resources(dyn) != PO_SUCCESS) { 458 return (PO_FAIL); 459 } 460 if (share_resources(stc) != PO_SUCCESS) { 461 return (PO_FAIL); 462 } 463 return (PO_SUCCESS); 464 } 465 466 static int 467 process_elem_lt(pool_elem_t *pe, pool_conf_t *dyn) 468 { 469 if (pool_elem_class(pe) == PEC_COMP) { 470 if (pool_component_destroy(pool_elem_comp(pe)) == PO_FAIL) { 471 return (PO_FAIL); 472 } 473 } else if (! elem_is_default(pe)) { 474 if (commit_create(dyn, &pe) != PO_SUCCESS) { 475 return (PO_FAIL); 476 } 477 } 478 return (PO_SUCCESS); 479 } 480 481 static int 482 process_elem_gt(pool_elem_t *pe, pool_conf_t *stc, pool_conf_t *dyn) 483 { 484 if (pool_elem_class(pe) == PEC_COMP) { 485 pool_resource_t *owner; 486 const pool_resource_t *parent_res; 487 pool_value_t val = POOL_VALUE_INITIALIZER; 488 const pool_component_t *newcomp; 489 const char *resname; 490 const char *restype; 491 /* 492 * I have to find the right parent in the static 493 * configuration. It may not exist, in which case it's 494 * correct to put it in the default 495 */ 496 owner = pool_get_owning_resource(dyn, 497 pool_elem_comp(pe)); 498 if (pool_get_ns_property(TO_ELEM(owner), "name", &val) == 499 POC_INVAL) 500 return (PO_FAIL); 501 502 if (pool_value_get_string(&val, &resname) == PO_FAIL) 503 return (PO_FAIL); 504 505 if ((resname = strdup(resname)) == NULL) 506 return (PO_FAIL); 507 508 restype = pool_elem_class_string(TO_ELEM(owner)); 509 parent_res = pool_get_resource(stc, restype, resname); 510 free((void *)resname); 511 if (parent_res == NULL) 512 parent_res = resource_by_sysid(stc, PS_NONE, restype); 513 /* 514 * Now need to make a copy of the component in the 515 * dynamic configuration in the static configuration. 516 */ 517 if ((newcomp = pool_component_create(stc, parent_res, 518 elem_get_sysid(pe))) == NULL) 519 return (PO_FAIL); 520 521 if (pool_walk_properties(TO_CONF(pe), pe, TO_ELEM(newcomp), 522 clone_element) != PO_SUCCESS) 523 return (PO_FAIL); 524 } else if (elem_is_default(pe)) { 525 pool_resource_t *newres; 526 pool_t *newpool; 527 char *name; 528 529 if ((name = elem_get_name(pe)) == NULL) 530 return (PO_FAIL); 531 switch (pool_elem_class(pe)) { 532 case PEC_POOL: 533 if ((newpool = pool_create(stc, name)) == NULL) { 534 free(name); 535 return (PO_FAIL); 536 } 537 free(name); 538 if (pool_walk_properties(TO_CONF(pe), pe, 539 TO_ELEM(newpool), clone_element) != PO_SUCCESS) 540 return (PO_FAIL); 541 break; 542 case PEC_RES_AGG: 543 case PEC_RES_COMP: 544 if ((newres = pool_resource_create(stc, 545 pool_elem_class_string(pe), name)) == 546 NULL) { 547 free(name); 548 return (PO_FAIL); 549 } 550 free(name); 551 if (pool_walk_properties(TO_CONF(pe), pe, 552 TO_ELEM(newres), clone_element) != PO_SUCCESS) 553 return (PO_FAIL); 554 break; 555 default: 556 free(name); 557 break; 558 } 559 } else { 560 if (commit_delete(pe) != PO_SUCCESS) 561 return (PO_FAIL); 562 } 563 return (PO_SUCCESS); 564 } 565 566 /* 567 * This function compares the elements of the supplied type in the 568 * static and dynamic configurations supplied. The lists of elements 569 * are compared and used to create, delete and updated elements in 570 * both the static and dynamic configurations. The pass parameter is 571 * used to indicate to commit_update() whether property updates or 572 * association updates should be performed. 573 */ 574 static int 575 process_lists(int type, pool_conf_t *stc, pool_conf_t *dyn, int pass) 576 { 577 uint_t stc_nelem = 0, dyn_nelem = 0; 578 pool_elem_t **stc_elems, **dyn_elems; 579 int i, j; 580 int status = PO_SUCCESS; 581 582 if ((stc_elems = get_elem_list(stc, type, &stc_nelem)) == NULL) 583 return (PO_FAIL); 584 585 qsort(stc_elems, stc_nelem, sizeof (pool_elem_t *), 586 qsort_elem_compare); 587 588 if ((dyn_elems = get_elem_list(dyn, type, &dyn_nelem)) == NULL) { 589 free(stc_elems); 590 return (PO_FAIL); 591 } 592 593 qsort(dyn_elems, dyn_nelem, sizeof (pool_elem_t *), 594 qsort_elem_compare); 595 /* 596 * Step through and do the updating, remember that we are 597 * comparing using the compare function for the configuration 598 * and that is fixed. 599 */ 600 i = j = 0; 601 while (status == PO_SUCCESS && i < stc_nelem && j < dyn_nelem) { 602 int compare; 603 /* 604 * We are going to do this by stepping through the static 605 * list first. 606 */ 607 if (elem_is_default(stc_elems[i]) && 608 elem_is_default(dyn_elems[j])) 609 compare = 0; 610 else 611 compare = pool_elem_compare_name(stc_elems[i], 612 dyn_elems[j]); 613 if (compare < 0) { 614 status = process_elem_lt(stc_elems[i], dyn); 615 i++; 616 } else if (compare > 0) { 617 status = process_elem_gt(dyn_elems[j], stc, dyn); 618 j++; 619 } else { /* compare == 0 */ 620 if (commit_update(stc_elems[i], dyn_elems[j], pass) 621 != PO_SUCCESS) { 622 status = PO_FAIL; 623 } 624 i++; 625 j++; 626 } 627 } 628 if (status == PO_FAIL) { 629 free(stc_elems); 630 free(dyn_elems); 631 return (PO_FAIL); 632 } 633 while (status == PO_SUCCESS && i < stc_nelem) { 634 status = process_elem_lt(stc_elems[i], dyn); 635 i++; 636 } 637 if (status == PO_FAIL) { 638 free(stc_elems); 639 free(dyn_elems); 640 return (PO_FAIL); 641 } 642 while (status == PO_SUCCESS && j < dyn_nelem) { 643 status = process_elem_gt(dyn_elems[j], stc, dyn); 644 j++; 645 } 646 free(stc_elems); 647 free(dyn_elems); 648 return (status); 649 } 650 651 /* 652 * get_elem_list() returns a list of pool_elem_t's. The size of the 653 * list is written into nelem. The list contains elements of all types 654 * that pools is interested in: i.e. system, pool, resources and 655 * resource components. It is the caller's responsibility to free the 656 * list when it is finished with. 657 * 658 * The array of pointers returned by the type specific query can be 659 * safely cast to be an array of pool_elem_t pointers. In the case of 660 * PEC_RES_COMP some additional processing is required to qualify the 661 * list of elements. 662 * 663 * Returns a pointer to a list of pool_elem_t's or NULL on failure. 664 */ 665 static pool_elem_t ** 666 get_elem_list(const pool_conf_t *conf, int type, uint_t *nelem) 667 { 668 pool_resource_t **rl; 669 pool_t **pl; 670 pool_component_t **cl; 671 pool_elem_t **elems = NULL; 672 int i; 673 674 switch (type) { 675 case PEC_SYSTEM: 676 if ((elems = malloc(sizeof (pool_elem_t *))) == NULL) 677 return (NULL); 678 *nelem = 1; 679 elems[0] = pool_conf_to_elem(conf); 680 break; 681 case PEC_POOL: 682 if ((pl = pool_query_pools(conf, nelem, NULL)) != NULL) { 683 elems = (pool_elem_t **)pl; 684 } 685 break; 686 case PEC_RES_COMP: 687 if ((rl = pool_query_resources(conf, nelem, NULL)) != NULL) { 688 int j = 0; 689 elems = (pool_elem_t **)rl; 690 for (i = 0; i < *nelem; i++) { 691 if (pool_elem_class(TO_ELEM(rl[i])) == 692 PEC_RES_COMP) 693 elems[j++] = TO_ELEM(rl[i]); 694 } 695 *nelem = j; 696 } 697 break; 698 case PEC_COMP: 699 if ((cl = pool_query_components(conf, nelem, NULL)) != NULL) { 700 elems = (pool_elem_t **)cl; 701 } 702 break; 703 default: 704 abort(); 705 break; 706 } 707 return (elems); 708 } 709 710 /* 711 * share_resources() sets up the allocation of resources by each 712 * provider. Firstly all resources are updated with the importance of 713 * each pool, then each resource provider is invoked in turn with a 714 * list of it's own resources. Finally, the pool importance details 715 * are removed from the resources. 716 * 717 * Returns PO_SUCCESS/PO_FAIL 718 */ 719 static int 720 share_resources(pool_conf_t *conf) 721 { 722 pool_resource_t **resources; 723 uint_t nelem; 724 pool_value_t *props[] = { NULL, NULL }; 725 pool_value_t val = POOL_VALUE_INITIALIZER; 726 727 props[0] = &val; 728 729 /* 730 * Call an allocation function for each type of supported resource. 731 * This function is responsible for "sharing" resources to resource 732 * sets as determined by the system.allocate-method. 733 */ 734 735 if (pool_value_set_string(props[0], "pset") != PO_SUCCESS || 736 pool_value_set_name(props[0], c_type) != PO_SUCCESS) 737 return (PO_FAIL); 738 739 if (add_importance_props(conf) != PO_SUCCESS) { 740 (void) remove_importance_props(conf); 741 return (PO_FAIL); 742 } 743 744 if ((resources = pool_query_resources(conf, &nelem, props)) != NULL) { 745 /* 746 * 'pool.importance' defines the importance of a pool; 747 * resources inherit the importance of the pool that 748 * is associated with them. If more than one pool is 749 * associated with a resource, the importance of the 750 * resource is the maximum importance of all 751 * associated pools. Use '_importance' on resources 752 * to determine who gets extra. 753 */ 754 if (resource_allocate("pset", resources, nelem) != PO_SUCCESS) { 755 free(resources); 756 (void) remove_importance_props(conf); 757 return (PO_FAIL); 758 } 759 } 760 free(resources); 761 (void) remove_importance_props(conf); 762 return (PO_SUCCESS); 763 } 764 765 766 /* 767 * Work out which allocation method to use based on the value of the 768 * system.allocate-method property. 769 */ 770 int 771 resource_allocate(const char *type, pool_resource_t **res, uint_t nelem) 772 { 773 pool_elem_t *pe; 774 const char *method_name; 775 uint64_t method; 776 pool_value_t val = POOL_VALUE_INITIALIZER; 777 int ret; 778 779 pe = pool_conf_to_elem(TO_CONF(TO_ELEM(res[0]))); 780 781 if (pool_get_ns_property(pe, "allocate-method", &val) != POC_STRING) 782 method_name = POA_IMPORTANCE; 783 else { 784 (void) pool_value_get_string(&val, &method_name); 785 } 786 if (strcmp(POA_IMPORTANCE, method_name) != 0) { 787 if (strcmp(POA_SURPLUS_TO_DEFAULT, method_name) != 0) { 788 pool_seterror(POE_INVALID_CONF); 789 return (PO_FAIL); 790 } else { 791 method = POA_SURPLUS_TO_DEFAULT_NUM; 792 } 793 } else { 794 method = POA_IMPORTANCE_NUM; 795 } 796 switch (method) { 797 case POA_IMPORTANCE_NUM: 798 /* 799 * TODO: Add support for new resource types 800 */ 801 switch (pool_resource_elem_class_from_string(type)) { 802 case PREC_PSET: 803 ret = pset_allocate_imp(res, nelem); 804 break; 805 default: 806 ret = PO_FAIL; 807 break; 808 } 809 break; 810 case POA_SURPLUS_TO_DEFAULT_NUM: 811 ret = resource_allocate_default(res, nelem); 812 break; 813 } 814 815 return (ret); 816 } 817 818 /* 819 * Each set will get its minimum, however if there is more than the 820 * total minimum available, then leave this in the default set. 821 */ 822 int 823 resource_allocate_default(pool_resource_t **res, uint_t nelem) 824 { 825 res_info_t *res_info; 826 uint_t j; 827 pool_resource_t *default_res = NULL; 828 829 if (nelem == 1) 830 return (PO_SUCCESS); 831 832 if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) { 833 return (PO_FAIL); 834 } 835 836 /* Load current resource values. */ 837 for (j = 0; j < nelem; j++) { 838 839 if (default_res == NULL && 840 resource_is_default(res[j]) == PO_TRUE) 841 default_res = res[j]; 842 843 if (resource_get_max(res[j], 844 &res_info[j].ri_max) == PO_FAIL || 845 resource_get_min(res[j], 846 &res_info[j].ri_min) == PO_FAIL || 847 resource_get_size(res[j], 848 &res_info[j].ri_oldsize) == PO_FAIL || 849 resource_get_pinned(res[j], 850 &res_info[j].ri_pinned) == PO_FAIL) { 851 free(res_info); 852 return (PO_FAIL); 853 } 854 res_info[j].ri_res = res[j]; 855 } 856 857 /* 858 * Firstly, for all resources that have size greater than min, 859 * transfer all movable size above min to the default resource. 860 */ 861 for (j = 0; j < nelem; j++) { 862 863 uint64_t real_min; 864 865 /* compute the real minimum number of resources */ 866 real_min = MAX(res_info[j].ri_pinned, res_info[j].ri_min); 867 if (res_info[j].ri_res != default_res && 868 res_info[j].ri_oldsize > real_min) { 869 870 uint64_t num; 871 872 num = res_info[j].ri_oldsize - real_min; 873 if (pool_resource_transfer( 874 TO_CONF(TO_ELEM(default_res)), 875 res_info[j].ri_res, default_res, num) != 876 PO_SUCCESS) { 877 free(res_info); 878 return (PO_FAIL); 879 } 880 } 881 } 882 /* 883 * Now, transfer resources below min from the default. 884 */ 885 for (j = 0; j < nelem; j++) { 886 /* 887 * We don't want to interfere with resources which are reserved 888 */ 889 if (res_info[j].ri_res != default_res && 890 res_info[j].ri_oldsize < res_info[j].ri_min) { 891 if (pool_resource_transfer( 892 TO_CONF(TO_ELEM(default_res)), 893 default_res, res_info[j].ri_res, 894 res_info[j].ri_min - res_info[j].ri_oldsize) != 895 PO_SUCCESS) { 896 free(res_info); 897 return (PO_FAIL); 898 } 899 } 900 } 901 free(res_info); 902 return (PO_SUCCESS); 903 } 904 905 /* 906 * Allocate cpus to pset resource sets, favoring sets with higher importance. 907 * 908 * Step 1: Sort resource sets by decreasing importance, and load each sets 909 * current size (oldsize), min, max, and number of pinned cpus. 910 * Compute the total number of cpus by totaling oldsize. 911 * 912 * Step 2: Compute the newsize for each set: 913 * 914 * Give each set its min number of cpus. This min may be greater than 915 * its pset.min due to pinned cpus. If there are more cpus than the total 916 * of all mins, then the surplus cpus are dealt round-robin to all sets 917 * (up to their max) in order of decreasing importance. A set may be 918 * skipped during dealing because it started with more than its min due to 919 * pinned cpus. The dealing stops when there are no more cpus or all 920 * sets are at their max. If all sets are at their max, any remaining cpus 921 * are given to the default set. 922 * 923 * Step 3: Transfer cpus from sets with (oldsize > newsize) to sets with 924 * (oldsize < newsize). 925 */ 926 int 927 pset_allocate_imp(pool_resource_t **res, uint_t nelem) 928 { 929 res_info_t *res_info; 930 res_info_t *default_res_info; 931 const pool_resource_t *default_res = NULL; 932 uint64_t tot_resources = 0; /* total count of resources */ 933 uint64_t tot_min = 0; /* total of all resource set mins */ 934 uint64_t num_to_deal = 0; /* total resources above mins to deal */ 935 uint64_t sets_maxed = 0; /* number of resource sets dealt to */ 936 /* their max */ 937 uint64_t sets_finished = 0; /* number of resource sets that have */ 938 /* size == newsize */ 939 int donor, receiver; 940 int deal; 941 int j; 942 int ret = PO_SUCCESS; 943 944 /* 945 * Build list of res_info_t's 946 */ 947 if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) { 948 pool_seterror(POE_SYSTEM); 949 return (PO_FAIL); 950 } 951 952 /* Order resources by importance, most important being first */ 953 qsort(res, nelem, sizeof (pool_resource_t *), 954 resource_compare_by_descending_importance); 955 956 for (j = 0; j < nelem; j++) { 957 958 /* Track which resource is the default */ 959 if (default_res == NULL && 960 resource_is_default(res[j]) == PO_TRUE) { 961 default_res = res[j]; 962 default_res_info = &(res_info[j]); 963 } 964 965 /* Load sets' current values */ 966 if (resource_get_max(res[j], &res_info[j].ri_max) == PO_FAIL || 967 resource_get_min(res[j], &res_info[j].ri_min) == PO_FAIL || 968 resource_get_size(res[j], &res_info[j].ri_oldsize) == 969 PO_FAIL || 970 resource_get_pinned(res[j], 971 &res_info[j].ri_pinned) == PO_FAIL) { 972 free(res_info); 973 return (PO_FAIL); 974 } 975 976 /* Start each set's newsize out at their min. */ 977 res_info[j].ri_newsize = res_info[j].ri_min; 978 979 /* pre-deal pinned resources that exceed min */ 980 if (res_info[j].ri_pinned > res_info[j].ri_min) { 981 res_info[j].ri_newsize = res_info[j].ri_pinned; 982 res_info[j].ri_dealt = 983 res_info[j].ri_newsize - res_info[j].ri_min; 984 } 985 res_info[j].ri_res = res[j]; 986 987 /* Compute total number of resources to deal out */ 988 tot_resources += res_info[j].ri_oldsize; 989 tot_min += res_info[j].ri_newsize; 990 991 #ifdef DEBUG 992 dprintf("res allocation details\n"); 993 pool_elem_dprintf(TO_ELEM(res[j])); 994 dprintf("size=%llu\n", res_info[j].ri_oldsize); 995 #endif /* DEBUG */ 996 } 997 998 num_to_deal = tot_resources - tot_min; 999 1000 /* 1001 * Deal one resource to each set, and then another, until all 1002 * resources are dealt or all sets are at their max. 1003 */ 1004 for (deal = 1; num_to_deal > 0 && sets_maxed < nelem; deal++) { 1005 for (j = 0; j < nelem; j++) { 1006 1007 /* 1008 * Skip this resource set if it has already been 1009 * pre-dealt a resource due to pinned resources. 1010 */ 1011 if (res_info[j].ri_dealt >= deal) 1012 continue; 1013 1014 if (res_info[j].ri_newsize < res_info[j].ri_max) { 1015 1016 res_info[j].ri_dealt++; 1017 res_info[j].ri_newsize++; 1018 if (res_info[j].ri_newsize == 1019 res_info[j].ri_max) 1020 sets_maxed++; 1021 1022 num_to_deal--; 1023 if (num_to_deal == 0) 1024 break; 1025 } 1026 } 1027 } 1028 1029 /* 1030 * If all resource sets are at their max, deal the remaining to the 1031 * default resource set. 1032 */ 1033 if ((sets_maxed == nelem) && (num_to_deal > 0)) { 1034 default_res_info->ri_dealt += num_to_deal; 1035 default_res_info->ri_newsize += num_to_deal; 1036 } 1037 1038 /* 1039 * Sort so that resource sets needing resources preced resource sets 1040 * that have extra resources. The sort function will also compute 1041 * The quantity of resources that need to be transfered into or out 1042 * of each set so that it's size == newsize. 1043 */ 1044 qsort(res_info, nelem, sizeof (res_info_t), 1045 compute_size_to_transfer); 1046 1047 /* 1048 * The donor index starts at the end of the resource set list and 1049 * walks up. The receiver index starts at the beginning of the 1050 * resource set list and walks down. Cpu's are transfered from the 1051 * donors to the receivers until all sets have transfer == 0). 1052 */ 1053 donor = nelem - 1; 1054 receiver = 0; 1055 1056 /* Number of sets with transfer == 0 */ 1057 sets_finished = 0; 1058 1059 /* Tranfer resources so that each set's size becomes newsize */ 1060 for (;;) { 1061 1062 uint64_t ntrans; 1063 if (donor == receiver) { 1064 if (res_info[donor].ri_transfer != 0) { 1065 free(res_info); 1066 return (PO_FAIL); 1067 } 1068 sets_finished++; 1069 break; 1070 } 1071 if (res_info[donor].ri_transfer == 0) { 1072 sets_finished++; 1073 donor--; 1074 continue; 1075 } 1076 if (res_info[receiver].ri_transfer == 0) { 1077 sets_finished++; 1078 receiver++; 1079 continue; 1080 } 1081 1082 /* Transfer resources from the donor set to the receiver */ 1083 ntrans = MIN(res_info[donor].ri_transfer, 1084 -res_info[receiver].ri_transfer); 1085 1086 if (pool_resource_transfer( 1087 TO_CONF(TO_ELEM(res_info[donor].ri_res)), 1088 res_info[donor].ri_res, res_info[receiver].ri_res, 1089 ntrans) != PO_SUCCESS) { 1090 free(res_info); 1091 return (PO_FAIL); 1092 } 1093 res_info[donor].ri_transfer -= ntrans; 1094 res_info[receiver].ri_transfer += ntrans; 1095 } 1096 1097 if (sets_finished != nelem) 1098 ret = PO_FAIL; 1099 1100 free(res_info); 1101 return (ret); 1102 } 1103 1104 /* 1105 * Used as a qsort parameter to help order resources in terms of their 1106 * importance, higher importance being first. 1107 */ 1108 int 1109 resource_compare_by_descending_importance(const void *arg1, const void *arg2) 1110 { 1111 pool_elem_t *elem1; 1112 pool_elem_t *elem2; 1113 pool_resource_t **res1 = (pool_resource_t **)arg1; 1114 pool_resource_t **res2 = (pool_resource_t **)arg2; 1115 pool_value_t val = POOL_VALUE_INITIALIZER; 1116 int64_t i1 = 0, i2 = 0; 1117 1118 elem1 = TO_ELEM(*res1); 1119 elem2 = TO_ELEM(*res2); 1120 1121 if (pool_get_property(TO_CONF(elem1), elem1, "_importance", &val) == 1122 POC_INT) 1123 (void) pool_value_get_int64(&val, &i1); 1124 1125 if (pool_get_property(TO_CONF(elem2), elem2, "_importance", &val) == 1126 POC_INT) 1127 (void) pool_value_get_int64(&val, &i2); 1128 return (i1 > i2 ? -1 : (i1 < i2 ? 1 : 0)); 1129 } 1130 1131 /* 1132 * Sort in increasing order so that resource sets with extra resources are at 1133 * the end and resource sets needing resources are at the beginning. 1134 */ 1135 int 1136 compute_size_to_transfer(const void *arg1, const void *arg2) 1137 { 1138 res_info_t *r1 = (res_info_t *)arg1, *r2 = (res_info_t *)arg2; 1139 r1->ri_transfer = (int64_t)r1->ri_oldsize - (int64_t)r1->ri_newsize; 1140 r2->ri_transfer = (int64_t)r2->ri_oldsize - (int64_t)r2->ri_newsize; 1141 return (r1->ri_transfer > r2->ri_transfer ? 1 : 1142 (r1->ri_transfer < r2->ri_transfer ? -1 : 0)); 1143 } 1144 1145 /* 1146 * set_importance_cb() is used to create "_importance" props on each 1147 * resource associated with a pool. 1148 * 1149 * Returns PO_SUCCESS/PO_FAIL 1150 */ 1151 /*ARGSUSED*/ 1152 static int 1153 set_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused) 1154 { 1155 pool_value_t val = POOL_VALUE_INITIALIZER; 1156 int64_t importance; 1157 pool_resource_t **res; 1158 uint_t nelem, i; 1159 1160 if (pool_get_property(conf, TO_ELEM(pool), "pool.importance", &val) != 1161 POC_INT) { 1162 pool_seterror(POE_INVALID_CONF); 1163 return (PO_FAIL); 1164 } 1165 (void) pool_value_get_int64(&val, &importance); 1166 if ((res = pool_query_pool_resources(conf, pool, &nelem, NULL)) == 1167 NULL) { 1168 return (PO_FAIL); 1169 } 1170 for (i = 0; res[i] != NULL; i++) { 1171 int64_t old_importance = INT64_MIN; 1172 pool_elem_t *elem = TO_ELEM(res[i]); 1173 1174 if (pool_get_property(conf, elem, "_importance", &val) == 1175 POC_INT) 1176 (void) pool_value_get_int64(&val, &old_importance); 1177 if (old_importance <= importance) { 1178 (void) pool_value_set_int64(&val, importance); 1179 (void) pool_put_property(conf, elem, "_importance", 1180 &val); 1181 } 1182 } 1183 free(res); 1184 return (PO_SUCCESS); 1185 } 1186 1187 /* 1188 * unset_importance_cb() is used to remove "_importance" props from 1189 * each resource associated with a pool. 1190 * 1191 * Returns PO_SUCCESS/PO_FAIL 1192 */ 1193 /*ARGSUSED*/ 1194 static int 1195 unset_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused) 1196 { 1197 pool_resource_t **res; 1198 uint_t nelem, i; 1199 1200 if ((res = pool_query_pool_resources(conf, pool, &nelem, NULL)) == 1201 NULL) { 1202 return (PO_FAIL); 1203 } 1204 for (i = 0; res[i] != NULL; i++) { 1205 if (pool_rm_property(conf, TO_ELEM(res[i]), "_importance") == 1206 PO_FAIL) { 1207 free(res); 1208 return (PO_FAIL); 1209 } 1210 } 1211 free(res); 1212 return (PO_SUCCESS); 1213 } 1214 1215 /* 1216 * add_importance_props() is used to create "_importance" props on 1217 * each resource associated with a pool. 1218 * 1219 * Returns PO_SUCCESS/PO_FAIL 1220 */ 1221 static int 1222 add_importance_props(pool_conf_t *conf) 1223 { 1224 return (pool_walk_pools(conf, NULL, set_importance_cb)); 1225 } 1226 1227 /* 1228 * remove_importance_props() is used to remove "_importance" props on 1229 * each resource associated with a pool. 1230 * 1231 * Returns PO_SUCCESS/PO_FAIL 1232 */ 1233 static int 1234 remove_importance_props(pool_conf_t *conf) 1235 { 1236 return (pool_walk_pools(conf, NULL, unset_importance_cb)); 1237 } 1238 1239 /* 1240 * pool_conf_commit_sys() takes a configuration and modifies both the 1241 * supplied configuration and the dynamic configuration. The goal of 1242 * this modification is to generate a dynamic configuration which best 1243 * represents the constraints laid down in the static configuration 1244 * and to update the static configuration with the results of this 1245 * process. 1246 * 1247 * Returns PO_SUCCESS/PO_FAIL 1248 */ 1249 int 1250 pool_conf_commit_sys(pool_conf_t *conf, int validate) 1251 { 1252 pool_conf_t *dyn; 1253 1254 if ((dyn = pool_conf_alloc()) == NULL) 1255 return (PO_FAIL); 1256 if (pool_conf_open(dyn, pool_dynamic_location(), PO_RDWR) != 1257 PO_SUCCESS) { 1258 pool_conf_free(dyn); 1259 return (PO_FAIL); 1260 } 1261 if (validate == PO_TRUE) { 1262 if (pool_conf_validate(conf, POV_RUNTIME) != PO_SUCCESS) { 1263 (void) pool_conf_close(dyn); 1264 pool_conf_free(dyn); 1265 return (PO_FAIL); 1266 } 1267 } 1268 /* 1269 * Now try to make the two things "the same". 1270 */ 1271 if (diff_and_fix(conf, dyn) != PO_SUCCESS) { 1272 (void) pool_conf_close(dyn); 1273 pool_conf_free(dyn); 1274 pool_seterror(POE_INVALID_CONF); 1275 return (PO_FAIL); 1276 } 1277 if (dyn->pc_prov->pc_commit(dyn) != PO_SUCCESS) { 1278 (void) pool_conf_close(dyn); 1279 pool_conf_free(dyn); 1280 return (PO_FAIL); 1281 } 1282 (void) pool_conf_close(dyn); 1283 pool_conf_free(dyn); 1284 return (PO_SUCCESS); 1285 } 1286 1287 /* 1288 * Copies all properties from one element to another. If the property 1289 * is a readonly property, then don't copy it. 1290 */ 1291 /* ARGSUSED */ 1292 static int 1293 clone_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, 1294 pool_value_t *pv, void *user) 1295 { 1296 pool_elem_t *tgt = (pool_elem_t *)user; 1297 const pool_prop_t *prop; 1298 #ifdef DEBUG 1299 dprintf("Cloning %s from %s\n", 1300 pool_conf_location(TO_CONF(TO_ELEM(tgt))), 1301 pool_conf_location(TO_CONF(pe))); 1302 assert(TO_CONF(TO_ELEM(tgt)) != TO_CONF(pe)); 1303 dprintf("clone_element: Processing %s\n", name); 1304 pool_value_dprintf(pv); 1305 #endif /* DEBUG */ 1306 /* 1307 * Some properties should be ignored 1308 */ 1309 if ((prop = provider_get_prop(pe, name)) != NULL && 1310 prop_is_readonly(prop) == PO_TRUE) 1311 return (PO_SUCCESS); 1312 1313 /* The temporary property needs special handling */ 1314 if (strstr(name, ".temporary") != NULL) 1315 return (pool_set_temporary(TO_CONF(tgt), tgt) == 1316 PO_FAIL ? PO_FAIL : PO_SUCCESS); 1317 else 1318 return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == 1319 PO_FAIL ? PO_FAIL : PO_SUCCESS); 1320 } 1321 1322 /* 1323 * Removes all properties from one element. Properties which are 1324 * managed by the configuration are ignored. 1325 */ 1326 /* ARGSUSED3 */ 1327 static int 1328 clean_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, 1329 pool_value_t *pv, void *user) 1330 { 1331 const pool_prop_t *prop; 1332 /* 1333 * Some properties should be ignored 1334 */ 1335 if (strstr(name, ".temporary") != NULL || 1336 ((prop = provider_get_prop(pe, name)) != NULL && 1337 prop_is_optional(prop) == PO_FALSE)) 1338 return (PO_SUCCESS); 1339 return (pool_rm_property(conf, (pool_elem_t *)pe, name) == PO_FAIL); 1340 } 1341