/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * These functions implement the process of commitment for a pool
 * configuration. This process can be described as taking instructions
 * from a static configuration file and using the information about
 * the target system contained in the dynamic configuration to make
 * decisions about how best to allocate resources to meet the
 * constraints specified in the static configuration file.
 *
 * Mechanically, this process relies upon ordering the individual
 * components of the file and stepping through the lists of components
 * and taking actions depending on their type and which file they are
 * part of.
 *
 * Configuration components can be broken down into different types
 * which are then treated according to the following table:
 *
 * Element Type		Action
 * system || pool ||
 * res_comp || res_agg	If the element is a required element, then create or
 *			update it (don't destroy required elements in the
 *			static configuration) otherwise manipulate the
 *			dynamic configuration to create, destroy or update
 *			the element on the system.
 * comp			Create, destroy or update the static configuration
 *			component.
 *
 * The treatment of the different elements reflects the fact that all
 * elements other than comp are configurable and thus libpool can
 * create, destroy and modify these elements at will. comp elements
 * reflect the disposition of the system, these elements can be moved
 * around but they can't be created or destroyed in the dynamic
 * configuration in the commit process. comp elements can be created
 * and destroyed in the static configuration file as a result of a
 * commit operation, since it's possible for a comp to not appear in
 * the dynamic configuration. For instance, if the static
 * configuration file was created on a different machine or after a DR
 * operation which has removed or added components.
 *
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <errno.h>
#include <string.h>
#include <limits.h>
#include <unistd.h>

#include <pool.h>
#include "pool_internal.h"
#include "pool_impl.h"

/*
 * Two-argument minimum and maximum. Both are plain macros, so each
 * argument may be evaluated more than once; do not pass expressions
 * with side effects.
 */
#define	MIN(x, y) ((x) < (y) ? (x) : (y))
#define	MAX(x, y) ((x) > (y) ? (x) : (y))
/*
 * Numeric codes for the allocation methods; resource_allocate() maps
 * the "allocate-method" property string onto one of these values.
 */
#define	POA_IMPORTANCE_NUM		0
#define	POA_SURPLUS_TO_DEFAULT_NUM	1

/*
 * This resource specific structure is used to determine allocation of resources
 * during resource set allocation. Each set will receive its min, plus
 * some number of dealt resources based on the global allocation policy.
89 */ 90 typedef struct res_info { 91 pool_resource_t *ri_res; /* Resource set */ 92 uint64_t ri_min; /* Resource set's low watermark */ 93 uint64_t ri_max; /* Resource set's high watermark */ 94 uint64_t ri_oldsize; /* Size of resource set at the start */ 95 uint64_t ri_newsize; /* New resource set size allocated */ 96 uint64_t ri_pinned; /* Count of pinned resources in set */ 97 uint64_t ri_dealt; /* Count of resources dealt to set */ 98 int64_t ri_transfer; /* oldsize - newsize */ 99 /* The signed quantity of resources */ 100 /* to tranfer into or out of this */ 101 /* resource set */ 102 /* + transfer: tranfer resources out */ 103 /* - transfer: tranfer resources in */ 104 } res_info_t; 105 106 /* 107 * diff_and_fix operations 108 */ 109 static int commit_create(pool_conf_t *, pool_elem_t **); 110 static int commit_delete(pool_elem_t *); 111 static int commit_update(pool_elem_t *, pool_elem_t *, int); 112 113 /* 114 * configuration commit processing 115 */ 116 static int diff_and_fix(pool_conf_t *, pool_conf_t *); 117 static int process_elem_lt(pool_elem_t *, pool_conf_t *); 118 static int process_elem_gt(pool_elem_t *, pool_conf_t *, 119 pool_conf_t *); 120 static int process_lists(int, pool_conf_t *, 121 pool_conf_t *, int); 122 static pool_elem_t **get_elem_list(const pool_conf_t *, int, uint_t *); 123 static int share_resources(pool_conf_t *); 124 static int resource_allocate(const char *, pool_resource_t **, 125 uint_t); 126 static int resource_allocate_default(pool_resource_t **, uint_t); 127 static int pset_allocate_imp(pool_resource_t **, uint_t); 128 static int resource_compare_by_descending_importance(const void *, 129 const void *); 130 static int compute_size_to_transfer(const void *, const void *); 131 static int set_importance_cb(pool_conf_t *, pool_t *, void *); 132 static int unset_importance_cb(pool_conf_t *, pool_t *, void *); 133 static int add_importance_props(pool_conf_t *); 134 static int remove_importance_props(pool_conf_t *); 135 static 
int clone_element(pool_conf_t *, pool_elem_t *, 136 const char *, pool_value_t *, void *); 137 static int clean_element(pool_conf_t *, pool_elem_t *, 138 const char *, pool_value_t *, void *); 139 140 /* 141 * commit_create() is used to create a configuration element upon the 142 * system. Since only pools and resource actually need to perform any 143 * action, other elements are ignored as a no-op. 144 */ 145 static int 146 commit_create(pool_conf_t *conf, pool_elem_t **e1) 147 { 148 pool_resource_t *res; 149 pool_t *pool; 150 const char *res_type; 151 pool_elem_t *src = *e1; 152 uint64_t smin, smax, dmax; 153 pool_value_t val = POOL_VALUE_INITIALIZER; 154 char *name; 155 156 switch (pool_elem_class(src)) { 157 case PEC_SYSTEM: /* NO-OP */ 158 break; 159 case PEC_POOL: 160 name = elem_get_name(src); 161 if ((pool = pool_create(conf, name)) == NULL) { 162 free(name); 163 return (PO_FAIL); 164 } 165 free(name); 166 /* 167 * Now copy the properties from the original pool to the 168 * new one 169 */ 170 if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(pool), 171 clone_element) != PO_SUCCESS) 172 return (PO_FAIL); 173 /* 174 * Add a pointer to the src element which can be 175 * updated with a sys_id when the sys_id is allocated 176 * to the created element. 177 */ 178 pool_set_pair(TO_ELEM(pool), src); 179 *e1 = TO_ELEM(pool); 180 break; 181 case PEC_RES_COMP: 182 case PEC_RES_AGG: 183 name = elem_get_name(src); 184 res_type = pool_elem_class_string(src); 185 if ((res = pool_resource_create(conf, res_type, name)) == 186 NULL) { 187 free(name); 188 return (PO_FAIL); 189 } 190 free(name); 191 /* 192 * Need to do some ordering of property updates. 193 * Compare the values of source min/max and 194 * destination min/max. If smin < dmax then update the 195 * smin first, else update the max first. 
196 */ 197 if (resource_get_min(pool_elem_res(src), &smin) != PO_SUCCESS || 198 resource_get_max(pool_elem_res(src), &smax) != PO_SUCCESS || 199 resource_get_max(res, &dmax) != PO_SUCCESS) 200 return (PO_FAIL); 201 if (smin < dmax) { 202 pool_value_set_uint64(&val, smin); 203 if (pool_put_ns_property(TO_ELEM(res), c_min_prop, 204 &val) != PO_SUCCESS) 205 return (PO_FAIL); 206 } else { 207 pool_value_set_uint64(&val, smax); 208 if (pool_put_ns_property(TO_ELEM(res), c_max_prop, 209 &val) != PO_SUCCESS) 210 return (PO_FAIL); 211 } 212 /* 213 * Now copy the properties from the original resource 214 * to the new one 215 */ 216 if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(res), 217 clone_element) != PO_SUCCESS) 218 return (PO_FAIL); 219 /* 220 * Add a pointer to the src element which can be 221 * updated with a sys_id when the sys_id is allocated 222 * to the created element. 223 */ 224 pool_set_pair(TO_ELEM(res), src); 225 *e1 = TO_ELEM(res); 226 break; 227 case PEC_COMP: /* NO-OP */ 228 break; 229 default: 230 return (PO_FAIL); 231 } 232 return (PO_SUCCESS); 233 } 234 235 236 /* 237 * commit_delete() is used to delete a configuration element upon the 238 * system. Since only pools and resources actually need to perform 239 * any action, other elements are ignored as a no-op. 240 */ 241 static int 242 commit_delete(pool_elem_t *pe) 243 { 244 pool_resource_t *res; 245 pool_t *pool; 246 int ret = 0; 247 248 switch (pool_elem_class(pe)) { 249 case PEC_SYSTEM: /* NO-OP */ 250 break; 251 case PEC_POOL: 252 pool = pool_elem_pool(pe); 253 ret = pool_destroy(TO_CONF(pe), pool); 254 break; 255 case PEC_RES_COMP: 256 case PEC_RES_AGG: 257 res = pool_elem_res(pe); 258 ret = pool_resource_destroy(TO_CONF(pe), res); 259 break; 260 case PEC_COMP: /* NO-OP */ 261 break; 262 default: 263 return (PO_FAIL); 264 } 265 return (ret); 266 } 267 268 /* 269 * commit_update() is used to update a configuration element upon the 270 * system or in a static configuration file. 
* The pass parameter
 * governs whether properties are being updated or associations. In
 * pass 0, properties are updated. If the element is of class
 * PEC_COMP, then make sure that the element in the static
 * configuration file is correctly located before proceeding with the
 * update. Then, the element in the dynamic configuration file is
 * updated. In pass 1, ie. pass != 0, any pool components have their
 * associations updated in the dynamic configuration.
 *
 * process_lists() calls this with e1 taken from the static
 * configuration and e2 taken from the dynamic configuration.
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
static int
commit_update(pool_elem_t *e1, pool_elem_t *e2, int pass)
{
	if (pass == 0) {
		pool_resource_t *res1;
		pool_resource_t *res2;
		if (pool_elem_class(e1) == PEC_COMP) {
			res1 = pool_get_owning_resource(TO_CONF(e1),
			    pool_elem_comp(e1));
			res2 = pool_get_owning_resource(TO_CONF(e2),
			    pool_elem_comp(e2));
			/*
			 * The owning resources have different names, so
			 * move the component in e2's configuration into
			 * the resource named like e1's owner.
			 */
			if (pool_elem_compare_name(TO_ELEM(res1),
			    TO_ELEM(res2)) != 0) {
				char *name;
				const pool_resource_t *newres;
				/* NULL terminated list holding one component */
				pool_component_t *comps[2] = { NULL };

				comps[0] = pool_elem_comp(e2);
				name = elem_get_name(TO_ELEM(res1));
				newres = pool_get_resource(TO_CONF(e2),
				    pool_elem_class_string(TO_ELEM(res1)),
				    name);
				free(name);
				/*
				 * NOTE(review): the lookup is only
				 * checked by assert(); when asserts are
				 * compiled out a NULL newres reaches
				 * pool_resource_xtransfer() - confirm
				 * that this cannot happen.
				 */
				assert(newres);
#ifdef DEBUG
				dprintf("transferring: res, comp\n");
				pool_elem_dprintf(TO_ELEM(newres));
				pool_elem_dprintf(e2);
#endif	/* DEBUG */
				/* The result of the transfer is ignored */
				(void) pool_resource_xtransfer(TO_CONF(e2),
				    res2, (pool_resource_t *)newres, comps);
			}
		}
		/* Run clean_element over e2's properties before copying */
		if (pool_walk_properties(TO_CONF(e2), e2, NULL,
		    clean_element) != PO_SUCCESS) {
			return (PO_FAIL);
		}
		/*
		 * Need to do some ordering of property updates if the
		 * element to be updated is a resource. Compare the
		 * values of source min/max and destination
		 * min/max. If smin < dmax then update the smin first,
		 * else update the max first.
		 */
		if (pool_elem_class(e1) == PEC_RES_COMP ||
		    pool_elem_class(e1) == PEC_RES_AGG) {
			uint64_t smin, smax, dmax;
			pool_value_t val = POOL_VALUE_INITIALIZER;

			if (resource_get_min(pool_elem_res(e1), &smin) !=
			    PO_SUCCESS ||
			    resource_get_max(pool_elem_res(e1), &smax) !=
			    PO_SUCCESS ||
			    resource_get_max(pool_elem_res(e2), &dmax) !=
			    PO_SUCCESS)
				return (PO_FAIL);
			if (smin < dmax) {
				pool_value_set_uint64(&val, smin);
				if (pool_put_ns_property(e2, c_min_prop,
				    &val) != PO_SUCCESS)
					return (PO_FAIL);
			} else {
				pool_value_set_uint64(&val, smax);
				if (pool_put_ns_property(e2, c_max_prop,
				    &val) != PO_SUCCESS)
					return (PO_FAIL);
			}
		}
		/*
		 * This next couple of steps needs some
		 * explanation. The first walk copies all the
		 * properties that are writeable from the static
		 * configuration to the dynamic configuration. The
		 * second walk copies all properties (writeable or
		 * not) from the dynamic configuration element back to
		 * the static configuration element. This ensures that
		 * updates from the static configuration element are
		 * correctly applied to the dynamic configuration and
		 * then the static configuration element is updated
		 * with the latest values of the read-only properties
		 * from the dynamic configuration element. The
		 * enforcing of permissions is performed in
		 * clone_element by its choice of property
		 * manipulation function.
		 */
		if (pool_walk_properties(TO_CONF(e1), e1, e2, clone_element) !=
		    PO_SUCCESS) {
			return (PO_FAIL);
		}
		if (pool_walk_properties(TO_CONF(e2), e2, e1, clone_element) !=
		    PO_SUCCESS) {
			return (PO_FAIL);
		}
	} else {
		if (pool_elem_class(e1) == PEC_POOL) {
			pool_resource_t **rs;
			uint_t nelem;
			int i;
			pool_value_t val = POOL_VALUE_INITIALIZER;
			pool_value_t *pvals[] = { NULL, NULL };

			/* Restrict the query to resources of type "pset" */
			pvals[0] = &val;
			if (pool_value_set_string(&val, "pset") != PO_SUCCESS ||
			    pool_value_set_name(&val, c_type) != PO_SUCCESS)
				return (PO_FAIL);
			if ((rs = pool_query_pool_resources(TO_CONF(e1),
			    pool_elem_pool(e1), &nelem, pvals)) != NULL) {
				for (i = 0; i < nelem; i++) {
					const pool_resource_t *tgt_res;
					char *res_name =
					    elem_get_name(TO_ELEM(rs[i]));

					/*
					 * Look the resource up by name
					 * in e2's configuration; if it
					 * is not found, use the result
					 * of get_default_resource().
					 */
					if ((tgt_res = pool_get_resource(
					    TO_CONF(e2), pool_elem_class_string(
					    TO_ELEM(rs[i])), res_name)) ==
					    NULL) {
						tgt_res = get_default_resource(
						    rs[i]);
					}
					free(res_name);
					if (pool_associate(TO_CONF(e2),
					    pool_elem_pool(e2), tgt_res) !=
					    PO_SUCCESS) {
						free(rs);
						return (PO_FAIL);
					}
				}
				free(rs);
			}
		}
	}
	return (PO_SUCCESS);
}

/*
 * diff_and_fix() works out the differences between two configurations
 * and modifies the state of the system to match the operations
 * required to bring the two configurations into sync.
 *
 * stc is the static configuration and dyn is the dynamic
 * configuration.
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
static int
diff_and_fix(pool_conf_t *stc, pool_conf_t *dyn)
{
	/*
	 * The ordering of the operations is significant, we must
	 * process the system element, then the pools elements, then
	 * the resource elements, then the resource components and
	 * finally the pools elements again (pass 1, associations).
	 *
	 * TODO
	 * PEC_RES_COMP are the only type of resources
	 * currently. When PEC_RES_AGG resources are added they must
	 * also be processed.
	 */
	if (process_lists(PEC_SYSTEM, stc, dyn, 0) != PO_SUCCESS) {
		return (PO_FAIL);
	}
	if (process_lists(PEC_POOL, stc, dyn, 0) != PO_SUCCESS) {
		return (PO_FAIL);
	}
	if (process_lists(PEC_RES_COMP, stc, dyn, 0) != PO_SUCCESS) {
		return (PO_FAIL);
	}
	if (process_lists(PEC_COMP, stc, dyn, 0) != PO_SUCCESS) {
		return (PO_FAIL);
	}
	if (process_lists(PEC_POOL, stc, dyn, 1) != PO_SUCCESS) {
		return (PO_FAIL);
	}
	/*
	 * Share the resources. It has to be called for both
	 * configurations to ensure that the configurations still look
	 * the same.
	 */
	if (share_resources(dyn) != PO_SUCCESS) {
		return (PO_FAIL);
	}
	if (share_resources(stc) != PO_SUCCESS) {
		return (PO_FAIL);
	}
	return (PO_SUCCESS);
}

/*
 * process_elem_lt() is called by process_lists() for an element of
 * the static list that has no counterpart in the dynamic list. A
 * component is destroyed with pool_component_destroy(); any other
 * element that is not a default element is created in dyn with
 * commit_create(). Default elements are left alone.
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
static int
process_elem_lt(pool_elem_t *pe, pool_conf_t *dyn)
{
	if (pool_elem_class(pe) == PEC_COMP) {
		if (pool_component_destroy(pool_elem_comp(pe)) == PO_FAIL) {
			return (PO_FAIL);
		}
	} else if (! elem_is_default(pe)) {
		if (commit_create(dyn, &pe) != PO_SUCCESS) {
			return (PO_FAIL);
		}
	}
	return (PO_SUCCESS);
}

/*
 * process_elem_gt() is called by process_lists() for an element of
 * the dynamic list that has no counterpart in the static list. A
 * component is copied into the static configuration, a default pool
 * or resource is created in the static configuration, and any other
 * element is removed with commit_delete().
 *
 * Returns PO_SUCCESS/PO_FAIL.
 */
static int
process_elem_gt(pool_elem_t *pe, pool_conf_t *stc, pool_conf_t *dyn)
{
	if (pool_elem_class(pe) == PEC_COMP) {
		pool_resource_t *owner;
		const pool_resource_t *parent_res;
		pool_value_t val = POOL_VALUE_INITIALIZER;
		const pool_component_t *newcomp;
		const char *resname;
		const char *restype;
		/*
		 * I have to find the right parent in the static
		 * configuration. It may not exist, in which case it's
		 * correct to put it in the default
		 */
		owner = pool_get_owning_resource(dyn,
		    pool_elem_comp(pe));
		if (pool_get_ns_property(TO_ELEM(owner), "name", &val) ==
		    POC_INVAL)
			return (PO_FAIL);

		if (pool_value_get_string(&val, &resname) == PO_FAIL)
			return (PO_FAIL);

		/* Work on a private copy of the name; freed below */
		if ((resname = strdup(resname)) == NULL)
			return (PO_FAIL);

		restype = pool_elem_class_string(TO_ELEM(owner));
		parent_res = pool_get_resource(stc, restype, resname);
		free((void *)resname);
		if (parent_res == NULL)
			parent_res = resource_by_sysid(stc, PS_NONE, restype);
		/*
		 * Now need to make a copy of the component in the
		 * dynamic configuration in the static configuration.
		 */
		if ((newcomp = pool_component_create(stc, parent_res,
		    elem_get_sysid(pe))) == NULL)
			return (PO_FAIL);

		if (pool_walk_properties(TO_CONF(pe), pe, TO_ELEM(newcomp),
		    clone_element) != PO_SUCCESS)
			return (PO_FAIL);
	} else if (elem_is_default(pe)) {
		pool_resource_t *newres;
		pool_t *newpool;
		char *name;

		if ((name = elem_get_name(pe)) == NULL)
			return (PO_FAIL);
		/* name is freed on every path through the switch */
		switch (pool_elem_class(pe)) {
		case PEC_POOL:
			if ((newpool = pool_create(stc, name)) == NULL) {
				free(name);
				return (PO_FAIL);
			}
			free(name);
			if (pool_walk_properties(TO_CONF(pe), pe,
			    TO_ELEM(newpool), clone_element) != PO_SUCCESS)
				return (PO_FAIL);
			break;
		case PEC_RES_AGG:
		case PEC_RES_COMP:
			if ((newres = pool_resource_create(stc,
			    pool_elem_class_string(pe), name)) ==
			    NULL) {
				free(name);
				return (PO_FAIL);
			}
			free(name);
			if (pool_walk_properties(TO_CONF(pe), pe,
			    TO_ELEM(newres), clone_element) != PO_SUCCESS)
				return (PO_FAIL);
			break;
		default:
			free(name);
			break;
		}
	} else {
		if (commit_delete(pe) != PO_SUCCESS)
			return (PO_FAIL);
	}
	return (PO_SUCCESS);
}

/*
 * This function
compares the elements of the supplied type in the 565 * static and dynamic configurations supplied. The lists of elements 566 * are compared and used to create, delete and updated elements in 567 * both the static and dynamic configurations. The pass parameter is 568 * used to indicate to commit_update() whether property updates or 569 * association updates should be performed. 570 */ 571 static int 572 process_lists(int type, pool_conf_t *stc, pool_conf_t *dyn, int pass) 573 { 574 uint_t stc_nelem = 0, dyn_nelem = 0; 575 pool_elem_t **stc_elems, **dyn_elems; 576 int i, j; 577 int status = PO_SUCCESS; 578 579 if ((stc_elems = get_elem_list(stc, type, &stc_nelem)) == NULL) 580 return (PO_FAIL); 581 582 qsort(stc_elems, stc_nelem, sizeof (pool_elem_t *), 583 qsort_elem_compare); 584 585 if ((dyn_elems = get_elem_list(dyn, type, &dyn_nelem)) == NULL) { 586 free(stc_elems); 587 return (PO_FAIL); 588 } 589 590 qsort(dyn_elems, dyn_nelem, sizeof (pool_elem_t *), 591 qsort_elem_compare); 592 /* 593 * Step through and do the updating, remember that we are 594 * comparing using the compare function for the configuration 595 * and that is fixed. 596 */ 597 i = j = 0; 598 while (status == PO_SUCCESS && i < stc_nelem && j < dyn_nelem) { 599 int compare; 600 /* 601 * We are going to do this by stepping through the static 602 * list first. 
603 */ 604 if (elem_is_default(stc_elems[i]) && 605 elem_is_default(dyn_elems[j])) 606 compare = 0; 607 else 608 compare = pool_elem_compare_name(stc_elems[i], 609 dyn_elems[j]); 610 if (compare < 0) { 611 status = process_elem_lt(stc_elems[i], dyn); 612 i++; 613 } else if (compare > 0) { 614 status = process_elem_gt(dyn_elems[j], stc, dyn); 615 j++; 616 } else { /* compare == 0 */ 617 if (commit_update(stc_elems[i], dyn_elems[j], pass) 618 != PO_SUCCESS) { 619 status = PO_FAIL; 620 } 621 i++; 622 j++; 623 } 624 } 625 if (status == PO_FAIL) { 626 free(stc_elems); 627 free(dyn_elems); 628 return (PO_FAIL); 629 } 630 while (status == PO_SUCCESS && i < stc_nelem) { 631 status = process_elem_lt(stc_elems[i], dyn); 632 i++; 633 } 634 if (status == PO_FAIL) { 635 free(stc_elems); 636 free(dyn_elems); 637 return (PO_FAIL); 638 } 639 while (status == PO_SUCCESS && j < dyn_nelem) { 640 status = process_elem_gt(dyn_elems[j], stc, dyn); 641 j++; 642 } 643 free(stc_elems); 644 free(dyn_elems); 645 return (status); 646 } 647 648 /* 649 * get_elem_list() returns a list of pool_elem_t's. The size of the 650 * list is written into nelem. The list contains elements of all types 651 * that pools is interested in: i.e. system, pool, resources and 652 * resource components. It is the caller's responsibility to free the 653 * list when it is finished with. 654 * 655 * The array of pointers returned by the type specific query can be 656 * safely cast to be an array of pool_elem_t pointers. In the case of 657 * PEC_RES_COMP some additional processing is required to qualify the 658 * list of elements. 659 * 660 * Returns a pointer to a list of pool_elem_t's or NULL on failure. 
661 */ 662 static pool_elem_t ** 663 get_elem_list(const pool_conf_t *conf, int type, uint_t *nelem) 664 { 665 pool_resource_t **rl; 666 pool_t **pl; 667 pool_component_t **cl; 668 pool_elem_t **elems = NULL; 669 int i; 670 671 switch (type) { 672 case PEC_SYSTEM: 673 if ((elems = malloc(sizeof (pool_elem_t *))) == NULL) 674 return (NULL); 675 *nelem = 1; 676 elems[0] = pool_conf_to_elem(conf); 677 break; 678 case PEC_POOL: 679 if ((pl = pool_query_pools(conf, nelem, NULL)) != NULL) { 680 elems = (pool_elem_t **)pl; 681 } 682 break; 683 case PEC_RES_COMP: 684 if ((rl = pool_query_resources(conf, nelem, NULL)) != NULL) { 685 int j = 0; 686 elems = (pool_elem_t **)rl; 687 for (i = 0; i < *nelem; i++) { 688 if (pool_elem_class(TO_ELEM(rl[i])) == 689 PEC_RES_COMP) 690 elems[j++] = TO_ELEM(rl[i]); 691 } 692 *nelem = j; 693 } 694 break; 695 case PEC_COMP: 696 if ((cl = pool_query_components(conf, nelem, NULL)) != NULL) { 697 elems = (pool_elem_t **)cl; 698 } 699 break; 700 default: 701 abort(); 702 break; 703 } 704 return (elems); 705 } 706 707 /* 708 * share_resources() sets up the allocation of resources by each 709 * provider. Firstly all resources are updated with the importance of 710 * each pool, then each resource provider is invoked in turn with a 711 * list of it's own resources. Finally, the pool importance details 712 * are removed from the resources. 713 * 714 * Returns PO_SUCCESS/PO_FAIL 715 */ 716 static int 717 share_resources(pool_conf_t *conf) 718 { 719 pool_resource_t **resources; 720 uint_t nelem; 721 pool_value_t *props[] = { NULL, NULL }; 722 pool_value_t val = POOL_VALUE_INITIALIZER; 723 724 props[0] = &val; 725 726 /* 727 * Call an allocation function for each type of supported resource. 728 * This function is responsible for "sharing" resources to resource 729 * sets as determined by the system.allocate-method. 
730 */ 731 732 if (pool_value_set_string(props[0], "pset") != PO_SUCCESS || 733 pool_value_set_name(props[0], c_type) != PO_SUCCESS) 734 return (PO_FAIL); 735 736 if (add_importance_props(conf) != PO_SUCCESS) { 737 (void) remove_importance_props(conf); 738 return (PO_FAIL); 739 } 740 741 if ((resources = pool_query_resources(conf, &nelem, props)) != NULL) { 742 /* 743 * 'pool.importance' defines the importance of a pool; 744 * resources inherit the importance of the pool that 745 * is associated with them. If more than one pool is 746 * associated with a resource, the importance of the 747 * resource is the maximum importance of all 748 * associated pools. Use '_importance' on resources 749 * to determine who gets extra. 750 */ 751 if (resource_allocate("pset", resources, nelem) != PO_SUCCESS) { 752 free(resources); 753 (void) remove_importance_props(conf); 754 return (PO_FAIL); 755 } 756 } 757 free(resources); 758 (void) remove_importance_props(conf); 759 return (PO_SUCCESS); 760 } 761 762 763 /* 764 * Work out which allocation method to use based on the value of the 765 * system.allocate-method property. 
766 */ 767 int 768 resource_allocate(const char *type, pool_resource_t **res, uint_t nelem) 769 { 770 pool_elem_t *pe; 771 const char *method_name; 772 uint64_t method; 773 pool_value_t val = POOL_VALUE_INITIALIZER; 774 int ret; 775 776 pe = pool_conf_to_elem(TO_CONF(TO_ELEM(res[0]))); 777 778 if (pool_get_ns_property(pe, "allocate-method", &val) != POC_STRING) 779 method_name = POA_IMPORTANCE; 780 else { 781 (void) pool_value_get_string(&val, &method_name); 782 } 783 if (strcmp(POA_IMPORTANCE, method_name) != 0) { 784 if (strcmp(POA_SURPLUS_TO_DEFAULT, method_name) != 0) { 785 pool_seterror(POE_INVALID_CONF); 786 return (PO_FAIL); 787 } else { 788 method = POA_SURPLUS_TO_DEFAULT_NUM; 789 } 790 } else { 791 method = POA_IMPORTANCE_NUM; 792 } 793 switch (method) { 794 case POA_IMPORTANCE_NUM: 795 /* 796 * TODO: Add support for new resource types 797 */ 798 switch (pool_resource_elem_class_from_string(type)) { 799 case PREC_PSET: 800 ret = pset_allocate_imp(res, nelem); 801 break; 802 default: 803 ret = PO_FAIL; 804 break; 805 } 806 break; 807 case POA_SURPLUS_TO_DEFAULT_NUM: 808 ret = resource_allocate_default(res, nelem); 809 break; 810 } 811 812 return (ret); 813 } 814 815 /* 816 * Each set will get its minimum, however if there is more than the 817 * total minimum available, then leave this in the default set. 818 */ 819 int 820 resource_allocate_default(pool_resource_t **res, uint_t nelem) 821 { 822 res_info_t *res_info; 823 uint_t j; 824 pool_resource_t *default_res = NULL; 825 826 if (nelem == 1) 827 return (PO_SUCCESS); 828 829 if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) { 830 return (PO_FAIL); 831 } 832 833 /* Load current resource values. 
*/ 834 for (j = 0; j < nelem; j++) { 835 836 if (default_res == NULL && 837 resource_is_default(res[j]) == PO_TRUE) 838 default_res = res[j]; 839 840 if (resource_get_max(res[j], 841 &res_info[j].ri_max) == PO_FAIL || 842 resource_get_min(res[j], 843 &res_info[j].ri_min) == PO_FAIL || 844 resource_get_size(res[j], 845 &res_info[j].ri_oldsize) == PO_FAIL || 846 resource_get_pinned(res[j], 847 &res_info[j].ri_pinned) == PO_FAIL) { 848 free(res_info); 849 return (PO_FAIL); 850 } 851 res_info[j].ri_res = res[j]; 852 } 853 854 /* 855 * Firstly, for all resources that have size greater than min, 856 * transfer all movable size above min to the default resource. 857 */ 858 for (j = 0; j < nelem; j++) { 859 860 uint64_t real_min; 861 862 /* compute the real minimum number of resources */ 863 real_min = MAX(res_info[j].ri_pinned, res_info[j].ri_min); 864 if (res_info[j].ri_res != default_res && 865 res_info[j].ri_oldsize > real_min) { 866 867 uint64_t num; 868 869 num = res_info[j].ri_oldsize - real_min; 870 if (pool_resource_transfer( 871 TO_CONF(TO_ELEM(default_res)), 872 res_info[j].ri_res, default_res, num) != 873 PO_SUCCESS) { 874 free(res_info); 875 return (PO_FAIL); 876 } 877 } 878 } 879 /* 880 * Now, transfer resources below min from the default. 881 */ 882 for (j = 0; j < nelem; j++) { 883 /* 884 * We don't want to interfere with resources which are reserved 885 */ 886 if (res_info[j].ri_res != default_res && 887 res_info[j].ri_oldsize < res_info[j].ri_min) { 888 if (pool_resource_transfer( 889 TO_CONF(TO_ELEM(default_res)), 890 default_res, res_info[j].ri_res, 891 res_info[j].ri_min - res_info[j].ri_oldsize) != 892 PO_SUCCESS) { 893 free(res_info); 894 return (PO_FAIL); 895 } 896 } 897 } 898 free(res_info); 899 return (PO_SUCCESS); 900 } 901 902 /* 903 * Allocate cpus to pset resource sets, favoring sets with higher importance. 
904 * 905 * Step 1: Sort resource sets by decreasing importance, and load each sets 906 * current size (oldsize), min, max, and number of pinned cpus. 907 * Compute the total number of cpus by totaling oldsize. 908 * 909 * Step 2: Compute the newsize for each set: 910 * 911 * Give each set its min number of cpus. This min may be greater than 912 * its pset.min due to pinned cpus. If there are more cpus than the total 913 * of all mins, then the surplus cpus are dealt round-robin to all sets 914 * (up to their max) in order of decreasing importance. A set may be 915 * skipped during dealing because it started with more than its min due to 916 * pinned cpus. The dealing stops when there are no more cpus or all 917 * sets are at their max. If all sets are at their max, any remaining cpus 918 * are given to the default set. 919 * 920 * Step 3: Transfer cpus from sets with (oldsize > newsize) to sets with 921 * (oldsize < newsize). 922 */ 923 int 924 pset_allocate_imp(pool_resource_t **res, uint_t nelem) 925 { 926 res_info_t *res_info; 927 res_info_t *default_res_info; 928 const pool_resource_t *default_res = NULL; 929 uint64_t tot_resources = 0; /* total count of resources */ 930 uint64_t tot_min = 0; /* total of all resource set mins */ 931 uint64_t num_to_deal = 0; /* total resources above mins to deal */ 932 uint64_t sets_maxed = 0; /* number of resource sets dealt to */ 933 /* their max */ 934 uint64_t sets_finished = 0; /* number of resource sets that have */ 935 /* size == newsize */ 936 int donor, receiver; 937 int deal; 938 int j; 939 int ret = PO_SUCCESS; 940 941 /* 942 * Build list of res_info_t's 943 */ 944 if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) { 945 pool_seterror(POE_SYSTEM); 946 return (PO_FAIL); 947 } 948 949 /* Order resources by importance, most important being first */ 950 qsort(res, nelem, sizeof (pool_resource_t *), 951 resource_compare_by_descending_importance); 952 953 for (j = 0; j < nelem; j++) { 954 955 /* Track which 
resource is the default */ 956 if (default_res == NULL && 957 resource_is_default(res[j]) == PO_TRUE) { 958 default_res = res[j]; 959 default_res_info = &(res_info[j]); 960 } 961 962 /* Load sets' current values */ 963 if (resource_get_max(res[j], &res_info[j].ri_max) == PO_FAIL || 964 resource_get_min(res[j], &res_info[j].ri_min) == PO_FAIL || 965 resource_get_size(res[j], &res_info[j].ri_oldsize) == 966 PO_FAIL || 967 resource_get_pinned(res[j], 968 &res_info[j].ri_pinned) == PO_FAIL) { 969 free(res_info); 970 return (PO_FAIL); 971 } 972 973 /* Start each set's newsize out at their min. */ 974 res_info[j].ri_newsize = res_info[j].ri_min; 975 976 /* pre-deal pinned resources that exceed min */ 977 if (res_info[j].ri_pinned > res_info[j].ri_min) { 978 res_info[j].ri_newsize = res_info[j].ri_pinned; 979 res_info[j].ri_dealt = 980 res_info[j].ri_newsize - res_info[j].ri_min; 981 } 982 res_info[j].ri_res = res[j]; 983 984 /* Compute total number of resources to deal out */ 985 tot_resources += res_info[j].ri_oldsize; 986 tot_min += res_info[j].ri_newsize; 987 988 #ifdef DEBUG 989 dprintf("res allocation details\n"); 990 pool_elem_dprintf(TO_ELEM(res[j])); 991 dprintf("size=%llu\n", res_info[j].ri_oldsize); 992 #endif /* DEBUG */ 993 } 994 995 num_to_deal = tot_resources - tot_min; 996 997 /* 998 * Deal one resource to each set, and then another, until all 999 * resources are dealt or all sets are at their max. 1000 */ 1001 for (deal = 1; num_to_deal > 0 && sets_maxed < nelem; deal++) { 1002 for (j = 0; j < nelem; j++) { 1003 1004 /* 1005 * Skip this resource set if it has already been 1006 * pre-dealt a resource due to pinned resources. 
1007 */ 1008 if (res_info[j].ri_dealt >= deal) 1009 continue; 1010 1011 if (res_info[j].ri_newsize < res_info[j].ri_max) { 1012 1013 res_info[j].ri_dealt++; 1014 res_info[j].ri_newsize++; 1015 if (res_info[j].ri_newsize == 1016 res_info[j].ri_max) 1017 sets_maxed++; 1018 1019 num_to_deal--; 1020 if (num_to_deal == 0) 1021 break; 1022 } 1023 } 1024 } 1025 1026 /* 1027 * If all resource sets are at their max, deal the remaining to the 1028 * default resource set. 1029 */ 1030 if ((sets_maxed == nelem) && (num_to_deal > 0)) { 1031 default_res_info->ri_dealt += num_to_deal; 1032 default_res_info->ri_newsize += num_to_deal; 1033 } 1034 1035 /* 1036 * Sort so that resource sets needing resources preced resource sets 1037 * that have extra resources. The sort function will also compute 1038 * The quantity of resources that need to be transfered into or out 1039 * of each set so that it's size == newsize. 1040 */ 1041 qsort(res_info, nelem, sizeof (res_info_t), 1042 compute_size_to_transfer); 1043 1044 /* 1045 * The donor index starts at the end of the resource set list and 1046 * walks up. The receiver index starts at the beginning of the 1047 * resource set list and walks down. Cpu's are transfered from the 1048 * donors to the receivers until all sets have transfer == 0). 
1049 */ 1050 donor = nelem - 1; 1051 receiver = 0; 1052 1053 /* Number of sets with transfer == 0 */ 1054 sets_finished = 0; 1055 1056 /* Tranfer resources so that each set's size becomes newsize */ 1057 for (;;) { 1058 1059 uint64_t ntrans; 1060 if (donor == receiver) { 1061 if (res_info[donor].ri_transfer != 0) { 1062 free(res_info); 1063 return (PO_FAIL); 1064 } 1065 sets_finished++; 1066 break; 1067 } 1068 if (res_info[donor].ri_transfer == 0) { 1069 sets_finished++; 1070 donor--; 1071 continue; 1072 } 1073 if (res_info[receiver].ri_transfer == 0) { 1074 sets_finished++; 1075 receiver++; 1076 continue; 1077 } 1078 1079 /* Transfer resources from the donor set to the receiver */ 1080 ntrans = MIN(res_info[donor].ri_transfer, 1081 -res_info[receiver].ri_transfer); 1082 1083 if (pool_resource_transfer( 1084 TO_CONF(TO_ELEM(res_info[donor].ri_res)), 1085 res_info[donor].ri_res, res_info[receiver].ri_res, 1086 ntrans) != PO_SUCCESS) { 1087 free(res_info); 1088 return (PO_FAIL); 1089 } 1090 res_info[donor].ri_transfer -= ntrans; 1091 res_info[receiver].ri_transfer += ntrans; 1092 } 1093 1094 if (sets_finished != nelem) 1095 ret = PO_FAIL; 1096 1097 free(res_info); 1098 return (ret); 1099 } 1100 1101 /* 1102 * Used as a qsort parameter to help order resources in terms of their 1103 * importance, higher importance being first. 
1104 */ 1105 int 1106 resource_compare_by_descending_importance(const void *arg1, const void *arg2) 1107 { 1108 pool_elem_t *elem1; 1109 pool_elem_t *elem2; 1110 pool_resource_t **res1 = (pool_resource_t **)arg1; 1111 pool_resource_t **res2 = (pool_resource_t **)arg2; 1112 pool_value_t val = POOL_VALUE_INITIALIZER; 1113 int64_t i1 = 0, i2 = 0; 1114 1115 elem1 = TO_ELEM(*res1); 1116 elem2 = TO_ELEM(*res2); 1117 1118 if (pool_get_property(TO_CONF(elem1), elem1, "_importance", &val) == 1119 POC_INT) 1120 (void) pool_value_get_int64(&val, &i1); 1121 1122 if (pool_get_property(TO_CONF(elem2), elem2, "_importance", &val) == 1123 POC_INT) 1124 (void) pool_value_get_int64(&val, &i2); 1125 return (i1 > i2 ? -1 : (i1 < i2 ? 1 : 0)); 1126 } 1127 1128 /* 1129 * Sort in increasing order so that resource sets with extra resources are at 1130 * the end and resource sets needing resources are at the beginning. 1131 */ 1132 int 1133 compute_size_to_transfer(const void *arg1, const void *arg2) 1134 { 1135 res_info_t *r1 = (res_info_t *)arg1, *r2 = (res_info_t *)arg2; 1136 r1->ri_transfer = (int64_t)r1->ri_oldsize - (int64_t)r1->ri_newsize; 1137 r2->ri_transfer = (int64_t)r2->ri_oldsize - (int64_t)r2->ri_newsize; 1138 return (r1->ri_transfer > r2->ri_transfer ? 1 : 1139 (r1->ri_transfer < r2->ri_transfer ? -1 : 0)); 1140 } 1141 1142 /* 1143 * set_importance_cb() is used to create "_importance" props on each 1144 * resource associated with a pool. 
1145 * 1146 * Returns PO_SUCCESS/PO_FAIL 1147 */ 1148 /*ARGSUSED*/ 1149 static int 1150 set_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused) 1151 { 1152 pool_value_t val = POOL_VALUE_INITIALIZER; 1153 int64_t importance; 1154 pool_resource_t **res; 1155 uint_t nelem, i; 1156 1157 if (pool_get_property(conf, TO_ELEM(pool), "pool.importance", &val) != 1158 POC_INT) { 1159 pool_seterror(POE_INVALID_CONF); 1160 return (PO_FAIL); 1161 } 1162 (void) pool_value_get_int64(&val, &importance); 1163 if ((res = pool_query_pool_resources(conf, pool, &nelem, NULL)) == 1164 NULL) { 1165 return (PO_FAIL); 1166 } 1167 for (i = 0; res[i] != NULL; i++) { 1168 int64_t old_importance = INT64_MIN; 1169 pool_elem_t *elem = TO_ELEM(res[i]); 1170 1171 if (pool_get_property(conf, elem, "_importance", &val) == 1172 POC_INT) 1173 (void) pool_value_get_int64(&val, &old_importance); 1174 if (old_importance <= importance) { 1175 (void) pool_value_set_int64(&val, importance); 1176 (void) pool_put_property(conf, elem, "_importance", 1177 &val); 1178 } 1179 } 1180 free(res); 1181 return (PO_SUCCESS); 1182 } 1183 1184 /* 1185 * unset_importance_cb() is used to remove "_importance" props from 1186 * each resource associated with a pool. 1187 * 1188 * Returns PO_SUCCESS/PO_FAIL 1189 */ 1190 /*ARGSUSED*/ 1191 static int 1192 unset_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused) 1193 { 1194 pool_resource_t **res; 1195 uint_t nelem, i; 1196 1197 if ((res = pool_query_pool_resources(conf, pool, &nelem, NULL)) == 1198 NULL) { 1199 return (PO_FAIL); 1200 } 1201 for (i = 0; res[i] != NULL; i++) { 1202 if (pool_rm_property(conf, TO_ELEM(res[i]), "_importance") == 1203 PO_FAIL) { 1204 free(res); 1205 return (PO_FAIL); 1206 } 1207 } 1208 free(res); 1209 return (PO_SUCCESS); 1210 } 1211 1212 /* 1213 * add_importance_props() is used to create "_importance" props on 1214 * each resource associated with a pool. 
1215 * 1216 * Returns PO_SUCCESS/PO_FAIL 1217 */ 1218 static int 1219 add_importance_props(pool_conf_t *conf) 1220 { 1221 return (pool_walk_pools(conf, NULL, set_importance_cb)); 1222 } 1223 1224 /* 1225 * remove_importance_props() is used to remove "_importance" props on 1226 * each resource associated with a pool. 1227 * 1228 * Returns PO_SUCCESS/PO_FAIL 1229 */ 1230 static int 1231 remove_importance_props(pool_conf_t *conf) 1232 { 1233 return (pool_walk_pools(conf, NULL, unset_importance_cb)); 1234 } 1235 1236 /* 1237 * pool_conf_commit_sys() takes a configuration and modifies both the 1238 * supplied configuration and the dynamic configuration. The goal of 1239 * this modification is to generate a dynamic configuration which best 1240 * represents the constraints laid down in the static configuration 1241 * and to update the static configuration with the results of this 1242 * process. 1243 * 1244 * Returns PO_SUCCESS/PO_FAIL 1245 */ 1246 int 1247 pool_conf_commit_sys(pool_conf_t *conf, int validate) 1248 { 1249 pool_conf_t *dyn; 1250 1251 if ((dyn = pool_conf_alloc()) == NULL) 1252 return (PO_FAIL); 1253 if (pool_conf_open(dyn, pool_dynamic_location(), PO_RDWR) != 1254 PO_SUCCESS) { 1255 pool_conf_free(dyn); 1256 return (PO_FAIL); 1257 } 1258 if (validate == PO_TRUE) { 1259 if (pool_conf_validate(conf, POV_RUNTIME) != PO_SUCCESS) { 1260 (void) pool_conf_close(dyn); 1261 pool_conf_free(dyn); 1262 return (PO_FAIL); 1263 } 1264 } 1265 /* 1266 * Now try to make the two things "the same". 
1267 */ 1268 if (diff_and_fix(conf, dyn) != PO_SUCCESS) { 1269 (void) pool_conf_close(dyn); 1270 pool_conf_free(dyn); 1271 pool_seterror(POE_INVALID_CONF); 1272 return (PO_FAIL); 1273 } 1274 if (dyn->pc_prov->pc_commit(dyn) != PO_SUCCESS) { 1275 (void) pool_conf_close(dyn); 1276 pool_conf_free(dyn); 1277 return (PO_FAIL); 1278 } 1279 (void) pool_conf_close(dyn); 1280 pool_conf_free(dyn); 1281 return (PO_SUCCESS); 1282 } 1283 1284 /* 1285 * Copies all properties from one element to another. If the property 1286 * is a readonly property, then don't copy it. 1287 */ 1288 /* ARGSUSED */ 1289 static int 1290 clone_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, 1291 pool_value_t *pv, void *user) 1292 { 1293 pool_elem_t *tgt = (pool_elem_t *)user; 1294 const pool_prop_t *prop; 1295 #ifdef DEBUG 1296 dprintf("Cloning %s from %s\n", 1297 pool_conf_location(TO_CONF(TO_ELEM(tgt))), 1298 pool_conf_location(TO_CONF(pe))); 1299 assert(TO_CONF(TO_ELEM(tgt)) != TO_CONF(pe)); 1300 dprintf("clone_element: Processing %s\n", name); 1301 pool_value_dprintf(pv); 1302 #endif /* DEBUG */ 1303 /* 1304 * Some properties should be ignored 1305 */ 1306 if ((prop = provider_get_prop(pe, name)) != NULL && 1307 prop_is_readonly(prop) == PO_TRUE) 1308 return (PO_SUCCESS); 1309 return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == PO_FAIL); 1310 } 1311 1312 /* 1313 * Removes all properties from one element. Properties which are 1314 * managed by the configuration are ignored. 1315 */ 1316 /* ARGSUSED3 */ 1317 static int 1318 clean_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, 1319 pool_value_t *pv, void *user) 1320 { 1321 const pool_prop_t *prop; 1322 /* 1323 * Some properties should be ignored 1324 */ 1325 if ((prop = provider_get_prop(pe, name)) != NULL && 1326 prop_is_optional(prop) == PO_FALSE) 1327 return (PO_SUCCESS); 1328 return (pool_rm_property(conf, (pool_elem_t *)pe, name) == PO_FAIL); 1329 } 1330