1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * These functions implement the process of commitment for a pool 31 * configuration. This process can be described as taking instructions 32 * from a static configuration file and using the information about 33 * the target system contained in the dynamic configuration to make 34 * decisions about how best to allocate resources to meet the 35 * constraints specified in the static configuration file. 36 * 37 * Mechanically, this process relies upon ordering the individual 38 * components of the file and stepping through the lists of components 39 * and taking actions depending on their type and which file they are 40 * part of. 41 * 42 * Configuration components can be broken down into different types 43 * which are then treated according to the following table: 44 * 45 * Element Type Action 46 * system || pool || 47 * res_comp || res_agg If the element is a required element, then create or 48 * update it (don't destroy required elements in the 49 * static configuration) otherwise manipulate the 50 * dynamic configuration to create, destroy or update 51 * the element on the system. 52 * comp Create, destroy or update the static configuration 53 * component. 54 * 55 * The treatment of the different elements reflects the fact that all 56 * elements other than comp are configurable and thus libpool can 57 * create, destroy and modify these elements at will. comp elements 58 * reflect the disposition of the system, these elements can be moved 59 * around but they can't be created or destroyed in the dynamic 60 * configuration in the commit process. comp elements can be created 61 * and destroyed in the static configuration file as a result of a 62 * commit operation, since it's possible for a comp to not appear in 63 * the dynamic configuration. For instance, if the static 64 * configuration file was created on a different machine or after a DR 65 * operation which has removed or added components. 66 * 67 */ 68 #include <assert.h> 69 #include <stdio.h> 70 #include <stdlib.h> 71 #include <sys/types.h> 72 #include <errno.h> 73 #include <string.h> 74 #include <limits.h> 75 #include <unistd.h> 76 77 #include <pool.h> 78 #include "pool_internal.h" 79 #include "pool_impl.h" 80 81 #define MIN(x, y) ((x) < (y) ? (x) : (y)) 82 #define MAX(x, y) ((x) > (y) ? (x) : (y)) 83 #define POA_IMPORTANCE_NUM 0 84 #define POA_SURPLUS_TO_DEFAULT_NUM 1 85 86 /* 87 * This resource specific structure is used to determine allocation of resources 88 * during resource set allocation. Each set will receive its min, plus 89 * some number of dealt resources based on the global allocation policy. 90 */ 91 typedef struct res_info { 92 pool_resource_t *ri_res; /* Resource set */ 93 uint64_t ri_min; /* Resource set's low watermark */ 94 uint64_t ri_max; /* Resource set's high watermark */ 95 uint64_t ri_oldsize; /* Size of resource set at the start */ 96 uint64_t ri_newsize; /* New resource set size allocated */ 97 uint64_t ri_pinned; /* Count of pinned resources in set */ 98 uint64_t ri_dealt; /* Count of resources dealt to set */ 99 int64_t ri_transfer; /* oldsize - newsize */ 100 /* The signed quantity of resources */ 101 /* to tranfer into or out of this */ 102 /* resource set */ 103 /* + transfer: tranfer resources out */ 104 /* - transfer: tranfer resources in */ 105 } res_info_t; 106 107 /* 108 * diff_and_fix operations 109 */ 110 static int commit_create(pool_conf_t *, pool_elem_t **); 111 static int commit_delete(pool_elem_t *); 112 static int commit_update(pool_elem_t *, pool_elem_t *, int); 113 114 /* 115 * configuration commit processing 116 */ 117 static int diff_and_fix(pool_conf_t *, pool_conf_t *); 118 static int process_elem_lt(pool_elem_t *, pool_conf_t *); 119 static int process_elem_gt(pool_elem_t *, pool_conf_t *, 120 pool_conf_t *); 121 static int process_lists(int, pool_conf_t *, 122 pool_conf_t *, int); 123 static pool_elem_t **get_elem_list(const pool_conf_t *, int, uint_t *); 124 static int share_resources(pool_conf_t *); 125 static int resource_allocate(const char *, pool_resource_t **, 126 uint_t); 127 static int resource_allocate_default(pool_resource_t **, uint_t); 128 static int pset_allocate_imp(pool_resource_t **, uint_t); 129 static int resource_compare_by_descending_importance(const void *, 130 const void *); 131 static int compute_size_to_transfer(const void *, const void *); 132 static int set_importance_cb(pool_conf_t *, pool_t *, void *); 133 static int unset_importance_cb(pool_conf_t *, pool_t *, void *); 134 static int add_importance_props(pool_conf_t *); 135 static int remove_importance_props(pool_conf_t *); 136 static int clone_element(pool_conf_t *, pool_elem_t *, 137 const char *, pool_value_t *, void *); 138 static int clean_element(pool_conf_t *, pool_elem_t *, 139 const char *, pool_value_t *, void *); 140 141 /* 142 * commit_create() is used to create a configuration element upon the 143 * system. Since only pools and resource actually need to perform any 144 * action, other elements are ignored as a no-op. 145 */ 146 static int 147 commit_create(pool_conf_t *conf, pool_elem_t **e1) 148 { 149 pool_resource_t *res; 150 pool_t *pool; 151 const char *res_type; 152 pool_elem_t *src = *e1; 153 uint64_t smin, smax, dmax; 154 pool_value_t val = POOL_VALUE_INITIALIZER; 155 char *name; 156 157 switch (pool_elem_class(src)) { 158 case PEC_SYSTEM: /* NO-OP */ 159 break; 160 case PEC_POOL: 161 name = elem_get_name(src); 162 if ((pool = pool_create(conf, name)) == NULL) { 163 free(name); 164 return (PO_FAIL); 165 } 166 free(name); 167 /* 168 * Now copy the properties from the original pool to the 169 * new one 170 */ 171 if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(pool), 172 clone_element) != PO_SUCCESS) 173 return (PO_FAIL); 174 /* 175 * Add a pointer to the src element which can be 176 * updated with a sys_id when the sys_id is allocated 177 * to the created element. 178 */ 179 pool_set_pair(TO_ELEM(pool), src); 180 *e1 = TO_ELEM(pool); 181 break; 182 case PEC_RES_COMP: 183 case PEC_RES_AGG: 184 name = elem_get_name(src); 185 res_type = pool_elem_class_string(src); 186 if ((res = pool_resource_create(conf, res_type, name)) == 187 NULL) { 188 free(name); 189 return (PO_FAIL); 190 } 191 free(name); 192 /* 193 * Need to do some ordering of property updates. 194 * Compare the values of source min/max and 195 * destination min/max. If smin < dmax then update the 196 * smin first, else update the max first. 197 */ 198 if (resource_get_min(pool_elem_res(src), &smin) != PO_SUCCESS || 199 resource_get_max(pool_elem_res(src), &smax) != PO_SUCCESS || 200 resource_get_max(res, &dmax) != PO_SUCCESS) 201 return (PO_FAIL); 202 if (smin < dmax) { 203 pool_value_set_uint64(&val, smin); 204 if (pool_put_ns_property(TO_ELEM(res), c_min_prop, 205 &val) != PO_SUCCESS) 206 return (PO_FAIL); 207 } else { 208 pool_value_set_uint64(&val, smax); 209 if (pool_put_ns_property(TO_ELEM(res), c_max_prop, 210 &val) != PO_SUCCESS) 211 return (PO_FAIL); 212 } 213 /* 214 * Now copy the properties from the original resource 215 * to the new one 216 */ 217 if (pool_walk_properties(TO_CONF(src), src, TO_ELEM(res), 218 clone_element) != PO_SUCCESS) 219 return (PO_FAIL); 220 /* 221 * Add a pointer to the src element which can be 222 * updated with a sys_id when the sys_id is allocated 223 * to the created element. 224 */ 225 pool_set_pair(TO_ELEM(res), src); 226 *e1 = TO_ELEM(res); 227 break; 228 case PEC_COMP: /* NO-OP */ 229 break; 230 default: 231 return (PO_FAIL); 232 } 233 return (PO_SUCCESS); 234 } 235 236 237 /* 238 * commit_delete() is used to delete a configuration element upon the 239 * system. Since only pools and resources actually need to perform 240 * any action, other elements are ignored as a no-op. 241 */ 242 static int 243 commit_delete(pool_elem_t *pe) 244 { 245 pool_resource_t *res; 246 pool_t *pool; 247 int ret = 0; 248 249 switch (pool_elem_class(pe)) { 250 case PEC_SYSTEM: /* NO-OP */ 251 break; 252 case PEC_POOL: 253 pool = pool_elem_pool(pe); 254 ret = pool_destroy(TO_CONF(pe), pool); 255 break; 256 case PEC_RES_COMP: 257 case PEC_RES_AGG: 258 res = pool_elem_res(pe); 259 ret = pool_resource_destroy(TO_CONF(pe), res); 260 break; 261 case PEC_COMP: /* NO-OP */ 262 break; 263 default: 264 return (PO_FAIL); 265 } 266 return (ret); 267 } 268 269 /* 270 * commit_update() is used to update a configuration element upon the 271 * system or in a static configuration file. The pass parameter 272 * governs whether properties are being updated or associations. In 273 * pass 0, properties are updated. If the element is of class 274 * PEC_COMP, then make sure that the element in the static 275 * configuration file is correctly located before proceeding with the 276 * update. Then, the element in the dynamic configuration file is 277 * updated. In pass 1, ie. pass != 0, any pool components have their 278 * associations updated in the dynamic configuration. 279 */ 280 static int 281 commit_update(pool_elem_t *e1, pool_elem_t *e2, int pass) 282 { 283 if (pass == 0) { 284 pool_resource_t *res1; 285 pool_resource_t *res2; 286 if (pool_elem_class(e1) == PEC_COMP) { 287 res1 = pool_get_owning_resource(TO_CONF(e1), 288 pool_elem_comp(e1)); 289 res2 = pool_get_owning_resource(TO_CONF(e2), 290 pool_elem_comp(e2)); 291 if (pool_elem_compare_name(TO_ELEM(res1), 292 TO_ELEM(res2)) != 0) { 293 char *name; 294 const pool_resource_t *newres; 295 pool_component_t *comps[2] = { NULL }; 296 297 comps[0] = pool_elem_comp(e2); 298 name = elem_get_name(TO_ELEM(res2)); 299 newres = pool_get_resource(TO_CONF(e2), 300 pool_elem_class_string(TO_ELEM(res1)), 301 name); 302 free(name); 303 assert(newres); 304 #ifdef DEBUG 305 dprintf("transferring: res, comp\n"); 306 pool_elem_dprintf(TO_ELEM(newres)); 307 pool_elem_dprintf(e2); 308 #endif /* DEBUG */ 309 (void) pool_resource_xtransfer(TO_CONF(e2), 310 res2, (pool_resource_t *)newres, comps); 311 } 312 } 313 if (pool_walk_properties(TO_CONF(e2), e2, NULL, 314 clean_element) != PO_SUCCESS) { 315 return (PO_FAIL); 316 } 317 /* 318 * Need to do some ordering of property updates if the 319 * element to be updated is a resource. Compare the 320 * values of source min/max and destination 321 * min/max. If smin < dmax then update the smin first, 322 * else update the max first. 323 */ 324 if (pool_elem_class(e1) == PEC_RES_COMP || 325 pool_elem_class(e1) == PEC_RES_AGG) { 326 uint64_t smin, smax, dmax; 327 pool_value_t val = POOL_VALUE_INITIALIZER; 328 329 if (resource_get_min(pool_elem_res(e1), &smin) != 330 PO_SUCCESS || 331 resource_get_max(pool_elem_res(e1), &smax) != 332 PO_SUCCESS || 333 resource_get_max(pool_elem_res(e2), &dmax) != 334 PO_SUCCESS) 335 return (PO_FAIL); 336 if (smin < dmax) { 337 pool_value_set_uint64(&val, smin); 338 if (pool_put_ns_property(e2, c_min_prop, 339 &val) != PO_SUCCESS) 340 return (PO_FAIL); 341 } else { 342 pool_value_set_uint64(&val, smax); 343 if (pool_put_ns_property(e2, c_max_prop, 344 &val) != PO_SUCCESS) 345 return (PO_FAIL); 346 } 347 } 348 /* 349 * This next couple of steps needs some 350 * explanation. The first walk, copies all the 351 * properties that are writeable from the static 352 * configuration to the dynamic configuration. The 353 * second walk copies all properties (writeable or 354 * not) from the dynamic configuration element back to 355 * the static configuration element. This ensures that 356 * updates from the static configuration element are 357 * correctly applied to the dynamic configuration and 358 * then the static configuration element is updated 359 * with the latest values of the read-only xproperties 360 * from the dynamic configuration element. The 361 * enforcing of permisssions is performed in 362 * clone_element by its choice of property 363 * manipulation function. 364 */ 365 if (pool_walk_properties(TO_CONF(e1), e1, e2, clone_element) != 366 PO_SUCCESS) { 367 return (PO_FAIL); 368 } 369 if (pool_walk_properties(TO_CONF(e2), e2, e1, clone_element) != 370 PO_SUCCESS) { 371 return (PO_FAIL); 372 } 373 } else { 374 if (pool_elem_class(e1) == PEC_POOL) { 375 pool_resource_t **rs; 376 uint_t nelem; 377 int i; 378 pool_value_t val = POOL_VALUE_INITIALIZER; 379 pool_value_t *pvals[] = { NULL, NULL }; 380 381 pvals[0] = &val; 382 if (pool_value_set_string(&val, "pset") != PO_SUCCESS || 383 pool_value_set_name(&val, c_type) != PO_SUCCESS) 384 return (PO_FAIL); 385 if ((rs = pool_query_pool_resources(TO_CONF(e1), 386 pool_elem_pool(e1), &nelem, pvals)) != NULL) { 387 for (i = 0; i < nelem; i++) { 388 const pool_resource_t *tgt_res; 389 char *res_name = 390 elem_get_name(TO_ELEM(rs[i])); 391 392 if ((tgt_res = pool_get_resource( 393 TO_CONF(e2), pool_elem_class_string( 394 TO_ELEM(rs[i])), res_name)) == 395 NULL) { 396 tgt_res = get_default_resource( 397 rs[i]); 398 } 399 free(res_name); 400 if (pool_associate(TO_CONF(e2), 401 pool_elem_pool(e2), tgt_res) != 402 PO_SUCCESS) { 403 free(rs); 404 return (PO_FAIL); 405 } 406 } 407 free(rs); 408 } 409 } 410 } 411 return (PO_SUCCESS); 412 } 413 414 /* 415 * diff_and_fix() works out the differences between two configurations 416 * and modifies the state of the system to match the operations 417 * required to bring the two configurations into sync. 418 * 419 * Returns PO_SUCCESS/PO_FAIL. 420 */ 421 static int 422 diff_and_fix(pool_conf_t *stc, pool_conf_t *dyn) 423 { 424 /* 425 * The ordering of the operations is significant, we must 426 * process the system element, then the pools elements, then 427 * the resource elements, then the pools elements again and 428 * finally the resource components. 429 * 430 * TODO 431 * PEC_RES_COMP are the only type of resources 432 * currently. When PEC_RES_AGG resources are added they must 433 * also be processed. 434 */ 435 if (process_lists(PEC_SYSTEM, stc, dyn, 0) != PO_SUCCESS) { 436 return (PO_FAIL); 437 } 438 if (process_lists(PEC_POOL, stc, dyn, 0) != PO_SUCCESS) { 439 return (PO_FAIL); 440 } 441 if (process_lists(PEC_RES_COMP, stc, dyn, 0) != PO_SUCCESS) { 442 return (PO_FAIL); 443 } 444 if (process_lists(PEC_COMP, stc, dyn, 0) != PO_SUCCESS) { 445 return (PO_FAIL); 446 } 447 if (process_lists(PEC_POOL, stc, dyn, 1) != PO_SUCCESS) { 448 return (PO_FAIL); 449 } 450 /* 451 * Share the resources. It has to be called for both 452 * configurations to ensure that the configurations still look 453 * the same. 454 */ 455 if (share_resources(dyn) != PO_SUCCESS) { 456 return (PO_FAIL); 457 } 458 if (share_resources(stc) != PO_SUCCESS) { 459 return (PO_FAIL); 460 } 461 return (PO_SUCCESS); 462 } 463 464 static int 465 process_elem_lt(pool_elem_t *pe, pool_conf_t *dyn) 466 { 467 if (pool_elem_class(pe) == PEC_COMP) { 468 if (pool_component_destroy(pool_elem_comp(pe)) == PO_FAIL) { 469 return (PO_FAIL); 470 } 471 } else if (! elem_is_default(pe)) { 472 if (commit_create(dyn, &pe) != PO_SUCCESS) { 473 return (PO_FAIL); 474 } 475 } 476 return (PO_SUCCESS); 477 } 478 479 static int 480 process_elem_gt(pool_elem_t *pe, pool_conf_t *stc, pool_conf_t *dyn) 481 { 482 if (pool_elem_class(pe) == PEC_COMP) { 483 pool_resource_t *owner; 484 const pool_resource_t *parent_res; 485 pool_value_t val = POOL_VALUE_INITIALIZER; 486 const pool_component_t *newcomp; 487 const char *resname; 488 const char *restype; 489 /* 490 * I have to find the right parent in the static 491 * configuration. It may not exist, in which case it's 492 * correct to put it in the default 493 */ 494 owner = pool_get_owning_resource(dyn, 495 pool_elem_comp(pe)); 496 if (pool_get_ns_property(TO_ELEM(owner), "name", &val) == 497 POC_INVAL) 498 return (PO_FAIL); 499 500 if (pool_value_get_string(&val, &resname) == PO_FAIL) 501 return (PO_FAIL); 502 503 if ((resname = strdup(resname)) == NULL) 504 return (PO_FAIL); 505 506 restype = pool_elem_class_string(TO_ELEM(owner)); 507 parent_res = pool_get_resource(stc, restype, resname); 508 free((void *)resname); 509 if (parent_res == NULL) 510 parent_res = resource_by_sysid(stc, PS_NONE, restype); 511 /* 512 * Now need to make a copy of the component in the 513 * dynamic configuration in the static configuration. 514 */ 515 if ((newcomp = pool_component_create(stc, parent_res, 516 elem_get_sysid(pe))) == NULL) 517 return (PO_FAIL); 518 519 if (pool_walk_properties(TO_CONF(pe), pe, TO_ELEM(newcomp), 520 clone_element) != PO_SUCCESS) 521 return (PO_FAIL); 522 } else if (elem_is_default(pe)) { 523 pool_resource_t *newres; 524 pool_t *newpool; 525 char *name; 526 527 if ((name = elem_get_name(pe)) == NULL) 528 return (PO_FAIL); 529 switch (pool_elem_class(pe)) { 530 case PEC_POOL: 531 if ((newpool = pool_create(stc, name)) == NULL) { 532 free(name); 533 return (PO_FAIL); 534 } 535 free(name); 536 if (pool_walk_properties(TO_CONF(pe), pe, 537 TO_ELEM(newpool), clone_element) != PO_SUCCESS) 538 return (PO_FAIL); 539 break; 540 case PEC_RES_AGG: 541 case PEC_RES_COMP: 542 if ((newres = pool_resource_create(stc, 543 pool_elem_class_string(pe), name)) == 544 NULL) { 545 free(name); 546 return (PO_FAIL); 547 } 548 free(name); 549 if (pool_walk_properties(TO_CONF(pe), pe, 550 TO_ELEM(newres), clone_element) != PO_SUCCESS) 551 return (PO_FAIL); 552 break; 553 default: 554 free(name); 555 break; 556 } 557 } else { 558 if (commit_delete(pe) != PO_SUCCESS) 559 return (PO_FAIL); 560 } 561 return (PO_SUCCESS); 562 } 563 564 /* 565 * This function compares the elements of the supplied type in the 566 * static and dynamic configurations supplied. The lists of elements 567 * are compared and used to create, delete and updated elements in 568 * both the static and dynamic configurations. The pass parameter is 569 * used to indicate to commit_update() whether property updates or 570 * association updates should be performed. 571 */ 572 static int 573 process_lists(int type, pool_conf_t *stc, pool_conf_t *dyn, int pass) 574 { 575 uint_t stc_nelem = 0, dyn_nelem = 0; 576 pool_elem_t **stc_elems, **dyn_elems; 577 int i, j; 578 int status = PO_SUCCESS; 579 580 if ((stc_elems = get_elem_list(stc, type, &stc_nelem)) == NULL) 581 return (PO_FAIL); 582 583 qsort(stc_elems, stc_nelem, sizeof (pool_elem_t *), 584 qsort_elem_compare); 585 586 if ((dyn_elems = get_elem_list(dyn, type, &dyn_nelem)) == NULL) { 587 free(stc_elems); 588 return (PO_FAIL); 589 } 590 591 qsort(dyn_elems, dyn_nelem, sizeof (pool_elem_t *), 592 qsort_elem_compare); 593 /* 594 * Step through and do the updating, remember that we are 595 * comparing using the compare function for the configuration 596 * and that is fixed. 597 */ 598 i = j = 0; 599 while (status == PO_SUCCESS && i < stc_nelem && j < dyn_nelem) { 600 int compare; 601 /* 602 * We are going to do this by stepping through the static 603 * list first. 604 */ 605 if (elem_is_default(stc_elems[i]) && 606 elem_is_default(dyn_elems[j])) 607 compare = 0; 608 else 609 compare = pool_elem_compare_name(stc_elems[i], 610 dyn_elems[j]); 611 if (compare < 0) { 612 status = process_elem_lt(stc_elems[i], dyn); 613 i++; 614 } else if (compare > 0) { 615 status = process_elem_gt(dyn_elems[j], stc, dyn); 616 j++; 617 } else { /* compare == 0 */ 618 if (commit_update(stc_elems[i], dyn_elems[j], pass) 619 != PO_SUCCESS) { 620 status = PO_FAIL; 621 } 622 i++; 623 j++; 624 } 625 } 626 if (status == PO_FAIL) { 627 free(stc_elems); 628 free(dyn_elems); 629 return (PO_FAIL); 630 } 631 while (status == PO_SUCCESS && i < stc_nelem) { 632 status = process_elem_lt(stc_elems[i], dyn); 633 i++; 634 } 635 if (status == PO_FAIL) { 636 free(stc_elems); 637 free(dyn_elems); 638 return (PO_FAIL); 639 } 640 while (status == PO_SUCCESS && j < dyn_nelem) { 641 status = process_elem_gt(dyn_elems[j], stc, dyn); 642 j++; 643 } 644 free(stc_elems); 645 free(dyn_elems); 646 return (status); 647 } 648 649 /* 650 * get_elem_list() returns a list of pool_elem_t's. The size of the 651 * list is written into nelem. The list contains elements of all types 652 * that pools is interested in: i.e. system, pool, resources and 653 * resource components. It is the caller's responsibility to free the 654 * list when it is finished with. 655 * 656 * The array of pointers returned by the type specific query can be 657 * safely cast to be an array of pool_elem_t pointers. In the case of 658 * PEC_RES_COMP some additional processing is required to qualify the 659 * list of elements. 660 * 661 * Returns a pointer to a list of pool_elem_t's or NULL on failure. 662 */ 663 static pool_elem_t ** 664 get_elem_list(const pool_conf_t *conf, int type, uint_t *nelem) 665 { 666 pool_resource_t **rl; 667 pool_t **pl; 668 pool_component_t **cl; 669 pool_elem_t **elems = NULL; 670 int i; 671 672 switch (type) { 673 case PEC_SYSTEM: 674 if ((elems = malloc(sizeof (pool_elem_t *))) == NULL) 675 return (NULL); 676 *nelem = 1; 677 elems[0] = pool_conf_to_elem(conf); 678 break; 679 case PEC_POOL: 680 if ((pl = pool_query_pools(conf, nelem, NULL)) != NULL) { 681 elems = (pool_elem_t **)pl; 682 } 683 break; 684 case PEC_RES_COMP: 685 if ((rl = pool_query_resources(conf, nelem, NULL)) != NULL) { 686 int j = 0; 687 elems = (pool_elem_t **)rl; 688 for (i = 0; i < *nelem; i++) { 689 if (pool_elem_class(TO_ELEM(rl[i])) == 690 PEC_RES_COMP) 691 elems[j++] = TO_ELEM(rl[i]); 692 } 693 *nelem = j; 694 } 695 break; 696 case PEC_COMP: 697 if ((cl = pool_query_components(conf, nelem, NULL)) != NULL) { 698 elems = (pool_elem_t **)cl; 699 } 700 break; 701 default: 702 abort(); 703 break; 704 } 705 return (elems); 706 } 707 708 /* 709 * share_resources() sets up the allocation of resources by each 710 * provider. Firstly all resources are updated with the importance of 711 * each pool, then each resource provider is invoked in turn with a 712 * list of it's own resources. Finally, the pool importance details 713 * are removed from the resources. 714 * 715 * Returns PO_SUCCESS/PO_FAIL 716 */ 717 static int 718 share_resources(pool_conf_t *conf) 719 { 720 pool_resource_t **resources; 721 uint_t nelem; 722 pool_value_t *props[] = { NULL, NULL }; 723 pool_value_t val = POOL_VALUE_INITIALIZER; 724 725 props[0] = &val; 726 727 /* 728 * Call an allocation function for each type of supported resource. 729 * This function is responsible for "sharing" resources to resource 730 * sets as determined by the system.allocate-method. 731 */ 732 733 if (pool_value_set_string(props[0], "pset") != PO_SUCCESS || 734 pool_value_set_name(props[0], c_type) != PO_SUCCESS) 735 return (PO_FAIL); 736 737 if (add_importance_props(conf) != PO_SUCCESS) { 738 (void) remove_importance_props(conf); 739 return (PO_FAIL); 740 } 741 742 if ((resources = pool_query_resources(conf, &nelem, props)) != NULL) { 743 /* 744 * 'pool.importance' defines the importance of a pool; 745 * resources inherit the importance of the pool that 746 * is associated with them. If more than one pool is 747 * associated with a resource, the importance of the 748 * resource is the maximum importance of all 749 * associated pools. Use '_importance' on resources 750 * to determine who gets extra. 751 */ 752 if (resource_allocate("pset", resources, nelem) != PO_SUCCESS) { 753 free(resources); 754 (void) remove_importance_props(conf); 755 return (PO_FAIL); 756 } 757 } 758 free(resources); 759 (void) remove_importance_props(conf); 760 return (PO_SUCCESS); 761 } 762 763 764 /* 765 * Work out which allocation method to use based on the value of the 766 * system.allocate-method property. 767 */ 768 int 769 resource_allocate(const char *type, pool_resource_t **res, uint_t nelem) 770 { 771 pool_elem_t *pe; 772 const char *method_name; 773 uint64_t method; 774 pool_value_t val = POOL_VALUE_INITIALIZER; 775 int ret; 776 777 pe = pool_conf_to_elem(TO_CONF(TO_ELEM(res[0]))); 778 779 if (pool_get_ns_property(pe, "allocate-method", &val) != POC_STRING) 780 method_name = POA_IMPORTANCE; 781 else { 782 (void) pool_value_get_string(&val, &method_name); 783 } 784 if (strcmp(POA_IMPORTANCE, method_name) != 0) { 785 if (strcmp(POA_SURPLUS_TO_DEFAULT, method_name) != 0) { 786 pool_seterror(POE_INVALID_CONF); 787 return (PO_FAIL); 788 } else { 789 method = POA_SURPLUS_TO_DEFAULT_NUM; 790 } 791 } else { 792 method = POA_IMPORTANCE_NUM; 793 } 794 switch (method) { 795 case POA_IMPORTANCE_NUM: 796 /* 797 * TODO: Add support for new resource types 798 */ 799 switch (pool_resource_elem_class_from_string(type)) { 800 case PREC_PSET: 801 ret = pset_allocate_imp(res, nelem); 802 break; 803 default: 804 ret = PO_FAIL; 805 break; 806 } 807 break; 808 case POA_SURPLUS_TO_DEFAULT_NUM: 809 ret = resource_allocate_default(res, nelem); 810 break; 811 } 812 813 return (ret); 814 } 815 816 /* 817 * Each set will get its minimum, however if there is more than the 818 * total minimum available, then leave this in the default set. 819 */ 820 int 821 resource_allocate_default(pool_resource_t **res, uint_t nelem) 822 { 823 res_info_t *res_info; 824 uint_t j; 825 pool_resource_t *default_res = NULL; 826 827 if (nelem == 1) 828 return (PO_SUCCESS); 829 830 if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) { 831 return (PO_FAIL); 832 } 833 834 /* Load current resource values. */ 835 for (j = 0; j < nelem; j++) { 836 837 if (default_res == NULL && 838 resource_is_default(res[j]) == PO_TRUE) 839 default_res = res[j]; 840 841 if (resource_get_max(res[j], 842 &res_info[j].ri_max) == PO_FAIL || 843 resource_get_min(res[j], 844 &res_info[j].ri_min) == PO_FAIL || 845 resource_get_size(res[j], 846 &res_info[j].ri_oldsize) == PO_FAIL || 847 resource_get_pinned(res[j], 848 &res_info[j].ri_pinned) == PO_FAIL) { 849 free(res_info); 850 return (PO_FAIL); 851 } 852 res_info[j].ri_res = res[j]; 853 } 854 855 /* 856 * Firstly, for all resources that have size greater than min, 857 * transfer all movable size above min to the default resource. 858 */ 859 for (j = 0; j < nelem; j++) { 860 861 uint64_t real_min; 862 863 /* compute the real minimum number of resources */ 864 real_min = MAX(res_info[j].ri_pinned, res_info[j].ri_min); 865 if (res_info[j].ri_res != default_res && 866 res_info[j].ri_oldsize > real_min) { 867 868 uint64_t num; 869 870 num = res_info[j].ri_oldsize - real_min; 871 if (pool_resource_transfer( 872 TO_CONF(TO_ELEM(default_res)), 873 res_info[j].ri_res, default_res, num) != 874 PO_SUCCESS) { 875 free(res_info); 876 return (PO_FAIL); 877 } 878 } 879 } 880 /* 881 * Now, transfer resources below min from the default. 882 */ 883 for (j = 0; j < nelem; j++) { 884 /* 885 * We don't want to interfere with resources which are reserved 886 */ 887 if (res_info[j].ri_res != default_res && 888 res_info[j].ri_oldsize < res_info[j].ri_min) { 889 if (pool_resource_transfer( 890 TO_CONF(TO_ELEM(default_res)), 891 default_res, res_info[j].ri_res, 892 res_info[j].ri_min - res_info[j].ri_oldsize) != 893 PO_SUCCESS) { 894 free(res_info); 895 return (PO_FAIL); 896 } 897 } 898 } 899 free(res_info); 900 return (PO_SUCCESS); 901 } 902 903 /* 904 * Allocate cpus to pset resource sets, favoring sets with higher importance. 905 * 906 * Step 1: Sort resource sets by decreasing importance, and load each sets 907 * current size (oldsize), min, max, and number of pinned cpus. 908 * Compute the total number of cpus by totaling oldsize. 909 * 910 * Step 2: Compute the newsize for each set: 911 * 912 * Give each set its min number of cpus. This min may be greater than 913 * its pset.min due to pinned cpus. If there are more cpus than the total 914 * of all mins, then the surplus cpus are dealt round-robin to all sets 915 * (up to their max) in order of decreasing importance. A set may be 916 * skipped during dealing because it started with more than its min due to 917 * pinned cpus. The dealing stops when there are no more cpus or all 918 * sets are at their max. If all sets are at their max, any remaining cpus 919 * are given to the default set. 920 * 921 * Step 3: Transfer cpus from sets with (oldsize > newsize) to sets with 922 * (oldsize < newsize). 923 */ 924 int 925 pset_allocate_imp(pool_resource_t **res, uint_t nelem) 926 { 927 res_info_t *res_info; 928 res_info_t *default_res_info; 929 const pool_resource_t *default_res = NULL; 930 uint64_t tot_resources = 0; /* total count of resources */ 931 uint64_t tot_min = 0; /* total of all resource set mins */ 932 uint64_t num_to_deal = 0; /* total resources above mins to deal */ 933 uint64_t sets_maxed = 0; /* number of resource sets dealt to */ 934 /* their max */ 935 uint64_t sets_finished = 0; /* number of resource sets that have */ 936 /* size == newsize */ 937 int donor, receiver; 938 int deal; 939 int j; 940 int ret = PO_SUCCESS; 941 942 /* 943 * Build list of res_info_t's 944 */ 945 if ((res_info = calloc(nelem, sizeof (res_info_t))) == NULL) { 946 pool_seterror(POE_SYSTEM); 947 return (PO_FAIL); 948 } 949 950 /* Order resources by importance, most important being first */ 951 qsort(res, nelem, sizeof (pool_resource_t *), 952 resource_compare_by_descending_importance); 953 954 for (j = 0; j < nelem; j++) { 955 956 /* Track which resource is the default */ 957 if (default_res == NULL && 958 resource_is_default(res[j]) == PO_TRUE) { 959 default_res = res[j]; 960 default_res_info = &(res_info[j]); 961 } 962 963 /* Load sets' current values */ 964 if (resource_get_max(res[j], &res_info[j].ri_max) == PO_FAIL || 965 resource_get_min(res[j], &res_info[j].ri_min) == PO_FAIL || 966 resource_get_size(res[j], &res_info[j].ri_oldsize) == 967 PO_FAIL || 968 resource_get_pinned(res[j], 969 &res_info[j].ri_pinned) == PO_FAIL) { 970 free(res_info); 971 return (PO_FAIL); 972 } 973 974 /* Start each set's newsize out at their min. */ 975 res_info[j].ri_newsize = res_info[j].ri_min; 976 977 /* pre-deal pinned resources that exceed min */ 978 if (res_info[j].ri_pinned > res_info[j].ri_min) { 979 res_info[j].ri_newsize = res_info[j].ri_pinned; 980 res_info[j].ri_dealt = 981 res_info[j].ri_newsize - res_info[j].ri_min; 982 } 983 res_info[j].ri_res = res[j]; 984 985 /* Compute total number of resources to deal out */ 986 tot_resources += res_info[j].ri_oldsize; 987 tot_min += res_info[j].ri_newsize; 988 989 #ifdef DEBUG 990 dprintf("res allocation details\n"); 991 pool_elem_dprintf(TO_ELEM(res[j])); 992 dprintf("size=%llu\n", res_info[j].ri_oldsize); 993 #endif /* DEBUG */ 994 } 995 996 num_to_deal = tot_resources - tot_min; 997 998 /* 999 * Deal one resource to each set, and then another, until all 1000 * resources are dealt or all sets are at their max. 1001 */ 1002 for (deal = 1; num_to_deal > 0 && sets_maxed < nelem; deal++) { 1003 for (j = 0; j < nelem; j++) { 1004 1005 /* 1006 * Skip this resource set if it has already been 1007 * pre-dealt a resource due to pinned resources. 1008 */ 1009 if (res_info[j].ri_dealt >= deal) 1010 continue; 1011 1012 if (res_info[j].ri_newsize < res_info[j].ri_max) { 1013 1014 res_info[j].ri_dealt++; 1015 res_info[j].ri_newsize++; 1016 if (res_info[j].ri_newsize == 1017 res_info[j].ri_max) 1018 sets_maxed++; 1019 1020 num_to_deal--; 1021 if (num_to_deal == 0) 1022 break; 1023 } 1024 } 1025 } 1026 1027 /* 1028 * If all resource sets are at their max, deal the remaining to the 1029 * default resource set. 1030 */ 1031 if ((sets_maxed == nelem) && (num_to_deal > 0)) { 1032 default_res_info->ri_dealt += num_to_deal; 1033 default_res_info->ri_newsize += num_to_deal; 1034 } 1035 1036 /* 1037 * Sort so that resource sets needing resources preced resource sets 1038 * that have extra resources. The sort function will also compute 1039 * The quantity of resources that need to be transfered into or out 1040 * of each set so that it's size == newsize. 1041 */ 1042 qsort(res_info, nelem, sizeof (res_info_t), 1043 compute_size_to_transfer); 1044 1045 /* 1046 * The donor index starts at the end of the resource set list and 1047 * walks up. The receiver index starts at the beginning of the 1048 * resource set list and walks down. Cpu's are transfered from the 1049 * donors to the receivers until all sets have transfer == 0). 1050 */ 1051 donor = nelem - 1; 1052 receiver = 0; 1053 1054 /* Number of sets with transfer == 0 */ 1055 sets_finished = 0; 1056 1057 /* Tranfer resources so that each set's size becomes newsize */ 1058 for (;;) { 1059 1060 uint64_t ntrans; 1061 if (donor == receiver) { 1062 if (res_info[donor].ri_transfer != 0) { 1063 free(res_info); 1064 return (PO_FAIL); 1065 } 1066 sets_finished++; 1067 break; 1068 } 1069 if (res_info[donor].ri_transfer == 0) { 1070 sets_finished++; 1071 donor--; 1072 continue; 1073 } 1074 if (res_info[receiver].ri_transfer == 0) { 1075 sets_finished++; 1076 receiver++; 1077 continue; 1078 } 1079 1080 /* Transfer resources from the donor set to the receiver */ 1081 ntrans = MIN(res_info[donor].ri_transfer, 1082 -res_info[receiver].ri_transfer); 1083 1084 if (pool_resource_transfer( 1085 TO_CONF(TO_ELEM(res_info[donor].ri_res)), 1086 res_info[donor].ri_res, res_info[receiver].ri_res, 1087 ntrans) != PO_SUCCESS) { 1088 free(res_info); 1089 return (PO_FAIL); 1090 } 1091 res_info[donor].ri_transfer -= ntrans; 1092 res_info[receiver].ri_transfer += ntrans; 1093 } 1094 1095 if (sets_finished != nelem) 1096 ret = PO_FAIL; 1097 1098 free(res_info); 1099 return (ret); 1100 } 1101 1102 /* 1103 * Used as a qsort parameter to help order resources in terms of their 1104 * importance, higher importance being first. 1105 */ 1106 int 1107 resource_compare_by_descending_importance(const void *arg1, const void *arg2) 1108 { 1109 pool_elem_t *elem1; 1110 pool_elem_t *elem2; 1111 pool_resource_t **res1 = (pool_resource_t **)arg1; 1112 pool_resource_t **res2 = (pool_resource_t **)arg2; 1113 pool_value_t val = POOL_VALUE_INITIALIZER; 1114 int64_t i1 = 0, i2 = 0; 1115 1116 elem1 = TO_ELEM(*res1); 1117 elem2 = TO_ELEM(*res2); 1118 1119 if (pool_get_property(TO_CONF(elem1), elem1, "_importance", &val) == 1120 POC_INT) 1121 (void) pool_value_get_int64(&val, &i1); 1122 1123 if (pool_get_property(TO_CONF(elem2), elem2, "_importance", &val) == 1124 POC_INT) 1125 (void) pool_value_get_int64(&val, &i2); 1126 return (i1 > i2 ? -1 : (i1 < i2 ? 1 : 0)); 1127 } 1128 1129 /* 1130 * Sort in increasing order so that resource sets with extra resources are at 1131 * the end and resource sets needing resources are at the beginning. 1132 */ 1133 int 1134 compute_size_to_transfer(const void *arg1, const void *arg2) 1135 { 1136 res_info_t *r1 = (res_info_t *)arg1, *r2 = (res_info_t *)arg2; 1137 r1->ri_transfer = (int64_t)r1->ri_oldsize - (int64_t)r1->ri_newsize; 1138 r2->ri_transfer = (int64_t)r2->ri_oldsize - (int64_t)r2->ri_newsize; 1139 return (r1->ri_transfer > r2->ri_transfer ? 1 : 1140 (r1->ri_transfer < r2->ri_transfer ? -1 : 0)); 1141 } 1142 1143 /* 1144 * set_importance_cb() is used to create "_importance" props on each 1145 * resource associated with a pool. 1146 * 1147 * Returns PO_SUCCESS/PO_FAIL 1148 */ 1149 /*ARGSUSED*/ 1150 static int 1151 set_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused) 1152 { 1153 pool_value_t val = POOL_VALUE_INITIALIZER; 1154 int64_t importance; 1155 pool_resource_t **res; 1156 uint_t nelem, i; 1157 1158 if (pool_get_property(conf, TO_ELEM(pool), "pool.importance", &val) != 1159 POC_INT) { 1160 pool_seterror(POE_INVALID_CONF); 1161 return (PO_FAIL); 1162 } 1163 (void) pool_value_get_int64(&val, &importance); 1164 if ((res = pool_query_pool_resources(conf, pool, &nelem, NULL)) == 1165 NULL) { 1166 return (PO_FAIL); 1167 } 1168 for (i = 0; res[i] != NULL; i++) { 1169 int64_t old_importance = INT64_MIN; 1170 pool_elem_t *elem = TO_ELEM(res[i]); 1171 1172 if (pool_get_property(conf, elem, "_importance", &val) == 1173 POC_INT) 1174 (void) pool_value_get_int64(&val, &old_importance); 1175 if (old_importance <= importance) { 1176 (void) pool_value_set_int64(&val, importance); 1177 (void) pool_put_property(conf, elem, "_importance", 1178 &val); 1179 } 1180 } 1181 free(res); 1182 return (PO_SUCCESS); 1183 } 1184 1185 /* 1186 * unset_importance_cb() is used to remove "_importance" props from 1187 * each resource associated with a pool. 1188 * 1189 * Returns PO_SUCCESS/PO_FAIL 1190 */ 1191 /*ARGSUSED*/ 1192 static int 1193 unset_importance_cb(pool_conf_t *conf, pool_t *pool, void *unused) 1194 { 1195 pool_resource_t **res; 1196 uint_t nelem, i; 1197 1198 if ((res = pool_query_pool_resources(conf, pool, &nelem, NULL)) == 1199 NULL) { 1200 return (PO_FAIL); 1201 } 1202 for (i = 0; res[i] != NULL; i++) { 1203 if (pool_rm_property(conf, TO_ELEM(res[i]), "_importance") == 1204 PO_FAIL) { 1205 free(res); 1206 return (PO_FAIL); 1207 } 1208 } 1209 free(res); 1210 return (PO_SUCCESS); 1211 } 1212 1213 /* 1214 * add_importance_props() is used to create "_importance" props on 1215 * each resource associated with a pool. 1216 * 1217 * Returns PO_SUCCESS/PO_FAIL 1218 */ 1219 static int 1220 add_importance_props(pool_conf_t *conf) 1221 { 1222 return (pool_walk_pools(conf, NULL, set_importance_cb)); 1223 } 1224 1225 /* 1226 * remove_importance_props() is used to remove "_importance" props on 1227 * each resource associated with a pool. 1228 * 1229 * Returns PO_SUCCESS/PO_FAIL 1230 */ 1231 static int 1232 remove_importance_props(pool_conf_t *conf) 1233 { 1234 return (pool_walk_pools(conf, NULL, unset_importance_cb)); 1235 } 1236 1237 /* 1238 * pool_conf_commit_sys() takes a configuration and modifies both the 1239 * supplied configuration and the dynamic configuration. The goal of 1240 * this modification is to generate a dynamic configuration which best 1241 * represents the constraints laid down in the static configuration 1242 * and to update the static configuration with the results of this 1243 * process. 1244 * 1245 * Returns PO_SUCCESS/PO_FAIL 1246 */ 1247 int 1248 pool_conf_commit_sys(pool_conf_t *conf, int validate) 1249 { 1250 pool_conf_t *dyn; 1251 1252 if ((dyn = pool_conf_alloc()) == NULL) 1253 return (PO_FAIL); 1254 if (pool_conf_open(dyn, pool_dynamic_location(), PO_RDWR) != 1255 PO_SUCCESS) { 1256 pool_conf_free(dyn); 1257 return (PO_FAIL); 1258 } 1259 if (validate == PO_TRUE) { 1260 if (pool_conf_validate(conf, POV_RUNTIME) != PO_SUCCESS) { 1261 (void) pool_conf_close(dyn); 1262 pool_conf_free(dyn); 1263 return (PO_FAIL); 1264 } 1265 } 1266 /* 1267 * Now try to make the two things "the same". 1268 */ 1269 if (diff_and_fix(conf, dyn) != PO_SUCCESS) { 1270 (void) pool_conf_close(dyn); 1271 pool_conf_free(dyn); 1272 pool_seterror(POE_INVALID_CONF); 1273 return (PO_FAIL); 1274 } 1275 if (dyn->pc_prov->pc_commit(dyn) != PO_SUCCESS) { 1276 (void) pool_conf_close(dyn); 1277 pool_conf_free(dyn); 1278 return (PO_FAIL); 1279 } 1280 (void) pool_conf_close(dyn); 1281 pool_conf_free(dyn); 1282 return (PO_SUCCESS); 1283 } 1284 1285 /* 1286 * Copies all properties from one element to another. If the property 1287 * is a readonly property, then don't copy it. 1288 */ 1289 /* ARGSUSED */ 1290 static int 1291 clone_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, 1292 pool_value_t *pv, void *user) 1293 { 1294 pool_elem_t *tgt = (pool_elem_t *)user; 1295 const pool_prop_t *prop; 1296 #ifdef DEBUG 1297 dprintf("Cloning %s from %s\n", 1298 pool_conf_location(TO_CONF(TO_ELEM(tgt))), 1299 pool_conf_location(TO_CONF(pe))); 1300 assert(TO_CONF(TO_ELEM(tgt)) != TO_CONF(pe)); 1301 dprintf("clone_element: Processing %s\n", name); 1302 pool_value_dprintf(pv); 1303 #endif /* DEBUG */ 1304 /* 1305 * Some properties should be ignored 1306 */ 1307 if ((prop = provider_get_prop(pe, name)) != NULL && 1308 prop_is_readonly(prop) == PO_TRUE) 1309 return (PO_SUCCESS); 1310 return (pool_put_property(TO_CONF(tgt), tgt, name, pv) == PO_FAIL); 1311 } 1312 1313 /* 1314 * Removes all properties from one element. Properties which are 1315 * managed by the configuration are ignored. 1316 */ 1317 /* ARGSUSED3 */ 1318 static int 1319 clean_element(pool_conf_t *conf, pool_elem_t *pe, const char *name, 1320 pool_value_t *pv, void *user) 1321 { 1322 const pool_prop_t *prop; 1323 /* 1324 * Some properties should be ignored 1325 */ 1326 if ((prop = provider_get_prop(pe, name)) != NULL && 1327 prop_is_optional(prop) == PO_FALSE) 1328 return (PO_SUCCESS); 1329 return (pool_rm_property(conf, (pool_elem_t *)pe, name) == PO_FAIL); 1330 } 1331