1 2 #ifdef __KERNEL__ 3 # include <linux/string.h> 4 # include <linux/slab.h> 5 # include <linux/bug.h> 6 # include <linux/kernel.h> 7 # ifndef dprintk 8 # define dprintk(args...) 9 # endif 10 #else 11 # include <string.h> 12 # include <stdio.h> 13 # include <stdlib.h> 14 # include <assert.h> 15 # define BUG_ON(x) assert(!(x)) 16 # define dprintk(args...) /* printf(args) */ 17 # define kmalloc(x, f) malloc(x) 18 # define kfree(x) free(x) 19 #endif 20 21 #include <linux/crush/crush.h> 22 #include <linux/crush/hash.h> 23 #include <linux/crush/mapper.h> 24 25 /* 26 * Implement the core CRUSH mapping algorithm. 27 */ 28 29 /** 30 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size. 31 * @map: the crush_map 32 * @ruleset: the storage ruleset id (user defined) 33 * @type: storage ruleset type (user defined) 34 * @size: output set size 35 */ 36 int crush_find_rule(struct crush_map *map, int ruleset, int type, int size) 37 { 38 int i; 39 40 for (i = 0; i < map->max_rules; i++) { 41 if (map->rules[i] && 42 map->rules[i]->mask.ruleset == ruleset && 43 map->rules[i]->mask.type == type && 44 map->rules[i]->mask.min_size <= size && 45 map->rules[i]->mask.max_size >= size) 46 return i; 47 } 48 return -1; 49 } 50 51 52 /* 53 * bucket choose methods 54 * 55 * For each bucket algorithm, we have a "choose" method that, given a 56 * crush input @x and replica position (usually, position in output set) @r, 57 * will produce an item in the bucket. 58 */ 59 60 /* 61 * Choose based on a random permutation of the bucket. 62 * 63 * We used to use some prime number arithmetic to do this, but it 64 * wasn't very random, and had some other bad behaviors. Instead, we 65 * calculate an actual random permutation of the bucket members. 66 * Since this is expensive, we optimize for the r=0 case, which 67 * captures the vast majority of calls. 68 */ 69 static int bucket_perm_choose(struct crush_bucket *bucket, 70 int x, int r) 71 { 72 unsigned int pr = r % bucket->size; 73 unsigned int i, s; 74 75 /* start a new permutation if @x has changed */ 76 if (bucket->perm_x != x || bucket->perm_n == 0) { 77 dprintk("bucket %d new x=%d\n", bucket->id, x); 78 bucket->perm_x = x; 79 80 /* optimize common r=0 case */ 81 if (pr == 0) { 82 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) % 83 bucket->size; 84 bucket->perm[0] = s; 85 bucket->perm_n = 0xffff; /* magic value, see below */ 86 goto out; 87 } 88 89 for (i = 0; i < bucket->size; i++) 90 bucket->perm[i] = i; 91 bucket->perm_n = 0; 92 } else if (bucket->perm_n == 0xffff) { 93 /* clean up after the r=0 case above */ 94 for (i = 1; i < bucket->size; i++) 95 bucket->perm[i] = i; 96 bucket->perm[bucket->perm[0]] = 0; 97 bucket->perm_n = 1; 98 } 99 100 /* calculate permutation up to pr */ 101 for (i = 0; i < bucket->perm_n; i++) 102 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]); 103 while (bucket->perm_n <= pr) { 104 unsigned int p = bucket->perm_n; 105 /* no point in swapping the final entry */ 106 if (p < bucket->size - 1) { 107 i = crush_hash32_3(bucket->hash, x, bucket->id, p) % 108 (bucket->size - p); 109 if (i) { 110 unsigned int t = bucket->perm[p + i]; 111 bucket->perm[p + i] = bucket->perm[p]; 112 bucket->perm[p] = t; 113 } 114 dprintk(" perm_choose swap %d with %d\n", p, p+i); 115 } 116 bucket->perm_n++; 117 } 118 for (i = 0; i < bucket->size; i++) 119 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]); 120 121 s = bucket->perm[pr]; 122 out: 123 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id, 124 bucket->size, x, r, pr, s); 125 return bucket->items[s]; 126 } 127 128 /* uniform */ 129 static int bucket_uniform_choose(struct crush_bucket_uniform *bucket, 130 int x, int r) 131 { 132 return bucket_perm_choose(&bucket->h, x, r); 133 } 134 135 /* list */ 136 static int bucket_list_choose(struct crush_bucket_list *bucket, 137 int x, int r) 138 { 139 int i; 140 141 for (i = bucket->h.size-1; i >= 0; i--) { 142 __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i], 143 r, bucket->h.id); 144 w &= 0xffff; 145 dprintk("list_choose i=%d x=%d r=%d item %d weight %x " 146 "sw %x rand %llx", 147 i, x, r, bucket->h.items[i], bucket->item_weights[i], 148 bucket->sum_weights[i], w); 149 w *= bucket->sum_weights[i]; 150 w = w >> 16; 151 /*dprintk(" scaled %llx\n", w);*/ 152 if (w < bucket->item_weights[i]) 153 return bucket->h.items[i]; 154 } 155 156 BUG_ON(1); 157 return 0; 158 } 159 160 161 /* (binary) tree */ 162 static int height(int n) 163 { 164 int h = 0; 165 while ((n & 1) == 0) { 166 h++; 167 n = n >> 1; 168 } 169 return h; 170 } 171 172 static int left(int x) 173 { 174 int h = height(x); 175 return x - (1 << (h-1)); 176 } 177 178 static int right(int x) 179 { 180 int h = height(x); 181 return x + (1 << (h-1)); 182 } 183 184 static int terminal(int x) 185 { 186 return x & 1; 187 } 188 189 static int bucket_tree_choose(struct crush_bucket_tree *bucket, 190 int x, int r) 191 { 192 int n, l; 193 __u32 w; 194 __u64 t; 195 196 /* start at root */ 197 n = bucket->num_nodes >> 1; 198 199 while (!terminal(n)) { 200 /* pick point in [0, w) */ 201 w = bucket->node_weights[n]; 202 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r, 203 bucket->h.id) * (__u64)w; 204 t = t >> 32; 205 206 /* descend to the left or right? */ 207 l = left(n); 208 if (t < bucket->node_weights[l]) 209 n = l; 210 else 211 n = right(n); 212 } 213 214 return bucket->h.items[n >> 1]; 215 } 216 217 218 /* straw */ 219 220 static int bucket_straw_choose(struct crush_bucket_straw *bucket, 221 int x, int r) 222 { 223 int i; 224 int high = 0; 225 __u64 high_draw = 0; 226 __u64 draw; 227 228 for (i = 0; i < bucket->h.size; i++) { 229 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r); 230 draw &= 0xffff; 231 draw *= bucket->straws[i]; 232 if (i == 0 || draw > high_draw) { 233 high = i; 234 high_draw = draw; 235 } 236 } 237 return bucket->h.items[high]; 238 } 239 240 static int crush_bucket_choose(struct crush_bucket *in, int x, int r) 241 { 242 dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); 243 switch (in->alg) { 244 case CRUSH_BUCKET_UNIFORM: 245 return bucket_uniform_choose((struct crush_bucket_uniform *)in, 246 x, r); 247 case CRUSH_BUCKET_LIST: 248 return bucket_list_choose((struct crush_bucket_list *)in, 249 x, r); 250 case CRUSH_BUCKET_TREE: 251 return bucket_tree_choose((struct crush_bucket_tree *)in, 252 x, r); 253 case CRUSH_BUCKET_STRAW: 254 return bucket_straw_choose((struct crush_bucket_straw *)in, 255 x, r); 256 default: 257 BUG_ON(1); 258 return in->items[0]; 259 } 260 } 261 262 /* 263 * true if device is marked "out" (failed, fully offloaded) 264 * of the cluster 265 */ 266 static int is_out(struct crush_map *map, __u32 *weight, int item, int x) 267 { 268 if (weight[item] >= 0x10000) 269 return 0; 270 if (weight[item] == 0) 271 return 1; 272 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff) 273 < weight[item]) 274 return 0; 275 return 1; 276 } 277 278 /** 279 * crush_choose - choose numrep distinct items of given type 280 * @map: the crush_map 281 * @bucket: the bucket we are choose an item from 282 * @x: crush input value 283 * @numrep: the number of items to choose 284 * @type: the type of item to choose 285 * @out: pointer to output vector 286 * @outpos: our position in that vector 287 * @firstn: true if choosing "first n" items, false if choosing "indep" 288 * @recurse_to_leaf: true if we want one device under each item of given type 289 * @out2: second output vector for leaf items (if @recurse_to_leaf) 290 */ 291 static int crush_choose(struct crush_map *map, 292 struct crush_bucket *bucket, 293 __u32 *weight, 294 int x, int numrep, int type, 295 int *out, int outpos, 296 int firstn, int recurse_to_leaf, 297 int *out2) 298 { 299 int rep; 300 int ftotal, flocal; 301 int retry_descent, retry_bucket, skip_rep; 302 struct crush_bucket *in = bucket; 303 int r; 304 int i; 305 int item = 0; 306 int itemtype; 307 int collide, reject; 308 const int orig_tries = 5; /* attempts before we fall back to search */ 309 310 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 311 bucket->id, x, outpos, numrep); 312 313 for (rep = outpos; rep < numrep; rep++) { 314 /* keep trying until we get a non-out, non-colliding item */ 315 ftotal = 0; 316 skip_rep = 0; 317 do { 318 retry_descent = 0; 319 in = bucket; /* initial bucket */ 320 321 /* choose through intervening buckets */ 322 flocal = 0; 323 do { 324 collide = 0; 325 retry_bucket = 0; 326 r = rep; 327 if (in->alg == CRUSH_BUCKET_UNIFORM) { 328 /* be careful */ 329 if (firstn || numrep >= in->size) 330 /* r' = r + f_total */ 331 r += ftotal; 332 else if (in->size % numrep == 0) 333 /* r'=r+(n+1)*f_local */ 334 r += (numrep+1) * 335 (flocal+ftotal); 336 else 337 /* r' = r + n*f_local */ 338 r += numrep * (flocal+ftotal); 339 } else { 340 if (firstn) 341 /* r' = r + f_total */ 342 r += ftotal; 343 else 344 /* r' = r + n*f_local */ 345 r += numrep * (flocal+ftotal); 346 } 347 348 /* bucket choose */ 349 if (in->size == 0) { 350 reject = 1; 351 goto reject; 352 } 353 if (flocal >= (in->size>>1) && 354 flocal > orig_tries) 355 item = bucket_perm_choose(in, x, r); 356 else 357 item = crush_bucket_choose(in, x, r); 358 BUG_ON(item >= map->max_devices); 359 360 /* desired type? */ 361 if (item < 0) 362 itemtype = map->buckets[-1-item]->type; 363 else 364 itemtype = 0; 365 dprintk(" item %d type %d\n", item, itemtype); 366 367 /* keep going? */ 368 if (itemtype != type) { 369 BUG_ON(item >= 0 || 370 (-1-item) >= map->max_buckets); 371 in = map->buckets[-1-item]; 372 retry_bucket = 1; 373 continue; 374 } 375 376 /* collision? */ 377 for (i = 0; i < outpos; i++) { 378 if (out[i] == item) { 379 collide = 1; 380 break; 381 } 382 } 383 384 reject = 0; 385 if (recurse_to_leaf) { 386 if (item < 0) { 387 if (crush_choose(map, 388 map->buckets[-1-item], 389 weight, 390 x, outpos+1, 0, 391 out2, outpos, 392 firstn, 0, 393 NULL) <= outpos) 394 /* didn't get leaf */ 395 reject = 1; 396 } else { 397 /* we already have a leaf! */ 398 out2[outpos] = item; 399 } 400 } 401 402 if (!reject) { 403 /* out? */ 404 if (itemtype == 0) 405 reject = is_out(map, weight, 406 item, x); 407 else 408 reject = 0; 409 } 410 411 reject: 412 if (reject || collide) { 413 ftotal++; 414 flocal++; 415 416 if (collide && flocal < 3) 417 /* retry locally a few times */ 418 retry_bucket = 1; 419 else if (flocal < in->size + orig_tries) 420 /* exhaustive bucket search */ 421 retry_bucket = 1; 422 else if (ftotal < 20) 423 /* then retry descent */ 424 retry_descent = 1; 425 else 426 /* else give up */ 427 skip_rep = 1; 428 dprintk(" reject %d collide %d " 429 "ftotal %d flocal %d\n", 430 reject, collide, ftotal, 431 flocal); 432 } 433 } while (retry_bucket); 434 } while (retry_descent); 435 436 if (skip_rep) { 437 dprintk("skip rep\n"); 438 continue; 439 } 440 441 dprintk("CHOOSE got %d\n", item); 442 out[outpos] = item; 443 outpos++; 444 } 445 446 dprintk("CHOOSE returns %d\n", outpos); 447 return outpos; 448 } 449 450 451 /** 452 * crush_do_rule - calculate a mapping with the given input and rule 453 * @map: the crush_map 454 * @ruleno: the rule id 455 * @x: hash input 456 * @result: pointer to result vector 457 * @result_max: maximum result size 458 * @force: force initial replica choice; -1 for none 459 */ 460 int crush_do_rule(struct crush_map *map, 461 int ruleno, int x, int *result, int result_max, 462 int force, __u32 *weight) 463 { 464 int result_len; 465 int force_context[CRUSH_MAX_DEPTH]; 466 int force_pos = -1; 467 int a[CRUSH_MAX_SET]; 468 int b[CRUSH_MAX_SET]; 469 int c[CRUSH_MAX_SET]; 470 int recurse_to_leaf; 471 int *w; 472 int wsize = 0; 473 int *o; 474 int osize; 475 int *tmp; 476 struct crush_rule *rule; 477 int step; 478 int i, j; 479 int numrep; 480 int firstn; 481 482 BUG_ON(ruleno >= map->max_rules); 483 484 rule = map->rules[ruleno]; 485 result_len = 0; 486 w = a; 487 o = b; 488 489 /* 490 * determine hierarchical context of force, if any. note 491 * that this may or may not correspond to the specific types 492 * referenced by the crush rule. 493 */ 494 if (force >= 0 && 495 force < map->max_devices && 496 map->device_parents[force] != 0 && 497 !is_out(map, weight, force, x)) { 498 while (1) { 499 force_context[++force_pos] = force; 500 if (force >= 0) 501 force = map->device_parents[force]; 502 else 503 force = map->bucket_parents[-1-force]; 504 if (force == 0) 505 break; 506 } 507 } 508 509 for (step = 0; step < rule->len; step++) { 510 firstn = 0; 511 switch (rule->steps[step].op) { 512 case CRUSH_RULE_TAKE: 513 w[0] = rule->steps[step].arg1; 514 515 /* find position in force_context/hierarchy */ 516 while (force_pos >= 0 && 517 force_context[force_pos] != w[0]) 518 force_pos--; 519 /* and move past it */ 520 if (force_pos >= 0) 521 force_pos--; 522 523 wsize = 1; 524 break; 525 526 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN: 527 case CRUSH_RULE_CHOOSE_FIRSTN: 528 firstn = 1; 529 case CRUSH_RULE_CHOOSE_LEAF_INDEP: 530 case CRUSH_RULE_CHOOSE_INDEP: 531 BUG_ON(wsize == 0); 532 533 recurse_to_leaf = 534 rule->steps[step].op == 535 CRUSH_RULE_CHOOSE_LEAF_FIRSTN || 536 rule->steps[step].op == 537 CRUSH_RULE_CHOOSE_LEAF_INDEP; 538 539 /* reset output */ 540 osize = 0; 541 542 for (i = 0; i < wsize; i++) { 543 /* 544 * see CRUSH_N, CRUSH_N_MINUS macros. 545 * basically, numrep <= 0 means relative to 546 * the provided result_max 547 */ 548 numrep = rule->steps[step].arg1; 549 if (numrep <= 0) { 550 numrep += result_max; 551 if (numrep <= 0) 552 continue; 553 } 554 j = 0; 555 if (osize == 0 && force_pos >= 0) { 556 /* skip any intermediate types */ 557 while (force_pos && 558 force_context[force_pos] < 0 && 559 rule->steps[step].arg2 != 560 map->buckets[-1 - 561 force_context[force_pos]]->type) 562 force_pos--; 563 o[osize] = force_context[force_pos]; 564 if (recurse_to_leaf) 565 c[osize] = force_context[0]; 566 j++; 567 force_pos--; 568 } 569 osize += crush_choose(map, 570 map->buckets[-1-w[i]], 571 weight, 572 x, numrep, 573 rule->steps[step].arg2, 574 o+osize, j, 575 firstn, 576 recurse_to_leaf, c+osize); 577 } 578 579 if (recurse_to_leaf) 580 /* copy final _leaf_ values to output set */ 581 memcpy(o, c, osize*sizeof(*o)); 582 583 /* swap t and w arrays */ 584 tmp = o; 585 o = w; 586 w = tmp; 587 wsize = osize; 588 break; 589 590 591 case CRUSH_RULE_EMIT: 592 for (i = 0; i < wsize && result_len < result_max; i++) { 593 result[result_len] = w[i]; 594 result_len++; 595 } 596 wsize = 0; 597 break; 598 599 default: 600 BUG_ON(1); 601 } 602 } 603 return result_len; 604 } 605 606 607