1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2014-2016 Christoph Hellwig. 4 */ 5 6 #include <linux/vmalloc.h> 7 8 #include "blocklayout.h" 9 #include "../nfs4trace.h" 10 11 #define NFSDBG_FACILITY NFSDBG_PNFS_LD 12 13 static inline struct pnfs_block_extent * 14 ext_node(struct rb_node *node) 15 { 16 return rb_entry(node, struct pnfs_block_extent, be_node); 17 } 18 19 static struct pnfs_block_extent * 20 ext_tree_first(struct rb_root *root) 21 { 22 struct rb_node *node = rb_first(root); 23 return node ? ext_node(node) : NULL; 24 } 25 26 static struct pnfs_block_extent * 27 ext_tree_prev(struct pnfs_block_extent *be) 28 { 29 struct rb_node *node = rb_prev(&be->be_node); 30 return node ? ext_node(node) : NULL; 31 } 32 33 static struct pnfs_block_extent * 34 ext_tree_next(struct pnfs_block_extent *be) 35 { 36 struct rb_node *node = rb_next(&be->be_node); 37 return node ? ext_node(node) : NULL; 38 } 39 40 static inline sector_t 41 ext_f_end(struct pnfs_block_extent *be) 42 { 43 return be->be_f_offset + be->be_length; 44 } 45 46 static struct pnfs_block_extent * 47 __ext_tree_search(struct rb_root *root, sector_t start) 48 { 49 struct rb_node *node = root->rb_node; 50 struct pnfs_block_extent *be = NULL; 51 52 while (node) { 53 be = ext_node(node); 54 if (start < be->be_f_offset) 55 node = node->rb_left; 56 else if (start >= ext_f_end(be)) 57 node = node->rb_right; 58 else 59 return be; 60 } 61 62 if (be) { 63 if (start < be->be_f_offset) 64 return be; 65 66 if (start >= ext_f_end(be)) 67 return ext_tree_next(be); 68 } 69 70 return NULL; 71 } 72 73 static bool 74 ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2) 75 { 76 if (be1->be_state != be2->be_state) 77 return false; 78 if (be1->be_device != be2->be_device) 79 return false; 80 81 if (be1->be_f_offset + be1->be_length != be2->be_f_offset) 82 return false; 83 84 if (be1->be_state != PNFS_BLOCK_NONE_DATA && 85 (be1->be_v_offset + be1->be_length != be2->be_v_offset)) 86 return false; 87 88 if (be1->be_state == PNFS_BLOCK_INVALID_DATA && 89 be1->be_tag != be2->be_tag) 90 return false; 91 92 return true; 93 } 94 95 static struct pnfs_block_extent * 96 ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be) 97 { 98 struct pnfs_block_extent *left = ext_tree_prev(be); 99 100 if (left && ext_can_merge(left, be)) { 101 left->be_length += be->be_length; 102 rb_erase(&be->be_node, root); 103 nfs4_put_deviceid_node(be->be_device); 104 kfree(be); 105 return left; 106 } 107 108 return be; 109 } 110 111 static struct pnfs_block_extent * 112 ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) 113 { 114 struct pnfs_block_extent *right = ext_tree_next(be); 115 116 if (right && ext_can_merge(be, right)) { 117 be->be_length += right->be_length; 118 rb_erase(&right->be_node, root); 119 nfs4_put_deviceid_node(right->be_device); 120 kfree(right); 121 } 122 123 return be; 124 } 125 126 static void __ext_put_deviceids(struct list_head *head) 127 { 128 struct pnfs_block_extent *be, *tmp; 129 130 list_for_each_entry_safe(be, tmp, head, be_list) { 131 nfs4_put_deviceid_node(be->be_device); 132 kfree(be); 133 } 134 } 135 136 static void 137 __ext_tree_insert(struct rb_root *root, 138 struct pnfs_block_extent *new, bool merge_ok) 139 { 140 struct rb_node **p = &root->rb_node, *parent = NULL; 141 struct pnfs_block_extent *be; 142 143 while (*p) { 144 parent = *p; 145 be = ext_node(parent); 146 147 if (new->be_f_offset < be->be_f_offset) { 148 if (merge_ok && ext_can_merge(new, be)) { 149 be->be_f_offset = new->be_f_offset; 150 if (be->be_state != PNFS_BLOCK_NONE_DATA) 151 be->be_v_offset = new->be_v_offset; 152 be->be_length += new->be_length; 153 be = ext_try_to_merge_left(root, be); 154 goto free_new; 155 } 156 p = &(*p)->rb_left; 157 } else if (new->be_f_offset >= ext_f_end(be)) { 158 if (merge_ok && ext_can_merge(be, new)) { 159 be->be_length += new->be_length; 160 be = ext_try_to_merge_right(root, be); 161 goto free_new; 162 } 163 p = &(*p)->rb_right; 164 } else { 165 BUG(); 166 } 167 } 168 169 rb_link_node(&new->be_node, parent, p); 170 rb_insert_color(&new->be_node, root); 171 return; 172 free_new: 173 nfs4_put_deviceid_node(new->be_device); 174 kfree(new); 175 } 176 177 static int 178 __ext_tree_remove(struct rb_root *root, 179 sector_t start, sector_t end, struct list_head *tmp) 180 { 181 struct pnfs_block_extent *be; 182 sector_t len1 = 0, len2 = 0; 183 sector_t orig_v_offset; 184 sector_t orig_len; 185 186 be = __ext_tree_search(root, start); 187 if (!be) 188 return 0; 189 if (be->be_f_offset >= end) 190 return 0; 191 192 orig_v_offset = be->be_v_offset; 193 orig_len = be->be_length; 194 195 if (start > be->be_f_offset) 196 len1 = start - be->be_f_offset; 197 if (ext_f_end(be) > end) 198 len2 = ext_f_end(be) - end; 199 200 if (len2 > 0) { 201 if (len1 > 0) { 202 struct pnfs_block_extent *new; 203 204 new = kzalloc(sizeof(*new), GFP_ATOMIC); 205 if (!new) 206 return -ENOMEM; 207 208 be->be_length = len1; 209 210 new->be_f_offset = end; 211 if (be->be_state != PNFS_BLOCK_NONE_DATA) { 212 new->be_v_offset = 213 orig_v_offset + orig_len - len2; 214 } 215 new->be_length = len2; 216 new->be_state = be->be_state; 217 new->be_tag = be->be_tag; 218 new->be_device = nfs4_get_deviceid(be->be_device); 219 220 __ext_tree_insert(root, new, true); 221 } else { 222 be->be_f_offset = end; 223 if (be->be_state != PNFS_BLOCK_NONE_DATA) { 224 be->be_v_offset = 225 orig_v_offset + orig_len - len2; 226 } 227 be->be_length = len2; 228 } 229 } else { 230 if (len1 > 0) { 231 be->be_length = len1; 232 be = ext_tree_next(be); 233 } 234 235 while (be && ext_f_end(be) <= end) { 236 struct pnfs_block_extent *next = ext_tree_next(be); 237 238 rb_erase(&be->be_node, root); 239 list_add_tail(&be->be_list, tmp); 240 be = next; 241 } 242 243 if (be && be->be_f_offset < end) { 244 len1 = ext_f_end(be) - end; 245 be->be_f_offset = end; 246 if (be->be_state != PNFS_BLOCK_NONE_DATA) 247 be->be_v_offset += be->be_length - len1; 248 be->be_length = len1; 249 } 250 } 251 252 return 0; 253 } 254 255 int 256 ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new) 257 { 258 struct pnfs_block_extent *be; 259 struct rb_root *root; 260 int err = 0; 261 262 switch (new->be_state) { 263 case PNFS_BLOCK_READWRITE_DATA: 264 case PNFS_BLOCK_INVALID_DATA: 265 root = &bl->bl_ext_rw; 266 break; 267 case PNFS_BLOCK_READ_DATA: 268 case PNFS_BLOCK_NONE_DATA: 269 root = &bl->bl_ext_ro; 270 break; 271 default: 272 dprintk("invalid extent type\n"); 273 return -EINVAL; 274 } 275 276 spin_lock(&bl->bl_ext_lock); 277 retry: 278 be = __ext_tree_search(root, new->be_f_offset); 279 if (!be || be->be_f_offset >= ext_f_end(new)) { 280 __ext_tree_insert(root, new, true); 281 } else if (new->be_f_offset >= be->be_f_offset) { 282 if (ext_f_end(new) <= ext_f_end(be)) { 283 nfs4_put_deviceid_node(new->be_device); 284 kfree(new); 285 } else { 286 sector_t new_len = ext_f_end(new) - ext_f_end(be); 287 sector_t diff = new->be_length - new_len; 288 289 new->be_f_offset += diff; 290 new->be_v_offset += diff; 291 new->be_length = new_len; 292 goto retry; 293 } 294 } else if (ext_f_end(new) <= ext_f_end(be)) { 295 new->be_length = be->be_f_offset - new->be_f_offset; 296 __ext_tree_insert(root, new, true); 297 } else { 298 struct pnfs_block_extent *split; 299 sector_t new_len = ext_f_end(new) - ext_f_end(be); 300 sector_t diff = new->be_length - new_len; 301 302 split = kmemdup(new, sizeof(*new), GFP_ATOMIC); 303 if (!split) { 304 err = -EINVAL; 305 goto out; 306 } 307 308 split->be_length = be->be_f_offset - split->be_f_offset; 309 split->be_device = nfs4_get_deviceid(new->be_device); 310 __ext_tree_insert(root, split, true); 311 312 new->be_f_offset += diff; 313 new->be_v_offset += diff; 314 new->be_length = new_len; 315 goto retry; 316 } 317 out: 318 spin_unlock(&bl->bl_ext_lock); 319 return err; 320 } 321 322 static bool 323 __ext_tree_lookup(struct rb_root *root, sector_t isect, 324 struct pnfs_block_extent *ret) 325 { 326 struct rb_node *node; 327 struct pnfs_block_extent *be; 328 329 node = root->rb_node; 330 while (node) { 331 be = ext_node(node); 332 if (isect < be->be_f_offset) 333 node = node->rb_left; 334 else if (isect >= ext_f_end(be)) 335 node = node->rb_right; 336 else { 337 *ret = *be; 338 return true; 339 } 340 } 341 342 return false; 343 } 344 345 bool 346 ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, 347 struct pnfs_block_extent *ret, bool rw) 348 { 349 bool found = false; 350 351 spin_lock(&bl->bl_ext_lock); 352 if (!rw) 353 found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret); 354 if (!found) 355 found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret); 356 spin_unlock(&bl->bl_ext_lock); 357 358 return found; 359 } 360 361 int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, 362 sector_t start, sector_t end) 363 { 364 int err, err2; 365 LIST_HEAD(tmp); 366 367 spin_lock(&bl->bl_ext_lock); 368 err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); 369 if (rw) { 370 err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp); 371 if (!err) 372 err = err2; 373 } 374 spin_unlock(&bl->bl_ext_lock); 375 376 __ext_put_deviceids(&tmp); 377 return err; 378 } 379 380 static int 381 ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, 382 sector_t split) 383 { 384 struct pnfs_block_extent *new; 385 sector_t orig_len = be->be_length; 386 387 new = kzalloc(sizeof(*new), GFP_ATOMIC); 388 if (!new) 389 return -ENOMEM; 390 391 be->be_length = split - be->be_f_offset; 392 393 new->be_f_offset = split; 394 if (be->be_state != PNFS_BLOCK_NONE_DATA) 395 new->be_v_offset = be->be_v_offset + be->be_length; 396 new->be_length = orig_len - be->be_length; 397 new->be_state = be->be_state; 398 new->be_tag = be->be_tag; 399 new->be_device = nfs4_get_deviceid(be->be_device); 400 401 __ext_tree_insert(root, new, false); 402 return 0; 403 } 404 405 int 406 ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, 407 sector_t len, u64 lwb) 408 { 409 struct rb_root *root = &bl->bl_ext_rw; 410 sector_t end = start + len; 411 struct pnfs_block_extent *be; 412 int err = 0; 413 LIST_HEAD(tmp); 414 415 spin_lock(&bl->bl_ext_lock); 416 /* 417 * First remove all COW extents or holes from written to range. 418 */ 419 err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp); 420 if (err) 421 goto out; 422 423 /* 424 * Then mark all invalid extents in the range as written to. 425 */ 426 for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) { 427 if (be->be_f_offset >= end) 428 break; 429 430 if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag) 431 continue; 432 433 if (be->be_f_offset < start) { 434 struct pnfs_block_extent *left = ext_tree_prev(be); 435 436 if (left && ext_can_merge(left, be)) { 437 sector_t diff = start - be->be_f_offset; 438 439 left->be_length += diff; 440 441 be->be_f_offset += diff; 442 be->be_v_offset += diff; 443 be->be_length -= diff; 444 } else { 445 err = ext_tree_split(root, be, start); 446 if (err) 447 goto out; 448 } 449 } 450 451 if (ext_f_end(be) > end) { 452 struct pnfs_block_extent *right = ext_tree_next(be); 453 454 if (right && ext_can_merge(be, right)) { 455 sector_t diff = end - be->be_f_offset; 456 457 be->be_length -= diff; 458 459 right->be_f_offset -= diff; 460 right->be_v_offset -= diff; 461 right->be_length += diff; 462 } else { 463 err = ext_tree_split(root, be, end); 464 if (err) 465 goto out; 466 } 467 } 468 469 if (be->be_f_offset >= start && ext_f_end(be) <= end) { 470 be->be_tag = EXTENT_WRITTEN; 471 be = ext_try_to_merge_left(root, be); 472 be = ext_try_to_merge_right(root, be); 473 } 474 } 475 out: 476 if (bl->bl_lwb < lwb) 477 bl->bl_lwb = lwb; 478 spin_unlock(&bl->bl_ext_lock); 479 480 __ext_put_deviceids(&tmp); 481 return err; 482 } 483 484 static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count) 485 { 486 if (bl->bl_scsi_layout) 487 return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count; 488 else 489 return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count; 490 } 491 492 static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, 493 size_t buffer_size) 494 { 495 if (arg->layoutupdate_pages != &arg->layoutupdate_page) { 496 int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; 497 498 for (i = 0; i < nr_pages; i++) 499 put_page(arg->layoutupdate_pages[i]); 500 vfree(arg->start_p); 501 kfree(arg->layoutupdate_pages); 502 } else { 503 put_page(arg->layoutupdate_page); 504 } 505 } 506 507 static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p) 508 { 509 p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, 510 NFS4_DEVICEID4_SIZE); 511 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); 512 p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); 513 p = xdr_encode_hyper(p, 0LL); 514 *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); 515 return p; 516 } 517 518 static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p) 519 { 520 p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); 521 return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); 522 } 523 524 /** 525 * ext_tree_try_encode_commit - try to encode all extents into the buffer 526 * @bl: pointer to the layout 527 * @p: pointer to the output buffer 528 * @buffer_size: size of the output buffer 529 * @count: output pointer to the number of encoded extents 530 * @lastbyte: output pointer to the last written byte 531 * 532 * Return values: 533 * %0: Success, all required extents encoded, outputs are valid 534 * %-ENOSPC: Buffer too small, nothing encoded, outputs are invalid 535 */ 536 static int 537 ext_tree_try_encode_commit(struct pnfs_block_layout *bl, __be32 *p, 538 size_t buffer_size, size_t *count, __u64 *lastbyte) 539 { 540 struct pnfs_block_extent *be; 541 542 spin_lock(&bl->bl_ext_lock); 543 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { 544 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 545 be->be_tag != EXTENT_WRITTEN) 546 continue; 547 548 (*count)++; 549 if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { 550 spin_unlock(&bl->bl_ext_lock); 551 return -ENOSPC; 552 } 553 } 554 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { 555 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 556 be->be_tag != EXTENT_WRITTEN) 557 continue; 558 559 if (bl->bl_scsi_layout) 560 p = encode_scsi_range(be, p); 561 else 562 p = encode_block_extent(be, p); 563 be->be_tag = EXTENT_COMMITTING; 564 } 565 *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; 566 bl->bl_lwb = 0; 567 spin_unlock(&bl->bl_ext_lock); 568 569 return 0; 570 } 571 572 /** 573 * ext_tree_encode_commit - encode as much as possible extents into the buffer 574 * @bl: pointer to the layout 575 * @p: pointer to the output buffer 576 * @buffer_size: size of the output buffer 577 * @count: output pointer to the number of encoded extents 578 * @lastbyte: output pointer to the last written byte 579 * 580 * Return values: 581 * %0: Success, all required extents encoded, outputs are valid 582 * %-ENOSPC: Buffer too small, some extents are encoded, outputs are valid 583 */ 584 static int 585 ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, 586 size_t buffer_size, size_t *count, __u64 *lastbyte) 587 { 588 struct pnfs_block_extent *be, *be_prev; 589 int ret = 0; 590 591 spin_lock(&bl->bl_ext_lock); 592 for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { 593 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 594 be->be_tag != EXTENT_WRITTEN) 595 continue; 596 597 (*count)++; 598 if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) { 599 (*count)--; 600 ret = -ENOSPC; 601 break; 602 } 603 604 if (bl->bl_scsi_layout) 605 p = encode_scsi_range(be, p); 606 else 607 p = encode_block_extent(be, p); 608 be->be_tag = EXTENT_COMMITTING; 609 be_prev = be; 610 } 611 if (!ret) { 612 *lastbyte = (bl->bl_lwb != 0) ? bl->bl_lwb - 1 : U64_MAX; 613 bl->bl_lwb = 0; 614 } else { 615 *lastbyte = be_prev->be_f_offset + be_prev->be_length; 616 *lastbyte <<= SECTOR_SHIFT; 617 *lastbyte -= 1; 618 } 619 spin_unlock(&bl->bl_ext_lock); 620 621 return ret; 622 } 623 624 /** 625 * ext_tree_prepare_commit - encode extents that need to be committed 626 * @arg: layout commit data 627 * 628 * Return values: 629 * %0: Success, all required extents are encoded 630 * %-ENOSPC: Some extents are encoded, but not all, due to RPC size limit 631 * %-ENOMEM: Out of memory, extents not encoded 632 */ 633 int 634 ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) 635 { 636 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); 637 size_t count = 0, buffer_size = PAGE_SIZE; 638 __be32 *start_p; 639 int ret; 640 641 arg->layoutupdate_page = alloc_page(GFP_NOFS); 642 if (!arg->layoutupdate_page) 643 return -ENOMEM; 644 start_p = page_address(arg->layoutupdate_page); 645 arg->layoutupdate_pages = &arg->layoutupdate_page; 646 647 ret = ext_tree_try_encode_commit(bl, start_p + 1, buffer_size, 648 &count, &arg->lastbytewritten); 649 if (unlikely(ret)) { 650 ext_tree_free_commitdata(arg, buffer_size); 651 652 buffer_size = NFS_SERVER(arg->inode)->wsize; 653 count = 0; 654 655 arg->layoutupdate_pages = 656 kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), 657 sizeof(struct page *), GFP_NOFS); 658 if (!arg->layoutupdate_pages) 659 return -ENOMEM; 660 661 start_p = __vmalloc(buffer_size, GFP_NOFS); 662 if (!start_p) { 663 kfree(arg->layoutupdate_pages); 664 return -ENOMEM; 665 } 666 667 ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, 668 &count, &arg->lastbytewritten); 669 } 670 671 *start_p = cpu_to_be32(count); 672 arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count); 673 674 if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { 675 void *p = start_p, *end = p + arg->layoutupdate_len; 676 struct page *page = NULL; 677 int i = 0; 678 679 arg->start_p = start_p; 680 for ( ; p < end; p += PAGE_SIZE) { 681 page = vmalloc_to_page(p); 682 arg->layoutupdate_pages[i++] = page; 683 get_page(page); 684 } 685 } 686 687 trace_bl_ext_tree_prepare_commit(ret, count, 688 arg->lastbytewritten, !!ret); 689 return ret; 690 } 691 692 void 693 ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status) 694 { 695 struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); 696 struct rb_root *root = &bl->bl_ext_rw; 697 struct pnfs_block_extent *be; 698 699 dprintk("%s status %d\n", __func__, status); 700 701 ext_tree_free_commitdata(arg, arg->layoutupdate_len); 702 703 spin_lock(&bl->bl_ext_lock); 704 for (be = ext_tree_first(root); be; be = ext_tree_next(be)) { 705 if (be->be_state != PNFS_BLOCK_INVALID_DATA || 706 be->be_tag != EXTENT_COMMITTING) 707 continue; 708 709 if (status) { 710 /* 711 * Mark as written and try again. 712 * 713 * XXX: some real error handling here wouldn't hurt.. 714 */ 715 be->be_tag = EXTENT_WRITTEN; 716 } else { 717 be->be_state = PNFS_BLOCK_READWRITE_DATA; 718 be->be_tag = 0; 719 } 720 721 be = ext_try_to_merge_left(root, be); 722 be = ext_try_to_merge_right(root, be); 723 } 724 spin_unlock(&bl->bl_ext_lock); 725 } 726