/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */

#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY		NFSDBG_PNFS_LD

static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
	return rb_entry(node, struct pnfs_block_extent, be_node);
}

static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
	struct rb_node *node = rb_first(root);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_prev(&be->be_node);
	return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
	struct rb_node *node = rb_next(&be->be_node);
	return node ? ext_node(node) : NULL;
}

/* first sector past the end of the extent's file range */
static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
	return be->be_f_offset + be->be_length;
}

/*
 * Return the extent containing @start, or failing that the first extent
 * starting after it.  Returns NULL if all extents end at or before @start.
 */
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
	struct rb_node *node = root->rb_node;
	struct pnfs_block_extent *be = NULL;

	while (node) {
		be = ext_node(node);
		if (start < be->be_f_offset)
			node = node->rb_left;
		else if (start >= ext_f_end(be))
			node = node->rb_right;
		else
			return be;
	}

	if (be) {
		if (start < be->be_f_offset)
			return be;

		if (start >= ext_f_end(be))
			return ext_tree_next(be);
	}

	return NULL;
}

/*
 * Two extents can be merged if they agree in state, device and (for
 * invalid extents) tag, and are contiguous both in the file and, unless
 * they describe a hole, on the volume.
 */
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
	if (be1->be_state != be2->be_state)
		return false;
	if (be1->be_device != be2->be_device)
		return false;

	if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
		return false;

	if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
	    (be1->be_v_offset + be1->be_length != be2->be_v_offset))
		return false;

	if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
	    be1->be_tag != be2->be_tag)
		return false;

	return true;
}

static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *left = ext_tree_prev(be);

	if (left && ext_can_merge(left, be)) {
		left->be_length += be->be_length;
		rb_erase(&be->be_node, root);
		nfs4_put_deviceid_node(be->be_device);
		kfree(be);
		return left;
	}

	return be;
}

static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
	struct pnfs_block_extent *right = ext_tree_next(be);

	if (right && ext_can_merge(be, right)) {
		be->be_length += right->be_length;
		rb_erase(&right->be_node, root);
		nfs4_put_deviceid_node(right->be_device);
		kfree(right);
	}

	return be;
}

/*
 * Insert @new into @root.  The caller must guarantee that @new does not
 * overlap any existing extent; if @merge_ok is set, @new is merged with
 * adjacent extents where possible.
 */
static void
__ext_tree_insert(struct rb_root *root,
		struct pnfs_block_extent *new, bool merge_ok)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;
	struct pnfs_block_extent *be;

	while (*p) {
		parent = *p;
		be = ext_node(parent);

		if (new->be_f_offset < be->be_f_offset) {
			if (merge_ok && ext_can_merge(new, be)) {
				be->be_f_offset = new->be_f_offset;
				if (be->be_state != PNFS_BLOCK_NONE_DATA)
					be->be_v_offset = new->be_v_offset;
				be->be_length += new->be_length;
				be = ext_try_to_merge_left(root, be);
				goto free_new;
			}
			p = &(*p)->rb_left;
		} else if (new->be_f_offset >= ext_f_end(be)) {
			if (merge_ok && ext_can_merge(be, new)) {
				be->be_length += new->be_length;
				be = ext_try_to_merge_right(root, be);
				goto free_new;
			}
			p = &(*p)->rb_right;
		} else {
			BUG();
		}
	}

	rb_link_node(&new->be_node, parent, p);
	rb_insert_color(&new->be_node, root);
	return;
free_new:
	nfs4_put_deviceid_node(new->be_device);
	kfree(new);
}

/*
 * Remove the range [start, end) from @root, trimming or splitting any
 * extent that straddles one of the boundaries.
 */
static int
__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
{
	struct pnfs_block_extent *be;
	sector_t len1 = 0, len2 = 0;
	sector_t orig_v_offset;
	sector_t orig_len;

	be = __ext_tree_search(root, start);
	if (!be)
		return 0;
	if (be->be_f_offset >= end)
		return 0;

	orig_v_offset = be->be_v_offset;
	orig_len = be->be_length;

	if (start > be->be_f_offset)
		len1 = start - be->be_f_offset;
	if (ext_f_end(be) > end)
		len2 = ext_f_end(be) - end;

	if (len2 > 0) {
		if (len1 > 0) {
			struct pnfs_block_extent *new;

			new = kzalloc(sizeof(*new), GFP_ATOMIC);
			if (!new)
				return -ENOMEM;

			be->be_length = len1;

			new->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				new->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			new->be_length = len2;
			new->be_state = be->be_state;
			new->be_tag = be->be_tag;
			new->be_device = nfs4_get_deviceid(be->be_device);

			__ext_tree_insert(root, new, true);
		} else {
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA) {
				be->be_v_offset =
					orig_v_offset + orig_len - len2;
			}
			be->be_length = len2;
		}
	} else {
		if (len1 > 0) {
			be->be_length = len1;
			be = ext_tree_next(be);
		}

		while (be && ext_f_end(be) <= end) {
			struct pnfs_block_extent *next = ext_tree_next(be);

			rb_erase(&be->be_node, root);
			nfs4_put_deviceid_node(be->be_device);
			kfree(be);
			be = next;
		}

		if (be && be->be_f_offset < end) {
			len1 = ext_f_end(be) - end;
			be->be_f_offset = end;
			if (be->be_state != PNFS_BLOCK_NONE_DATA)
				be->be_v_offset += be->be_length - len1;
			be->be_length = len1;
		}
	}

	return 0;
}

/*
 * Insert @new, trimming it against any extents it overlaps; existing
 * extents take precedence over the newly inserted one.
 */
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
	struct pnfs_block_extent *be;
	struct rb_root *root;
	int err = 0;

	switch (new->be_state) {
	case PNFS_BLOCK_READWRITE_DATA:
	case PNFS_BLOCK_INVALID_DATA:
		root = &bl->bl_ext_rw;
		break;
	case PNFS_BLOCK_READ_DATA:
	case PNFS_BLOCK_NONE_DATA:
		root = &bl->bl_ext_ro;
		break;
	default:
		dprintk("invalid extent type\n");
		return -EINVAL;
	}

	spin_lock(&bl->bl_ext_lock);
retry:
	be = __ext_tree_search(root, new->be_f_offset);
	if (!be || be->be_f_offset >= ext_f_end(new)) {
		__ext_tree_insert(root, new, true);
	} else if (new->be_f_offset >= be->be_f_offset) {
		if (ext_f_end(new) <= ext_f_end(be)) {
			nfs4_put_deviceid_node(new->be_device);
			kfree(new);
		} else {
			sector_t new_len = ext_f_end(new) - ext_f_end(be);
			sector_t diff = new->be_length - new_len;

			new->be_f_offset += diff;
			new->be_v_offset += diff;
			new->be_length = new_len;
			goto retry;
		}
	} else if (ext_f_end(new) <= ext_f_end(be)) {
		new->be_length = be->be_f_offset - new->be_f_offset;
		__ext_tree_insert(root, new, true);
	} else {
		struct pnfs_block_extent *split;
		sector_t new_len = ext_f_end(new) - ext_f_end(be);
		sector_t diff = new->be_length - new_len;

		split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
		if (!split) {
			err = -ENOMEM;
			goto out;
		}

		split->be_length = be->be_f_offset - split->be_f_offset;
		split->be_device = nfs4_get_deviceid(new->be_device);
		__ext_tree_insert(root, split, true);

		new->be_f_offset += diff;
		new->be_v_offset += diff;
		new->be_length = new_len;
		goto retry;
	}
out:
	spin_unlock(&bl->bl_ext_lock);
	return err;
}

static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
		struct pnfs_block_extent *ret)
{
	struct rb_node *node;
	struct pnfs_block_extent *be;

	node = root->rb_node;
	while (node) {
		be = ext_node(node);
		if (isect < be->be_f_offset)
			node = node->rb_left;
		else if (isect >= ext_f_end(be))
			node = node->rb_right;
		else {
			*ret = *be;
			return true;
		}
	}

	return false;
}

bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
		struct pnfs_block_extent *ret, bool rw)
{
	bool found = false;

	spin_lock(&bl->bl_ext_lock);
	if (!rw)
		found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
	if (!found)
		found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
	spin_unlock(&bl->bl_ext_lock);

	return found;
}

int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
		sector_t start, sector_t end)
{
	int err, err2;

	spin_lock(&bl->bl_ext_lock);
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
	if (rw) {
		err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
		if (!err)
			err = err2;
	}
	spin_unlock(&bl->bl_ext_lock);

	return err;
}

/*
 * Split @be at @split and insert the new right-hand extent into @root.
 */
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
		sector_t split)
{
	struct pnfs_block_extent *new;
	sector_t orig_len = be->be_length;

	new = kzalloc(sizeof(*new), GFP_ATOMIC);
	if (!new)
		return -ENOMEM;

	be->be_length = split - be->be_f_offset;

	new->be_f_offset = split;
	if (be->be_state != PNFS_BLOCK_NONE_DATA)
		new->be_v_offset = be->be_v_offset + be->be_length;
	new->be_length = orig_len - be->be_length;
	new->be_state = be->be_state;
	new->be_tag = be->be_tag;
	new->be_device = nfs4_get_deviceid(be->be_device);

	__ext_tree_insert(root, new, false);
	return 0;
}

int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
		sector_t len)
{
	struct rb_root *root = &bl->bl_ext_rw;
	sector_t end = start + len;
	struct pnfs_block_extent *be;
	int err = 0;

	spin_lock(&bl->bl_ext_lock);
	/*
	 * First remove all COW extents and holes from the written-to range.
	 */
	err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
	if (err)
		goto out;

	/*
	 * Then mark all invalid extents in the range as written to.
	 */
	for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
		if (be->be_f_offset >= end)
			break;

		if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
			continue;

		if (be->be_f_offset < start) {
			struct pnfs_block_extent *left = ext_tree_prev(be);

			if (left && ext_can_merge(left, be)) {
				sector_t diff = start - be->be_f_offset;

				left->be_length += diff;

				be->be_f_offset += diff;
				be->be_v_offset += diff;
				be->be_length -= diff;
			} else {
				err = ext_tree_split(root, be, start);
				if (err)
					goto out;
			}
		}

		if (ext_f_end(be) > end) {
			struct pnfs_block_extent *right = ext_tree_next(be);

			if (right && ext_can_merge(be, right)) {
				sector_t diff = end - be->be_f_offset;

				be->be_length -= diff;

				right->be_f_offset -= diff;
				right->be_v_offset -= diff;
				right->be_length += diff;
			} else {
				err = ext_tree_split(root, be, end);
				if (err)
					goto out;
			}
		}

		if (be->be_f_offset >= start && ext_f_end(be) <= end) {
			be->be_tag = EXTENT_WRITTEN;
			be = ext_try_to_merge_left(root, be);
			be = ext_try_to_merge_right(root, be);
		}
	}
out:
	spin_unlock(&bl->bl_ext_lock);
	return err;
}

static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
	if (bl->bl_scsi_layout)
		return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
	else
		return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}

static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
		size_t buffer_size)
{
	if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
		int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

		for (i = 0; i < nr_pages; i++)
			put_page(arg->layoutupdate_pages[i]);
		vfree(arg->start_p);
		kfree(arg->layoutupdate_pages);
	} else {
		put_page(arg->layoutupdate_page);
	}
}

static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
			NFS4_DEVICEID4_SIZE);
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
	p = xdr_encode_hyper(p, 0LL);
	*p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
	return p;
}

static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
	p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
	return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}

static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
		size_t buffer_size, size_t *count)
{
	struct pnfs_block_extent *be;
	int ret = 0;

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_WRITTEN)
			continue;

		(*count)++;
		if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
			/* keep counting.. */
			ret = -ENOSPC;
			continue;
		}

		if (bl->bl_scsi_layout)
			p = encode_scsi_range(be, p);
		else
			p = encode_block_extent(be, p);
		be->be_tag = EXTENT_COMMITTING;
	}
	spin_unlock(&bl->bl_ext_lock);

	return ret;
}

int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	size_t count = 0, buffer_size = PAGE_SIZE;
	__be32 *start_p;
	int ret;

	dprintk("%s enter\n", __func__);

	arg->layoutupdate_page = alloc_page(GFP_NOFS);
	if (!arg->layoutupdate_page)
		return -ENOMEM;
	start_p = page_address(arg->layoutupdate_page);
	arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
	ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count);
	if (unlikely(ret)) {
		ext_tree_free_commitdata(arg, buffer_size);

		/*
		 * The single page was too small; allocate a buffer large
		 * enough for all written extents and try again.
		 */
		buffer_size = ext_tree_layoutupdate_size(bl, count);
		count = 0;

		arg->layoutupdate_pages =
			kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
				sizeof(struct page *), GFP_NOFS);
		if (!arg->layoutupdate_pages)
			return -ENOMEM;

		start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
		if (!start_p) {
			kfree(arg->layoutupdate_pages);
			return -ENOMEM;
		}

		goto retry;
	}

	*start_p = cpu_to_be32(count);
	arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
		void *p = start_p, *end = p + arg->layoutupdate_len;
		struct page *page = NULL;
		int i = 0;

		arg->start_p = start_p;
		for ( ; p < end; p += PAGE_SIZE) {
			page = vmalloc_to_page(p);
			arg->layoutupdate_pages[i++] = page;
			get_page(page);
		}
	}

	dprintk("%s found %zu ranges\n", __func__, count);
	return 0;
}

void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
	struct rb_root *root = &bl->bl_ext_rw;
	struct pnfs_block_extent *be;

	dprintk("%s status %d\n", __func__, status);

	ext_tree_free_commitdata(arg, arg->layoutupdate_len);

	spin_lock(&bl->bl_ext_lock);
	for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
		if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
		    be->be_tag != EXTENT_COMMITTING)
			continue;

		if (status) {
			/*
			 * Mark as written and try again.
			 *
			 * XXX: some real error handling here wouldn't hurt..
			 */
			be->be_tag = EXTENT_WRITTEN;
		} else {
			be->be_state = PNFS_BLOCK_READWRITE_DATA;
			be->be_tag = 0;
		}

		be = ext_try_to_merge_left(root, be);
		be = ext_try_to_merge_right(root, be);
	}
	spin_unlock(&bl->bl_ext_lock);
}