/*
 *  linux/fs/nfs/blocklayout/blocklayout.c
 *
 *  Module for the NFSv4.1 pNFS block layout driver.
 *
 *  Copyright (c) 2006 The Regents of the University of Michigan.
 *  All rights reserved.
 *
 *  Andy Adamson <andros@citi.umich.edu>
 *  Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose. the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h>		/* struct bio */
#include <linux/buffer_head.h>	/* various write calls */
#include <linux/prefetch.h>

#include "../pnfs.h"
#include "../internal.h"
#include "blocklayout.h"

#define NFSDBG_FACILITY	NFSDBG_PNFS_LD

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");

static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);
	dprintk(" PagePrivate %d\n", PagePrivate(page));
	dprintk(" PageUptodate %d\n", PageUptodate(page));
	dprintk(" PageError %d\n", PageError(page));
	dprintk(" PageDirty %d\n", PageDirty(page));
	dprintk(" PageReferenced %d\n", PageReferenced(page));
	dprintk(" PageLocked %d\n", PageLocked(page));
	dprintk(" PageWriteback %d\n", PageWriteback(page));
	dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page));
	dprintk("\n");
}

/* Given the be associated with isect, determine if page data needs to be
 * initialized.
 */
static int is_hole(struct pnfs_block_extent *be, sector_t isect)
{
	if (be->be_state == PNFS_BLOCK_NONE_DATA)
		return 1;
	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
		return 0;
	else
		return !bl_is_sector_init(be->be_inval, isect);
}

/* Given the be associated with isect, determine if page data can be
 * written to disk.
 */
static int is_writable(struct pnfs_block_extent *be, sector_t isect)
{
	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
		be->be_state == PNFS_BLOCK_INVALID_DATA);
}

/* The data we are handed might be spread across several bios. We need
 * to track when the last one is finished.
 */
struct parallel_io {
	struct kref refcnt;
	void (*pnfs_callback) (void *data, int num_se);
	void *data;
	int bse_count;
};

static inline struct parallel_io *alloc_parallel(void *data)
{
	struct parallel_io *rv;

	rv = kmalloc(sizeof(*rv), GFP_NOFS);
	if (rv) {
		rv->data = data;
		kref_init(&rv->refcnt);
		rv->bse_count = 0;
	}
	return rv;
}

static inline void get_parallel(struct parallel_io *p)
{
	kref_get(&p->refcnt);
}

static void destroy_parallel(struct kref *kref)
{
	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);

	dprintk("%s enter\n", __func__);
	p->pnfs_callback(p->data, p->bse_count);
	kfree(p);
}

static inline void put_parallel(struct parallel_io *p)
{
	kref_put(&p->refcnt, destroy_parallel);
}

static struct bio *
bl_submit_bio(int rw, struct bio *bio)
{
	if (bio) {
		get_parallel(bio->bi_private);
		dprintk("%s submitting %s bio %u@%llu\n", __func__,
			rw == READ ? "read" : "write",
			bio->bi_size, (unsigned long long)bio->bi_sector);
		submit_bio(rw, bio);
	}
	return NULL;
}

static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
				     struct pnfs_block_extent *be,
				     void (*end_io)(struct bio *, int err),
				     struct parallel_io *par)
{
	struct bio *bio;

	npg = min(npg, BIO_MAX_PAGES);
	bio = bio_alloc(GFP_NOIO, npg);
	if (!bio && (current->flags & PF_MEMALLOC)) {
		while (!bio && (npg /= 2))
			bio = bio_alloc(GFP_NOIO, npg);
	}

	if (bio) {
		bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
		bio->bi_bdev = be->be_mdev;
		bio->bi_end_io = end_io;
		bio->bi_private = par;
	}
	return bio;
}

static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
				      sector_t isect, struct page *page,
				      struct pnfs_block_extent *be,
				      void (*end_io)(struct bio *, int err),
				      struct parallel_io *par)
{
retry:
	if (!bio) {
		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
		if (!bio)
			return ERR_PTR(-ENOMEM);
	}
	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
		bio = bl_submit_bio(rw, bio);
		goto retry;
	}
	return bio;
}

/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (uptodate)
			SetPageUptodate(page);
	} while (bvec >= bio->bi_io_vec);
	if (!uptodate) {
		struct nfs_read_data *rdata = par->data;
		struct nfs_pgio_header *header = rdata->header;

		if (!header->pnfs_error)
			header->pnfs_error = -EIO;
		pnfs_set_lo_fail(header->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

static void bl_read_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_read_data *rdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	rdata = container_of(task, struct nfs_read_data, task);
	pnfs_ld_read_done(rdata);
}

static void
bl_end_par_io_read(void *data, int unused)
{
	struct nfs_read_data *rdata = data;

	rdata->task.tk_status = rdata->header->pnfs_error;
	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
	schedule_work(&rdata->task.u.tk_work);
}

static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
	struct nfs_pgio_header *header = rdata->header;
	int i, hole;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, extent_length = 0;
	struct parallel_io *par;
	loff_t f_offset = rdata->args.offset;
	struct page **pages = rdata->args.pages;
	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;

	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
		rdata->pages.npages, f_offset, (unsigned int)rdata->args.count);

	par = alloc_parallel(rdata);
	if (!par)
		goto use_mds;
	par->pnfs_callback = bl_end_par_io_read;
	/* At this point, we can no longer jump to use_mds */

	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
	/* Code assumes extents are page-aligned */
	for (i = pg_index; i < rdata->pages.npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bl_put_extent(cow_read);
			bio = bl_submit_bio(READ, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
						isect, &cow_read);
			if (!be) {
				header->pnfs_error = -EIO;
				goto out;
			}
			extent_length = be->be_length -
				(isect - be->be_f_offset);
			if (cow_read) {
				sector_t cow_length = cow_read->be_length -
					(isect - cow_read->be_f_offset);
				extent_length = min(extent_length, cow_length);
			}
		}
		hole = is_hole(be, isect);
		if (hole && !cow_read) {
			bio = bl_submit_bio(READ, bio);
			/* Fill hole w/ zeroes w/o accessing device */
			dprintk("%s Zeroing page for hole\n", __func__);
			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
			print_page(pages[i]);
			SetPageUptodate(pages[i]);
		} else {
			struct pnfs_block_extent *be_read;

			be_read = (hole && cow_read) ? cow_read : be;
			bio = bl_add_page_to_bio(bio, rdata->pages.npages - i,
						 READ,
						 isect, pages[i], be_read,
						 bl_end_io_read, par);
			if (IS_ERR(bio)) {
				header->pnfs_error = PTR_ERR(bio);
				bio = NULL;
				goto out;
			}
		}
		isect += PAGE_CACHE_SECTORS;
		extent_length -= PAGE_CACHE_SECTORS;
	}
	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
		rdata->res.eof = 1;
		rdata->res.count = header->inode->i_size - f_offset;
	} else {
		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
	}
out:
	bl_put_extent(be);
	bl_put_extent(cow_read);
	bl_submit_bio(READ, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;

 use_mds:
	dprintk("Giving up and using normal NFS\n");
	return PNFS_NOT_ATTEMPTED;
}

static void mark_extents_written(struct pnfs_block_layout *bl,
				 __u64 offset, __u32 count)
{
	sector_t isect, end;
	struct pnfs_block_extent *be;
	struct pnfs_block_short_extent *se;

	dprintk("%s(%llu, %u)\n", __func__, offset, count);
	if (count == 0)
		return;
	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
	end >>= SECTOR_SHIFT;
	while (isect < end) {
		sector_t len;
		be = bl_find_get_extent(bl, isect, NULL);
		BUG_ON(!be); /* FIXME */
		len = min(end, be->be_f_offset + be->be_length) - isect;
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			se = bl_pop_one_short_extent(be->be_inval);
			BUG_ON(!se);
			bl_mark_for_commit(be, isect, len, se);
		}
		isect += len;
		bl_put_extent(be);
	}
}

static void bl_end_io_write_zero(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		/* This is the zeroing page we added */
		end_page_writeback(page);
		page_cache_release(page);
	} while (bvec >= bio->bi_io_vec);

	if (unlikely(!uptodate)) {
		struct nfs_write_data *data = par->data;
		struct nfs_pgio_header *header = data->header;

		if (!header->pnfs_error)
			header->pnfs_error = -EIO;
		pnfs_set_lo_fail(header->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

static void bl_end_io_write(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct nfs_write_data *data = par->data;
	struct nfs_pgio_header *header = data->header;

	if (!uptodate) {
		if (!header->pnfs_error)
			header->pnfs_error = -EIO;
		pnfs_set_lo_fail(header->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Function scheduled for call during bl_end_par_io_write,
 * it marks sectors as written and extends the commitlist.
 */
static void bl_write_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_write_data *wdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	wdata = container_of(task, struct nfs_write_data, task);
	if (likely(!wdata->header->pnfs_error)) {
		/* Marks for LAYOUTCOMMIT */
		mark_extents_written(BLK_LSEG2EXT(wdata->header->lseg),
				     wdata->args.offset, wdata->args.count);
	}
	pnfs_ld_write_done(wdata);
}

/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data, int num_se)
{
	struct nfs_write_data *wdata = data;

	if (unlikely(wdata->header->pnfs_error)) {
		bl_free_short_extents(&BLK_LSEG2EXT(wdata->header->lseg)->bl_inval,
					num_se);
	}

	wdata->task.tk_status = wdata->header->pnfs_error;
	wdata->verf.committed = NFS_FILE_SYNC;
	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
	schedule_work(&wdata->task.u.tk_work);
}

/* FIXME STUB - mark intersection of layout and page as bad, so is not
 * used again.
 */
static void mark_bad_read(void)
{
	return;
}

/*
 * map_block: map a requested I/O block (isect) into an offset in the LVM
 * block_device
 */
static void
map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
{
	dprintk("%s enter be=%p\n", __func__, be);

	set_buffer_mapped(bh);
	bh->b_bdev = be->be_mdev;
	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
		(be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);

	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
		bh->b_size);
	return;
}

/* Given an unmapped page, zero it or read in page for COW, page is locked
 * by caller.
 */
static int
init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
{
	struct buffer_head *bh = NULL;
	int ret = 0;
	sector_t isect;

	dprintk("%s enter, %p\n", __func__, page);
	BUG_ON(PageUptodate(page));
	if (!cow_read) {
		zero_user_segment(page, 0, PAGE_SIZE);
		SetPageUptodate(page);
		goto cleanup;
	}

	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
	if (!bh) {
		ret = -ENOMEM;
		goto cleanup;
	}

	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
	map_block(bh, isect, cow_read);
	if (!bh_uptodate_or_lock(bh))
		ret = bh_submit_read(bh);
	if (ret)
		goto cleanup;
	SetPageUptodate(page);

cleanup:
	bl_put_extent(cow_read);
	if (bh)
		free_buffer_head(bh);
	if (ret) {
		/* Need to mark layout with bad read...should now
		 * just use nfs4 for reads and writes.
		 */
		mark_bad_read();
	}
	return ret;
}

/* Find or create a zeroing page marked being writeback.
 * Return ERR_PTR on error, NULL to indicate skip this page and page itself
 * to indicate write out.
 */
static struct page *
bl_find_get_zeroing_page(struct inode *inode, pgoff_t index,
			 struct pnfs_block_extent *cow_read)
{
	struct page *page;
	int locked = 0;
	page = find_get_page(inode->i_mapping, index);
	if (page)
		goto check_page;

	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (unlikely(!page)) {
		dprintk("%s oom\n", __func__);
		return ERR_PTR(-ENOMEM);
	}
	locked = 1;

check_page:
	/* PageDirty: Other will write this out
	 * PageWriteback: Other is writing this out
	 * PageUptodate: It was read before
	 */
	if (PageDirty(page) || PageWriteback(page)) {
		print_page(page);
		if (locked)
			unlock_page(page);
		page_cache_release(page);
		return NULL;
	}

	if (!locked) {
		lock_page(page);
		locked = 1;
		goto check_page;
	}
	if (!PageUptodate(page)) {
		/* New page, readin or zero it */
		init_page_for_write(page, cow_read);
	}
	set_page_writeback(page);
	unlock_page(page);

	return page;
}

static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
	struct nfs_pgio_header *header = wdata->header;
	int i, ret, npg_zero, pg_index, last = 0;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, last_isect = 0, extent_length = 0;
	struct parallel_io *par;
	loff_t offset = wdata->args.offset;
	size_t count = wdata->args.count;
	struct page **pages = wdata->args.pages;
	struct page *page;
	pgoff_t index;
	u64 temp;
	int npg_per_block =
		NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;

	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
	 * We want to write each, and if there is an error set pnfs_error
	 * to have it redone using nfs.
	 */
	par = alloc_parallel(wdata);
	if (!par)
		goto out_mds;
	par->pnfs_callback = bl_end_par_io_write;
	/* At this point, have to be more careful with error handling */

	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
	be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read);
	if (!be || !is_writable(be, isect)) {
		dprintk("%s no matching extents!\n", __func__);
		goto out_mds;
	}

	/* First page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		if (likely(!bl_push_one_short_extent(be->be_inval)))
			par->bse_count++;
		else
			goto out_mds;
		temp = offset >> PAGE_CACHE_SHIFT;
		npg_zero = do_div(temp, npg_per_block);
		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
		extent_length = be->be_length - (isect - be->be_f_offset);

fill_invalid_ext:
		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
		for (;npg_zero > 0; npg_zero--) {
			if (bl_is_sector_init(be->be_inval, isect)) {
				dprintk("isect %llu already init\n",
					(unsigned long long)isect);
				goto next_page;
			}
			/* page ref released in bl_end_io_write_zero */
			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
			dprintk("%s zero %dth page: index %lu isect %llu\n",
				__func__, npg_zero, index,
				(unsigned long long)isect);
			page = bl_find_get_zeroing_page(header->inode, index,
							cow_read);
			if (unlikely(IS_ERR(page))) {
				header->pnfs_error = PTR_ERR(page);
				goto out;
			} else if (page == NULL)
				goto next_page;

			ret = bl_mark_sectors_init(be->be_inval, isect,
						   PAGE_CACHE_SECTORS);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				end_page_writeback(page);
				page_cache_release(page);
				header->pnfs_error = ret;
				goto out;
			}
			if (likely(!bl_push_one_short_extent(be->be_inval)))
				par->bse_count++;
			else {
				end_page_writeback(page);
				page_cache_release(page);
				header->pnfs_error = -ENOMEM;
				goto out;
			}
			/* FIXME: This should be done in bi_end_io */
			mark_extents_written(BLK_LSEG2EXT(header->lseg),
					     page->index << PAGE_CACHE_SHIFT,
					     PAGE_CACHE_SIZE);

			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
						 isect, page, be,
						 bl_end_io_write_zero, par);
			if (IS_ERR(bio)) {
				header->pnfs_error = PTR_ERR(bio);
				bio = NULL;
				goto out;
			}
next_page:
			isect += PAGE_CACHE_SECTORS;
			extent_length -= PAGE_CACHE_SECTORS;
		}
		if (last)
			goto write_done;
	}
	bio = bl_submit_bio(WRITE, bio);

	/* Middle pages */
	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
	for (i = pg_index; i < wdata->pages.npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bio = bl_submit_bio(WRITE, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg),
						isect, NULL);
			if (!be || !is_writable(be, isect)) {
				header->pnfs_error = -EINVAL;
				goto out;
			}
			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
				if (likely(!bl_push_one_short_extent(
								be->be_inval)))
					par->bse_count++;
				else {
					header->pnfs_error = -ENOMEM;
					goto out;
				}
			}
			extent_length = be->be_length -
				(isect - be->be_f_offset);
		}
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			ret = bl_mark_sectors_init(be->be_inval, isect,
						   PAGE_CACHE_SECTORS);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				header->pnfs_error = ret;
				goto out;
			}
		}
		bio = bl_add_page_to_bio(bio, wdata->pages.npages - i, WRITE,
					 isect, pages[i], be,
					 bl_end_io_write, par);
		if (IS_ERR(bio)) {
			header->pnfs_error = PTR_ERR(bio);
			bio = NULL;
			goto out;
		}
		isect += PAGE_CACHE_SECTORS;
		last_isect = isect;
		extent_length -= PAGE_CACHE_SECTORS;
	}

	/* Last page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		bio = bl_submit_bio(WRITE, bio);
		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
		npg_zero = npg_per_block - do_div(temp, npg_per_block);
		if (npg_zero < npg_per_block) {
			last = 1;
			goto fill_invalid_ext;
		}
	}

write_done:
	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
	if (count < wdata->res.count) {
		wdata->res.count = count;
	}
out:
	bl_put_extent(be);
	bl_submit_bio(WRITE, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;
out_mds:
	bl_put_extent(be);
	kfree(par);
	return PNFS_NOT_ATTEMPTED;
}

/* FIXME - range ignored */
static void
release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
{
	int i;
	struct pnfs_block_extent *be;

	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		while (!list_empty(&bl->bl_extents[i])) {
			be = list_first_entry(&bl->bl_extents[i],
					      struct pnfs_block_extent,
					      be_node);
			list_del(&be->be_node);
			bl_put_extent(be);
		}
	}
	spin_unlock(&bl->bl_ext_lock);
}

static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
	struct pnfs_inval_tracking *pos, *temp;
	struct pnfs_block_short_extent *se, *stemp;

	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
		list_del(&pos->it_link);
		kfree(pos);
	}

	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
		list_del(&se->bse_node);
		kfree(se);
	}
	return;
}

static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	release_extents(bl, NULL);
	release_inval_marks(&bl->bl_inval);
	kfree(bl);
}

static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
						   gfp_t gfp_flags)
{
	struct pnfs_block_layout *bl;

	dprintk("%s enter\n", __func__);
	bl = kzalloc(sizeof(*bl), gfp_flags);
	if (!bl)
		return NULL;
	spin_lock_init(&bl->bl_ext_lock);
	INIT_LIST_HEAD(&bl->bl_extents[0]);
	INIT_LIST_HEAD(&bl->bl_extents[1]);
	INIT_LIST_HEAD(&bl->bl_commit);
	INIT_LIST_HEAD(&bl->bl_committing);
	bl->bl_count = 0;
	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
	return &bl->bl_layout;
}

static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);
	kfree(lseg);
}

/* We pretty much ignore lseg, and store all data layout wide, so we
 * can correctly merge.
 */
static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
						 struct nfs4_layoutget_res *lgr,
						 gfp_t gfp_flags)
{
	struct pnfs_layout_segment *lseg;
	int status;

	dprintk("%s enter\n", __func__);
	lseg = kzalloc(sizeof(*lseg), gfp_flags);
	if (!lseg)
		return ERR_PTR(-ENOMEM);
	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
	if (status) {
		/* We don't want to call the full-blown bl_free_lseg,
		 * since on error extents were not touched.
		 */
		kfree(lseg);
		return ERR_PTR(status);
	}
	return lseg;
}

static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
}

static void
bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
{
	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;

	dprintk("%s enter\n", __func__);
	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
}

static void free_blk_mountid(struct block_mount_id *mid)
{
	if (mid) {
		struct pnfs_block_dev *dev, *tmp;

		/* No need to take bm_lock as we are last user freeing bm_devlist */
		list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) {
			list_del(&dev->bm_node);
			bl_free_block_dev(dev);
		}
		kfree(mid);
	}
}

/* This is mostly copied from the filelayout's get_device_info function.
 * It seems much of this should be at the generic pnfs level.
 */
static struct pnfs_block_dev *
nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
			struct nfs4_deviceid *d_id)
{
	struct pnfs_device *dev;
	struct pnfs_block_dev *rv;
	u32 max_resp_sz;
	int max_pages;
	struct page **pages = NULL;
	int i, rc;

	/*
	 * Use the session max response size as the basis for setting
	 * GETDEVICEINFO's maxcount
	 */
	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
	max_pages = nfs_page_array_len(0, max_resp_sz);
	dprintk("%s max_resp_sz %u max_pages %d\n",
		__func__, max_resp_sz, max_pages);

	dev = kmalloc(sizeof(*dev), GFP_NOFS);
	if (!dev) {
		dprintk("%s kmalloc failed\n", __func__);
		return ERR_PTR(-ENOMEM);
	}

	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
	if (pages == NULL) {
		kfree(dev);
		return ERR_PTR(-ENOMEM);
	}
	for (i = 0; i < max_pages; i++) {
		pages[i] = alloc_page(GFP_NOFS);
		if (!pages[i]) {
			rv = ERR_PTR(-ENOMEM);
			goto out_free;
		}
	}

	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
	dev->layout_type = LAYOUT_BLOCK_VOLUME;
	dev->pages = pages;
	dev->pgbase = 0;
	dev->pglen = PAGE_SIZE * max_pages;
	dev->mincount = 0;

	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
	rc = nfs4_proc_getdeviceinfo(server, dev);
	dprintk("%s getdevice info returns %d\n", __func__, rc);
	if (rc) {
		rv = ERR_PTR(rc);
		goto out_free;
	}

	rv = nfs4_blk_decode_device(server, dev);
 out_free:
	for (i = 0; i < max_pages; i++)
		__free_page(pages[i]);
	kfree(pages);
	kfree(dev);
	return rv;
}

static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
	struct block_mount_id *b_mt_id = NULL;
	struct pnfs_devicelist *dlist = NULL;
	struct pnfs_block_dev *bdev;
	LIST_HEAD(block_disklist);
	int status, i;

	dprintk("%s enter\n", __func__);

	if (server->pnfs_blksize == 0) {
		dprintk("%s Server did not return blksize\n", __func__);
		return -EINVAL;
	}
	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
	if (!b_mt_id) {
		status = -ENOMEM;
		goto out_error;
	}
	/* Initialize nfs4 block layout mount id */
	spin_lock_init(&b_mt_id->bm_lock);
	INIT_LIST_HEAD(&b_mt_id->bm_devlist);

	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
	if (!dlist) {
		status = -ENOMEM;
		goto out_error;
	}
	dlist->eof = 0;
	while (!dlist->eof) {
		status = nfs4_proc_getdevicelist(server, fh, dlist);
		if (status)
			goto out_error;
		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
			__func__, dlist->num_devs, dlist->eof);
		for (i = 0; i < dlist->num_devs; i++) {
			bdev = nfs4_blk_get_deviceinfo(server, fh,
						       &dlist->dev_id[i]);
			if (IS_ERR(bdev)) {
				status = PTR_ERR(bdev);
				goto out_error;
			}
			spin_lock(&b_mt_id->bm_lock);
			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
			spin_unlock(&b_mt_id->bm_lock);
		}
	}
	dprintk("%s SUCCESS\n", __func__);
	server->pnfs_ld_data = b_mt_id;

 out_return:
	kfree(dlist);
	return status;

 out_error:
	free_blk_mountid(b_mt_id);
	goto out_return;
}

static int
bl_clear_layoutdriver(struct nfs_server *server)
{
	struct block_mount_id *b_mt_id = server->pnfs_ld_data;

	dprintk("%s enter\n", __func__);
	free_blk_mountid(b_mt_id);
	dprintk("%s RETURNS\n", __func__);
	return 0;
}

static const struct nfs_pageio_ops bl_pg_read_ops = {
	.pg_init = pnfs_generic_pg_init_read,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_readpages,
};

static const struct nfs_pageio_ops bl_pg_write_ops = {
	.pg_init = pnfs_generic_pg_init_write,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};

static struct pnfs_layoutdriver_type blocklayout_type = {
	.id				= LAYOUT_BLOCK_VOLUME,
	.name				= "LAYOUT_BLOCK_VOLUME",
	.read_pagelist			= bl_read_pagelist,
	.write_pagelist			= bl_write_pagelist,
	.alloc_layout_hdr		= bl_alloc_layout_hdr,
	.free_layout_hdr		= bl_free_layout_hdr,
	.alloc_lseg			= bl_alloc_lseg,
	.free_lseg			= bl_free_lseg,
	.encode_layoutcommit		= bl_encode_layoutcommit,
	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
	.set_layoutdriver		= bl_set_layoutdriver,
	.clear_layoutdriver		= bl_clear_layoutdriver,
	.pg_read_ops			= &bl_pg_read_ops,
	.pg_write_ops			= &bl_pg_write_ops,
};

static const struct rpc_pipe_ops bl_upcall_ops = {
	.upcall		= rpc_pipe_generic_upcall,
	.downcall	= bl_pipe_downcall,
	.destroy_msg	= bl_pipe_destroy_msg,
};

static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb,
						  struct rpc_pipe *pipe)
{
	struct dentry *dir, *dentry;

	dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME);
	if (dir == NULL)
		return ERR_PTR(-ENOENT);
	dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe);
	dput(dir);
	return dentry;
}

static void nfs4blocklayout_unregister_sb(struct super_block *sb,
					  struct rpc_pipe *pipe)
{
	if (pipe->dentry)
		rpc_unlink(pipe->dentry);
}

static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event,
			    void *ptr)
{
	struct super_block *sb = ptr;
	struct net *net = sb->s_fs_info;
	struct nfs_net *nn = net_generic(net, nfs_net_id);
	struct dentry *dentry;
	int ret = 0;

	if (!try_module_get(THIS_MODULE))
		return 0;

	if (nn->bl_device_pipe == NULL) {
		module_put(THIS_MODULE);
		return 0;
	}

	switch (event) {
	case RPC_PIPEFS_MOUNT:
		dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe);
		if (IS_ERR(dentry)) {
			ret = PTR_ERR(dentry);
			break;
		}
		nn->bl_device_pipe->dentry = dentry;
		break;
	case RPC_PIPEFS_UMOUNT:
		if (nn->bl_device_pipe->dentry)
			nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe);
		break;
	default:
		ret = -ENOTSUPP;
		break;
	}
	module_put(THIS_MODULE);
	return ret;
}

static struct notifier_block nfs4blocklayout_block = {
	.notifier_call = rpc_pipefs_event,
};

static struct dentry *nfs4blocklayout_register_net(struct net *net,
						   struct rpc_pipe *pipe)
{
	struct super_block *pipefs_sb;
	struct dentry *dentry;

	pipefs_sb = rpc_get_sb_net(net);
	if (!pipefs_sb)
		return NULL;
	dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe);
	rpc_put_sb_net(net);
	return dentry;
}

static void nfs4blocklayout_unregister_net(struct net *net,
					   struct rpc_pipe *pipe)
{
	struct super_block *pipefs_sb;

	pipefs_sb = rpc_get_sb_net(net);
	if (pipefs_sb) {
		nfs4blocklayout_unregister_sb(pipefs_sb, pipe);
		rpc_put_sb_net(net);
	}
}

static int nfs4blocklayout_net_init(struct net *net)
{
	struct nfs_net *nn = net_generic(net, nfs_net_id);
	struct dentry *dentry;

	init_waitqueue_head(&nn->bl_wq);
	nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0);
	if (IS_ERR(nn->bl_device_pipe))
		return PTR_ERR(nn->bl_device_pipe);
	dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe);
	if (IS_ERR(dentry)) {
		rpc_destroy_pipe_data(nn->bl_device_pipe);
		return PTR_ERR(dentry);
	}
	nn->bl_device_pipe->dentry = dentry;
	return 0;
}

static void nfs4blocklayout_net_exit(struct net *net)
{
	struct nfs_net *nn = net_generic(net, nfs_net_id);

	nfs4blocklayout_unregister_net(net, nn->bl_device_pipe);
	rpc_destroy_pipe_data(nn->bl_device_pipe);
	nn->bl_device_pipe = NULL;
}

static struct pernet_operations nfs4blocklayout_net_ops = {
	.init = nfs4blocklayout_net_init,
	.exit = nfs4blocklayout_net_exit,
};

static int __init nfs4blocklayout_init(void)
{
	int ret;

	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);

	ret = pnfs_register_layoutdriver(&blocklayout_type);
	if (ret)
		goto out;

	ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block);
	if (ret)
		goto out_remove;
	ret = register_pernet_subsys(&nfs4blocklayout_net_ops);
	if (ret)
		goto out_notifier;
out:
	return ret;

out_notifier:
	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
out_remove:
	pnfs_unregister_layoutdriver(&blocklayout_type);
	return ret;
}

static void __exit nfs4blocklayout_exit(void)
{
	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
		__func__);

	rpc_pipefs_notifier_unregister(&nfs4blocklayout_block);
	unregister_pernet_subsys(&nfs4blocklayout_net_ops);
	pnfs_unregister_layoutdriver(&blocklayout_type);
}

MODULE_ALIAS("nfs-layouttype4-3");

module_init(nfs4blocklayout_init);
module_exit(nfs4blocklayout_exit);