/*
 *  linux/fs/nfs/blocklayout/blocklayout.c
 *
 *  Module for the NFSv4.1 pNFS block layout driver.
 *
 *  Copyright (c) 2006 The Regents of the University of Michigan.
 *  All rights reserved.
 *
 *  Andy Adamson <andros@citi.umich.edu>
 *  Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization.  if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose.  the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h>		/* struct bio */
#include <linux/buffer_head.h>	/* various write calls */

#include "blocklayout.h"

#define NFSDBG_FACILITY	NFSDBG_PNFS_LD

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");

struct dentry *bl_device_pipe;
wait_queue_head_t bl_wq;

static void print_page(struct page *page)
{
	dprintk("PRINTPAGE page %p\n", page);
	dprintk("	PagePrivate %d\n", PagePrivate(page));
	dprintk("	PageUptodate %d\n", PageUptodate(page));
	dprintk("	PageError %d\n", PageError(page));
	dprintk("	PageDirty %d\n", PageDirty(page));
	dprintk("	PageReferenced %d\n", PageReferenced(page));
	dprintk("	PageLocked %d\n", PageLocked(page));
	dprintk("	PageWriteback %d\n", PageWriteback(page));
	dprintk("	PageMappedToDisk %d\n", PageMappedToDisk(page));
	dprintk("\n");
}

/* Given the be associated with isect, determine if page data needs to be
 * initialized.
 */
static int is_hole(struct pnfs_block_extent *be, sector_t isect)
{
	if (be->be_state == PNFS_BLOCK_NONE_DATA)
		return 1;
	else if (be->be_state != PNFS_BLOCK_INVALID_DATA)
		return 0;
	else
		return !bl_is_sector_init(be->be_inval, isect);
}

/* Given the be associated with isect, determine if page data can be
 * written to disk.
 */
static int is_writable(struct pnfs_block_extent *be, sector_t isect)
{
	return (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
		be->be_state == PNFS_BLOCK_INVALID_DATA);
}

/* The data we are handed might be spread across several bios.  We need
 * to track when the last one is finished.
 */
struct parallel_io {
	struct kref refcnt;
	struct rpc_call_ops call_ops;
	void (*pnfs_callback) (void *data);
	void *data;
};

static inline struct parallel_io *alloc_parallel(void *data)
{
	struct parallel_io *rv;

	rv = kmalloc(sizeof(*rv), GFP_NOFS);
	if (rv) {
		rv->data = data;
		kref_init(&rv->refcnt);
	}
	return rv;
}

static inline void get_parallel(struct parallel_io *p)
{
	kref_get(&p->refcnt);
}

static void destroy_parallel(struct kref *kref)
{
	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);

	dprintk("%s enter\n", __func__);
	p->pnfs_callback(p->data);
	kfree(p);
}

static inline void put_parallel(struct parallel_io *p)
{
	kref_put(&p->refcnt, destroy_parallel);
}

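/* Submit bio (if non-NULL), taking a parallel_io reference so the
 * completion accounting covers it, and return NULL so the caller can
 * simply reset its bio pointer.
 */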
static struct bio *
bl_submit_bio(int rw, struct bio *bio)
{
	if (bio) {
		get_parallel(bio->bi_private);
		dprintk("%s submitting %s bio %u@%llu\n", __func__,
			rw == READ ? "read" : "write",
			bio->bi_size, (unsigned long long)bio->bi_sector);
		submit_bio(rw, bio);
	}
	return NULL;
}

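/* Allocate a bio that can hold up to npg pages, translating isect from
 * the file's logical sectors into the volume's sectors via the extent's
 * offsets.
 */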
static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
				     struct pnfs_block_extent *be,
				     void (*end_io)(struct bio *, int err),
				     struct parallel_io *par)
{
	struct bio *bio;

	bio = bio_alloc(GFP_NOIO, npg);
	if (!bio)
		return NULL;

	bio->bi_sector = isect - be->be_f_offset + be->be_v_offset;
	bio->bi_bdev = be->be_mdev;
	bio->bi_end_io = end_io;
	bio->bi_private = par;
	return bio;
}

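/* Add page to bio, allocating a new bio first if needed.  If the current
 * bio is full, submit it and retry with a fresh one.
 */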
static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw,
				      sector_t isect, struct page *page,
				      struct pnfs_block_extent *be,
				      void (*end_io)(struct bio *, int err),
				      struct parallel_io *par)
{
retry:
	if (!bio) {
		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
		if (!bio)
			return ERR_PTR(-ENOMEM);
	}
	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
		bio = bl_submit_bio(rw, bio);
		goto retry;
	}
	return bio;
}

static void bl_set_lo_fail(struct pnfs_layout_segment *lseg)
{
	if (lseg->pls_range.iomode == IOMODE_RW) {
		dprintk("%s Setting layout IOMODE_RW fail bit\n", __func__);
		set_bit(lo_fail_bit(IOMODE_RW), &lseg->pls_layout->plh_flags);
	} else {
		dprintk("%s Setting layout IOMODE_READ fail bit\n", __func__);
		set_bit(lo_fail_bit(IOMODE_READ), &lseg->pls_layout->plh_flags);
	}
}

/* This is basically copied from mpage_end_io_read */
static void bl_end_io_read(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct nfs_read_data *rdata = (struct nfs_read_data *)par->data;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		if (uptodate)
			SetPageUptodate(page);
	} while (bvec >= bio->bi_io_vec);
	if (!uptodate) {
		if (!rdata->pnfs_error)
			rdata->pnfs_error = -EIO;
		bl_set_lo_fail(rdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

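/* Final read cleanup, deferred to a workqueue because the last bio
 * completes in interrupt context, which is no place to call
 * pnfs_ld_read_done().
 */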
static void bl_read_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_read_data *rdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	rdata = container_of(task, struct nfs_read_data, task);
	pnfs_ld_read_done(rdata);
}

static void
bl_end_par_io_read(void *data)
{
	struct nfs_read_data *rdata = data;

	INIT_WORK(&rdata->task.u.tk_work, bl_read_cleanup);
	schedule_work(&rdata->task.u.tk_work);
}

/* We don't want the normal .rpc_call_done callback used, so we replace it
 * with this stub.
 */
static void bl_rpc_do_nothing(struct rpc_task *task, void *calldata)
{
	return;
}

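/* Read the requested pages: walk the page list, mapping each page through
 * a block extent.  Holes are zero-filled in place (or read from the
 * copy-on-write extent, if one exists); everything else is batched into
 * bios whose completion is tracked by a parallel_io.
 */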
static enum pnfs_try_status
bl_read_pagelist(struct nfs_read_data *rdata)
{
	int i, hole;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, extent_length = 0;
	struct parallel_io *par;
	loff_t f_offset = rdata->args.offset;
	size_t count = rdata->args.count;
	struct page **pages = rdata->args.pages;
	int pg_index = rdata->args.pgbase >> PAGE_CACHE_SHIFT;

	dprintk("%s enter nr_pages %u offset %lld count %Zd\n", __func__,
	       rdata->npages, f_offset, count);

	par = alloc_parallel(rdata);
	if (!par)
		goto use_mds;
	par->call_ops = *rdata->mds_ops;
	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
	par->pnfs_callback = bl_end_par_io_read;
	/* At this point, we can no longer jump to use_mds */

	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
	/* Code assumes extents are page-aligned */
	for (i = pg_index; i < rdata->npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bl_put_extent(cow_read);
			bio = bl_submit_bio(READ, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(rdata->lseg),
					     isect, &cow_read);
			if (!be) {
				rdata->pnfs_error = -EIO;
				goto out;
			}
			extent_length = be->be_length -
				(isect - be->be_f_offset);
			if (cow_read) {
				sector_t cow_length = cow_read->be_length -
					(isect - cow_read->be_f_offset);
				extent_length = min(extent_length, cow_length);
			}
		}
		hole = is_hole(be, isect);
		if (hole && !cow_read) {
			bio = bl_submit_bio(READ, bio);
			/* Fill hole w/ zeroes w/o accessing device */
			dprintk("%s Zeroing page for hole\n", __func__);
			zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
			print_page(pages[i]);
			SetPageUptodate(pages[i]);
		} else {
			struct pnfs_block_extent *be_read;

			be_read = (hole && cow_read) ? cow_read : be;
			bio = bl_add_page_to_bio(bio, rdata->npages - i, READ,
						 isect, pages[i], be_read,
						 bl_end_io_read, par);
			if (IS_ERR(bio)) {
				rdata->pnfs_error = PTR_ERR(bio);
				goto out;
			}
		}
		isect += PAGE_CACHE_SECTORS;
		extent_length -= PAGE_CACHE_SECTORS;
	}
	if ((isect << SECTOR_SHIFT) >= rdata->inode->i_size) {
		rdata->res.eof = 1;
		rdata->res.count = rdata->inode->i_size - f_offset;
	} else {
		rdata->res.count = (isect << SECTOR_SHIFT) - f_offset;
	}
out:
	bl_put_extent(be);
	bl_put_extent(cow_read);
	bl_submit_bio(READ, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;

 use_mds:
	dprintk("Giving up and using normal NFS\n");
	return PNFS_NOT_ATTEMPTED;
}

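/* Round the byte range out to page boundaries and mark every
 * INVALID_DATA extent it covers for inclusion in the next LAYOUTCOMMIT.
 */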
static void mark_extents_written(struct pnfs_block_layout *bl,
				 __u64 offset, __u32 count)
{
	sector_t isect, end;
	struct pnfs_block_extent *be;

	dprintk("%s(%llu, %u)\n", __func__, offset, count);
	if (count == 0)
		return;
	isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT;
	end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK);
	end >>= SECTOR_SHIFT;
	while (isect < end) {
		sector_t len;
		be = bl_find_get_extent(bl, isect, NULL);
		BUG_ON(!be); /* FIXME */
		len = min(end, be->be_f_offset + be->be_length) - isect;
		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
			bl_mark_for_commit(be, isect, len); /* What if this fails? */
		isect += len;
		bl_put_extent(be);
	}
}

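/* Completion for the bios carrying the extra zeroing pages: end
 * writeback and drop the page-cache reference taken in
 * bl_write_pagelist().
 */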
static void bl_end_io_write_zero(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;

	do {
		struct page *page = bvec->bv_page;

		if (--bvec >= bio->bi_io_vec)
			prefetchw(&bvec->bv_page->flags);
		/* This is the zeroing page we added */
		end_page_writeback(page);
		page_cache_release(page);
	} while (bvec >= bio->bi_io_vec);
	if (!uptodate) {
		if (!wdata->pnfs_error)
			wdata->pnfs_error = -EIO;
		bl_set_lo_fail(wdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Completion for ordinary data-page write bios; just note any error */
static void bl_end_io_write(struct bio *bio, int err)
{
	struct parallel_io *par = bio->bi_private;
	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
	struct nfs_write_data *wdata = (struct nfs_write_data *)par->data;

	if (!uptodate) {
		if (!wdata->pnfs_error)
			wdata->pnfs_error = -EIO;
		bl_set_lo_fail(wdata->lseg);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Function scheduled for execution by bl_end_par_io_write();
 * it marks the written sectors and extends the commitlist.
 */
static void bl_write_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_write_data *wdata;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	wdata = container_of(task, struct nfs_write_data, task);
	if (!wdata->pnfs_error) {
		/* Marks for LAYOUTCOMMIT */
		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
				     wdata->args.offset, wdata->args.count);
	}
	pnfs_ld_write_done(wdata);
}

/* Called when the last of the bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data)
{
	struct nfs_write_data *wdata = data;

	wdata->task.tk_status = 0;
	wdata->verf.committed = NFS_FILE_SYNC;
	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
	schedule_work(&wdata->task.u.tk_work);
}

/* FIXME STUB - mark intersection of layout and page as bad, so it is not
 * used again.
 */
static void mark_bad_read(void)
{
	return;
}

/*
 * map_block:  map a requested I/O block (isect) into an offset in the LVM
 * block_device
 */
static void
map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be)
{
	dprintk("%s enter be=%p\n", __func__, be);

	set_buffer_mapped(bh);
	bh->b_bdev = be->be_mdev;
	bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >>
	    (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT);

	dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n",
		__func__, (unsigned long long)isect, (long)bh->b_blocknr,
		bh->b_size);
	return;
}

/* Given an unmapped page, zero it (or read it in for COW); the page is
 * locked by the caller.
 */
static int
init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read)
{
	struct buffer_head *bh = NULL;
	int ret = 0;
	sector_t isect;

	dprintk("%s enter, %p\n", __func__, page);
	BUG_ON(PageUptodate(page));
	if (!cow_read) {
		zero_user_segment(page, 0, PAGE_SIZE);
		SetPageUptodate(page);
		goto cleanup;
	}

	bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0);
	if (!bh) {
		ret = -ENOMEM;
		goto cleanup;
	}

	isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT;
	map_block(bh, isect, cow_read);
	if (!bh_uptodate_or_lock(bh))
		ret = bh_submit_read(bh);
	if (ret)
		goto cleanup;
	SetPageUptodate(page);

cleanup:
	bl_put_extent(cow_read);
	if (bh)
		free_buffer_head(bh);
	if (ret) {
		/* Need to mark layout with bad read...should now
		 * just use nfs4 for reads and writes.
		 */
		mark_bad_read();
	}
	return ret;
}

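/* Write the supplied pages in three phases: zero-fill the pages that
 * precede the write within an INVALID_DATA block, write the data pages
 * themselves, then zero-fill the pages that follow the write up to the
 * end of the block.
 */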
static enum pnfs_try_status
bl_write_pagelist(struct nfs_write_data *wdata, int sync)
{
	int i, ret, npg_zero, pg_index, last = 0;
	struct bio *bio = NULL;
	struct pnfs_block_extent *be = NULL, *cow_read = NULL;
	sector_t isect, last_isect = 0, extent_length = 0;
	struct parallel_io *par;
	loff_t offset = wdata->args.offset;
	size_t count = wdata->args.count;
	struct page **pages = wdata->args.pages;
	struct page *page;
	pgoff_t index;
	u64 temp;
	int npg_per_block =
	    NFS_SERVER(wdata->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT;

	dprintk("%s enter, %Zu@%lld\n", __func__, count, offset);
	/* At this point, wdata->pages is a (sequential) list of nfs_pages.
	 * We want to write each one, and if there is an error, set pnfs_error
	 * so the I/O is redone using normal NFS.
	 */
	par = alloc_parallel(wdata);
	if (!par)
		return PNFS_NOT_ATTEMPTED;
	par->call_ops = *wdata->mds_ops;
	par->call_ops.rpc_call_done = bl_rpc_do_nothing;
	par->pnfs_callback = bl_end_par_io_write;
	/* At this point, have to be more careful with error handling */

	isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
	if (!be || !is_writable(be, isect)) {
		dprintk("%s no matching extents!\n", __func__);
		wdata->pnfs_error = -EINVAL;
		goto out;
	}

	/* First page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		temp = offset >> PAGE_CACHE_SHIFT;
		npg_zero = do_div(temp, npg_per_block);
		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
				     (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT);
		extent_length = be->be_length - (isect - be->be_f_offset);

fill_invalid_ext:
		dprintk("%s need to zero %d pages\n", __func__, npg_zero);
		for (; npg_zero > 0; npg_zero--) {
			/* page ref released in bl_end_io_write_zero */
			index = isect >> PAGE_CACHE_SECTOR_SHIFT;
			dprintk("%s zero %dth page: index %lu isect %llu\n",
				__func__, npg_zero, index,
				(unsigned long long)isect);
			page =
			    find_or_create_page(wdata->inode->i_mapping, index,
						GFP_NOFS);
			if (!page) {
				dprintk("%s oom\n", __func__);
				wdata->pnfs_error = -ENOMEM;
				goto out;
			}

			/* PageDirty: someone else will write this out
			 * PageWriteback: someone else is writing this out
			 * PageUptodate: it was read before
			 * sector_initialized: already written out
			 */
			if (PageDirty(page) || PageWriteback(page) ||
			    bl_is_sector_init(be->be_inval, isect)) {
				print_page(page);
				unlock_page(page);
				page_cache_release(page);
				goto next_page;
			}
			if (!PageUptodate(page)) {
				/* New page: read it in or zero it */
				ret = init_page_for_write(page, cow_read);
				if (unlikely(ret)) {
					dprintk("%s init_page_for_write fail %d\n",
						__func__, ret);
					unlock_page(page);
					page_cache_release(page);
					wdata->pnfs_error = ret;
					goto out;
				}
			}
			set_page_writeback(page);
			unlock_page(page);

			ret = bl_mark_sectors_init(be->be_inval, isect,
						       PAGE_CACHE_SECTORS,
						       NULL);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				end_page_writeback(page);
				page_cache_release(page);
				wdata->pnfs_error = ret;
				goto out;
			}
			bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
						 isect, page, be,
						 bl_end_io_write_zero, par);
			if (IS_ERR(bio)) {
				wdata->pnfs_error = PTR_ERR(bio);
				goto out;
			}
			/* FIXME: This should be done in bi_end_io */
			mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
					     page->index << PAGE_CACHE_SHIFT,
					     PAGE_CACHE_SIZE);
next_page:
			isect += PAGE_CACHE_SECTORS;
			extent_length -= PAGE_CACHE_SECTORS;
		}
		if (last)
			goto write_done;
	}
	bio = bl_submit_bio(WRITE, bio);

	/* Middle pages */
	pg_index = wdata->args.pgbase >> PAGE_CACHE_SHIFT;
	for (i = pg_index; i < wdata->npages; i++) {
		if (!extent_length) {
			/* We've used up the previous extent */
			bl_put_extent(be);
			bio = bl_submit_bio(WRITE, bio);
			/* Get the next one */
			be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg),
					     isect, NULL);
			if (!be || !is_writable(be, isect)) {
				wdata->pnfs_error = -EINVAL;
				goto out;
			}
			extent_length = be->be_length -
			    (isect - be->be_f_offset);
		}
		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
			ret = bl_mark_sectors_init(be->be_inval, isect,
						       PAGE_CACHE_SECTORS,
						       NULL);
			if (unlikely(ret)) {
				dprintk("%s bl_mark_sectors_init fail %d\n",
					__func__, ret);
				wdata->pnfs_error = ret;
				goto out;
			}
		}
		bio = bl_add_page_to_bio(bio, wdata->npages - i, WRITE,
					 isect, pages[i], be,
					 bl_end_io_write, par);
		if (IS_ERR(bio)) {
			wdata->pnfs_error = PTR_ERR(bio);
			goto out;
		}
		isect += PAGE_CACHE_SECTORS;
		last_isect = isect;
		extent_length -= PAGE_CACHE_SECTORS;
	}

	/* Last page inside INVALID extent */
	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		bio = bl_submit_bio(WRITE, bio);
		temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT;
		npg_zero = npg_per_block - do_div(temp, npg_per_block);
		if (npg_zero < npg_per_block) {
			last = 1;
			goto fill_invalid_ext;
		}
	}

write_done:
	wdata->res.count = (last_isect << SECTOR_SHIFT) - (offset);
	if (count < wdata->res.count) {
		wdata->res.count = count;
	}
out:
	bl_put_extent(be);
	bl_submit_bio(WRITE, bio);
	put_parallel(par);
	return PNFS_ATTEMPTED;
}

/* FIXME - range ignored */
static void
release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range)
{
	int i;
	struct pnfs_block_extent *be;

	spin_lock(&bl->bl_ext_lock);
	for (i = 0; i < EXTENT_LISTS; i++) {
		while (!list_empty(&bl->bl_extents[i])) {
			be = list_first_entry(&bl->bl_extents[i],
					      struct pnfs_block_extent,
					      be_node);
			list_del(&be->be_node);
			bl_put_extent(be);
		}
	}
	spin_unlock(&bl->bl_ext_lock);
}

static void
release_inval_marks(struct pnfs_inval_markings *marks)
{
	struct pnfs_inval_tracking *pos, *temp;

	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
		list_del(&pos->it_link);
		kfree(pos);
	}
	return;
}

static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);

	dprintk("%s enter\n", __func__);
	release_extents(bl, NULL);
	release_inval_marks(&bl->bl_inval);
	kfree(bl);
}

static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
						   gfp_t gfp_flags)
{
	struct pnfs_block_layout *bl;

	dprintk("%s enter\n", __func__);
	bl = kzalloc(sizeof(*bl), gfp_flags);
	if (!bl)
		return NULL;
	spin_lock_init(&bl->bl_ext_lock);
	INIT_LIST_HEAD(&bl->bl_extents[0]);
	INIT_LIST_HEAD(&bl->bl_extents[1]);
	INIT_LIST_HEAD(&bl->bl_commit);
	INIT_LIST_HEAD(&bl->bl_committing);
	bl->bl_count = 0;
	bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT;
	BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize);
	return &bl->bl_layout;
}

static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);
	kfree(lseg);
}

/* We pretty much ignore lseg, and store all data layout-wide, so we
 * can correctly merge.
 */
static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo,
						 struct nfs4_layoutget_res *lgr,
						 gfp_t gfp_flags)
{
	struct pnfs_layout_segment *lseg;
	int status;

	dprintk("%s enter\n", __func__);
	lseg = kzalloc(sizeof(*lseg), gfp_flags);
	if (!lseg)
		return ERR_PTR(-ENOMEM);
	status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags);
	if (status) {
		/* We don't want to call the full-blown bl_free_lseg,
		 * since on error extents were not touched.
		 */
		kfree(lseg);
		return ERR_PTR(status);
	}
	return lseg;
}

static void
bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr,
		       const struct nfs4_layoutcommit_args *arg)
{
	dprintk("%s enter\n", __func__);
	encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg);
}

static void
bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
{
	struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout;

	dprintk("%s enter\n", __func__);
	clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status);
}

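/* Release every block device on the per-mount device list, then the
 * mount id itself.
 */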
static void free_blk_mountid(struct block_mount_id *mid)
{
	if (mid) {
		struct pnfs_block_dev *dev;
		spin_lock(&mid->bm_lock);
		while (!list_empty(&mid->bm_devlist)) {
			dev = list_first_entry(&mid->bm_devlist,
					       struct pnfs_block_dev,
					       bm_node);
			list_del(&dev->bm_node);
			bl_free_block_dev(dev);
		}
		spin_unlock(&mid->bm_lock);
		kfree(mid);
	}
}

/* This is mostly copied from the filelayout's get_device_info function.
 * It seems much of this should be at the generic pnfs level.
 */
static struct pnfs_block_dev *
nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh,
			struct nfs4_deviceid *d_id)
{
	struct pnfs_device *dev;
	struct pnfs_block_dev *rv = NULL;
	u32 max_resp_sz;
	int max_pages;
	struct page **pages = NULL;
	int i, rc;

	/*
	 * Use the session max response size as the basis for setting
	 * GETDEVICEINFO's maxcount
	 */
	max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz;
	max_pages = max_resp_sz >> PAGE_SHIFT;
	dprintk("%s max_resp_sz %u max_pages %d\n",
		__func__, max_resp_sz, max_pages);

	dev = kmalloc(sizeof(*dev), GFP_NOFS);
	if (!dev) {
		dprintk("%s kmalloc failed\n", __func__);
		return NULL;
	}

	pages = kzalloc(max_pages * sizeof(struct page *), GFP_NOFS);
	if (pages == NULL) {
		kfree(dev);
		return NULL;
	}
	for (i = 0; i < max_pages; i++) {
		pages[i] = alloc_page(GFP_NOFS);
		if (!pages[i])
			goto out_free;
	}

	memcpy(&dev->dev_id, d_id, sizeof(*d_id));
	dev->layout_type = LAYOUT_BLOCK_VOLUME;
	dev->pages = pages;
	dev->pgbase = 0;
	dev->pglen = PAGE_SIZE * max_pages;
	dev->mincount = 0;

	dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data);
	rc = nfs4_proc_getdeviceinfo(server, dev);
	dprintk("%s getdevice info returns %d\n", __func__, rc);
	if (rc)
		goto out_free;

	rv = nfs4_blk_decode_device(server, dev);
 out_free:
	/* alloc_page() may have failed partway; only free allocated pages */
	for (i = 0; i < max_pages; i++) {
		if (pages[i])
			__free_page(pages[i]);
	}
	kfree(pages);
	kfree(dev);
	return rv;
}

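/* Mount-time setup: fetch the server's device list and the device info
 * for each entry, decoding them into pnfs_block_devs hung off the
 * per-server block_mount_id.
 */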
static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
	struct block_mount_id *b_mt_id = NULL;
	struct pnfs_devicelist *dlist = NULL;
	struct pnfs_block_dev *bdev;
	LIST_HEAD(block_disklist);
	int status = 0, i;

	dprintk("%s enter\n", __func__);

	if (server->pnfs_blksize == 0) {
		dprintk("%s Server did not return blksize\n", __func__);
		return -EINVAL;
	}
	b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS);
	if (!b_mt_id) {
		status = -ENOMEM;
		goto out_error;
	}
	/* Initialize nfs4 block layout mount id */
	spin_lock_init(&b_mt_id->bm_lock);
	INIT_LIST_HEAD(&b_mt_id->bm_devlist);

	dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS);
	if (!dlist) {
		status = -ENOMEM;
		goto out_error;
	}
	dlist->eof = 0;
	while (!dlist->eof) {
		status = nfs4_proc_getdevicelist(server, fh, dlist);
		if (status)
			goto out_error;
		dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n",
			__func__, dlist->num_devs, dlist->eof);
		for (i = 0; i < dlist->num_devs; i++) {
			bdev = nfs4_blk_get_deviceinfo(server, fh,
						       &dlist->dev_id[i]);
			if (!bdev) {
				status = -ENODEV;
				goto out_error;
			}
			spin_lock(&b_mt_id->bm_lock);
			list_add(&bdev->bm_node, &b_mt_id->bm_devlist);
			spin_unlock(&b_mt_id->bm_lock);
		}
	}
	dprintk("%s SUCCESS\n", __func__);
	server->pnfs_ld_data = b_mt_id;

 out_return:
	kfree(dlist);
	return status;

 out_error:
	free_blk_mountid(b_mt_id);
	goto out_return;
}

static int
bl_clear_layoutdriver(struct nfs_server *server)
{
	struct block_mount_id *b_mt_id = server->pnfs_ld_data;

	dprintk("%s enter\n", __func__);
	free_blk_mountid(b_mt_id);
	dprintk("%s RETURNS\n", __func__);
	return 0;
}

static const struct nfs_pageio_ops bl_pg_read_ops = {
	.pg_init = pnfs_generic_pg_init_read,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_readpages,
};

static const struct nfs_pageio_ops bl_pg_write_ops = {
	.pg_init = pnfs_generic_pg_init_write,
	.pg_test = pnfs_generic_pg_test,
	.pg_doio = pnfs_generic_pg_writepages,
};

static struct pnfs_layoutdriver_type blocklayout_type = {
	.id				= LAYOUT_BLOCK_VOLUME,
	.name				= "LAYOUT_BLOCK_VOLUME",
	.read_pagelist			= bl_read_pagelist,
	.write_pagelist			= bl_write_pagelist,
	.alloc_layout_hdr		= bl_alloc_layout_hdr,
	.free_layout_hdr		= bl_free_layout_hdr,
	.alloc_lseg			= bl_alloc_lseg,
	.free_lseg			= bl_free_lseg,
	.encode_layoutcommit		= bl_encode_layoutcommit,
	.cleanup_layoutcommit		= bl_cleanup_layoutcommit,
	.set_layoutdriver		= bl_set_layoutdriver,
	.clear_layoutdriver		= bl_clear_layoutdriver,
	.pg_read_ops			= &bl_pg_read_ops,
	.pg_write_ops			= &bl_pg_write_ops,
};

static const struct rpc_pipe_ops bl_upcall_ops = {
	.upcall		= bl_pipe_upcall,
	.downcall	= bl_pipe_downcall,
	.destroy_msg	= bl_pipe_destroy_msg,
};

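/* Module init: register the layout driver and create the "blocklayout"
 * rpc_pipefs pipe used for device upcalls to userspace.
 */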
static int __init nfs4blocklayout_init(void)
{
	struct vfsmount *mnt;
	struct path path;
	int ret;

	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);

	ret = pnfs_register_layoutdriver(&blocklayout_type);
	if (ret)
		goto out;

	init_waitqueue_head(&bl_wq);

	mnt = rpc_get_mount();
	if (IS_ERR(mnt)) {
		ret = PTR_ERR(mnt);
		goto out_remove;
	}

	ret = vfs_path_lookup(mnt->mnt_root,
			      mnt,
			      NFS_PIPE_DIRNAME, 0, &path);
	if (ret)
		goto out_remove;

	bl_device_pipe = rpc_mkpipe(path.dentry, "blocklayout", NULL,
				    &bl_upcall_ops, 0);
	if (IS_ERR(bl_device_pipe)) {
		ret = PTR_ERR(bl_device_pipe);
		goto out_remove;
	}
out:
	return ret;

out_remove:
	pnfs_unregister_layoutdriver(&blocklayout_type);
	return ret;
}

static void __exit nfs4blocklayout_exit(void)
{
	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
	       __func__);

	pnfs_unregister_layoutdriver(&blocklayout_type);
	rpc_unlink(bl_device_pipe);
}

MODULE_ALIAS("nfs-layouttype4-3");

module_init(nfs4blocklayout_init);
module_exit(nfs4blocklayout_exit);