/******************************************************************************
 *
 * Back-end of the driver for virtual block devices. This portion of the
 * driver exports a 'unified' block-device interface that can be accessed
 * by any operating system that implements a compatible front end. A
 * reference front-end implementation can be found in:
 *  drivers/block/xen-blkfront.c
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Copyright (c) 2005, Christopher Clark
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/spinlock.h>
#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/delay.h>
#include <linux/freezer.h>
#include <linux/loop.h>
#include <linux/falloc.h>
#include <linux/fs.h>

#include <xen/events.h>
#include <xen/page.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
#include "common.h"

/*
 * These are rather arbitrary. They are fairly large because adjacent requests
 * pulled from a communication ring are quite likely to end up being part of
 * the same scatter/gather request at the disc.
 *
 * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
 *
 * This will increase the chances of being able to write whole tracks.
 * 64 should be enough to keep us competitive with Linux.
 */
static int xen_blkif_reqs = 64;
module_param_named(reqs, xen_blkif_reqs, int, 0);
MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

/* Run-time switchable: /sys/module/blkback/parameters/ */
static unsigned int log_stats;
module_param(log_stats, int, 0644);

/*
 * Each outstanding request that we've passed to the lower device layers has a
 * 'pending_req' allocated to it. Each buffer_head that completes decrements
 * the pendcnt towards zero. When it hits zero, the specified domain has a
 * response queued for it, with the saved 'id' passed back.
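 *
 * Idle 'pending_req' structures sit on blkbk->pending_free, linked via
 * their 'free_list' member; alloc_req() hands them out and free_req()
 * returns them, waking the kthread that waits on pending_free_wq.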
 */
struct pending_req {
	struct xen_blkif	*blkif;
	u64			id;
	int			nr_pages;
	atomic_t		pendcnt;
	unsigned short		operation;
	int			status;
	struct list_head	free_list;
};

#define BLKBACK_INVALID_HANDLE (~0)

struct xen_blkbk {
	struct pending_req	*pending_reqs;
	/* List of all 'pending_req' available */
	struct list_head	pending_free;
	/* And its spinlock. */
	spinlock_t		pending_free_lock;
	wait_queue_head_t	pending_free_wq;
	/* The list of all pages that are available. */
	struct page		**pending_pages;
	/* And the grant handles that are available. */
	grant_handle_t		*pending_grant_handles;
};

static struct xen_blkbk *blkbk;

/*
 * Little helpful macro to figure out the index and virtual address of the
 * pending_pages[..]. For each 'pending_req' we have up to
 * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
 * 10 and would index in the pending_pages[..].
 */
static inline int vaddr_pagenr(struct pending_req *req, int seg)
{
	return (req - blkbk->pending_reqs) *
		BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
}

#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]

static inline unsigned long vaddr(struct pending_req *req, int seg)
{
	unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
	return (unsigned long)pfn_to_kaddr(pfn);
}

#define pending_handle(_req, _seg) \
	(blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])


static int do_block_io_op(struct xen_blkif *blkif);
static int dispatch_rw_block_io(struct xen_blkif *blkif,
				struct blkif_request *req,
				struct pending_req *pending_req);
static void make_response(struct xen_blkif *blkif, u64 id,
			  unsigned short op, int st);

/*
 * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
 */
static struct pending_req *alloc_req(void)
{
	struct pending_req *req = NULL;
	unsigned long flags;

	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
	if (!list_empty(&blkbk->pending_free)) {
		req = list_entry(blkbk->pending_free.next, struct pending_req,
				 free_list);
		list_del(&req->free_list);
	}
	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
	return req;
}

/*
 * Return the 'pending_req' structure back to the freepool. We also
 * wake up the thread if it was waiting for a free page.
 */
static void free_req(struct pending_req *req)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
	was_empty = list_empty(&blkbk->pending_free);
	list_add(&req->free_list, &blkbk->pending_free);
	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
	if (was_empty)
		wake_up(&blkbk->pending_free_wq);
}

/*
 * Routines for managing virtual block devices (vbds).
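 *
 * xen_vbd_translate() checks the vbd's read-only flag and size bounds
 * before filling in the physical device and bdev for a request.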
 */
static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
			     int operation)
{
	struct xen_vbd *vbd = &blkif->vbd;
	int rc = -EACCES;

	if ((operation != READ) && vbd->readonly)
		goto out;

	if (likely(req->nr_sects)) {
		blkif_sector_t end = req->sector_number + req->nr_sects;

		if (unlikely(end < req->sector_number))
			goto out;
		if (unlikely(end > vbd_sz(vbd)))
			goto out;
	}

	req->dev  = vbd->pdevice;
	req->bdev = vbd->bdev;
	rc = 0;

 out:
	return rc;
}

static void xen_vbd_resize(struct xen_blkif *blkif)
{
	struct xen_vbd *vbd = &blkif->vbd;
	struct xenbus_transaction xbt;
	int err;
	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
	unsigned long long new_size = vbd_sz(vbd);

	pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n",
		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
	pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size);
	vbd->size = new_size;
again:
	err = xenbus_transaction_start(&xbt);
	if (err) {
		pr_warn(DRV_PFX "Error starting transaction");
		return;
	}
	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
			    (unsigned long long)vbd_sz(vbd));
	if (err) {
		pr_warn(DRV_PFX "Error writing new size");
		goto abort;
	}
	/*
	 * Write the current state; we will use this to synchronize
	 * the front-end. If the current state is "connected" the
	 * front-end will get the new size information online.
	 */
	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
	if (err) {
		pr_warn(DRV_PFX "Error writing the state");
		goto abort;
	}

	err = xenbus_transaction_end(xbt, 0);
	if (err == -EAGAIN)
		goto again;
	if (err)
		pr_warn(DRV_PFX "Error ending transaction");
	return;
abort:
	xenbus_transaction_end(xbt, 1);
}

/*
 * Notification from the guest OS.
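 *
 * The event-channel interrupt only flags that requests are waiting and
 * wakes the per-device kthread; the ring itself is drained from
 * xen_blkif_schedule().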
 */
static void blkif_notify_work(struct xen_blkif *blkif)
{
	blkif->waiting_reqs = 1;
	wake_up(&blkif->wq);
}

irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
{
	blkif_notify_work(dev_id);
	return IRQ_HANDLED;
}

/*
 * SCHEDULER FUNCTIONS
 */

static void print_stats(struct xen_blkif *blkif)
{
	pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d"
		 " | ds %4d\n",
		 current->comm, blkif->st_oo_req,
		 blkif->st_rd_req, blkif->st_wr_req,
		 blkif->st_f_req, blkif->st_ds_req);
	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
	blkif->st_rd_req = 0;
	blkif->st_wr_req = 0;
	blkif->st_oo_req = 0;
	blkif->st_ds_req = 0;
}

int xen_blkif_schedule(void *arg)
{
	struct xen_blkif *blkif = arg;
	struct xen_vbd *vbd = &blkif->vbd;

	xen_blkif_get(blkif);

	while (!kthread_should_stop()) {
		if (try_to_freeze())
			continue;
		if (unlikely(vbd->size != vbd_sz(vbd)))
			xen_vbd_resize(blkif);

		wait_event_interruptible(
			blkif->wq,
			blkif->waiting_reqs || kthread_should_stop());
		wait_event_interruptible(
			blkbk->pending_free_wq,
			!list_empty(&blkbk->pending_free) ||
			kthread_should_stop());

		blkif->waiting_reqs = 0;
		smp_mb(); /* clear flag *before* checking for work */

		if (do_block_io_op(blkif))
			blkif->waiting_reqs = 1;

		if (log_stats && time_after(jiffies, blkif->st_print))
			print_stats(blkif);
	}

	if (log_stats)
		print_stats(blkif);

	blkif->xenblkd = NULL;
	xen_blkif_put(blkif);

	return 0;
}

struct seg_buf {
	unsigned long buf;
	unsigned int nsec;
};
/*
 * Unmap the grant references, and also remove the M2P over-rides
 * used in the 'pending_req'.
 */
static void xen_blkbk_unmap(struct pending_req *req)
{
	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int i, invcount = 0;
	grant_handle_t handle;
	int ret;

	for (i = 0; i < req->nr_pages; i++) {
		handle = pending_handle(req, i);
		if (handle == BLKBACK_INVALID_HANDLE)
			continue;
		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
				    GNTMAP_host_map, handle);
		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
		invcount++;
	}

	ret = HYPERVISOR_grant_table_op(
		GNTTABOP_unmap_grant_ref, unmap, invcount);
	BUG_ON(ret);
	/*
	 * Note, we use invcount, not nr_pages, so we can't index
	 * using vaddr(req, i).
	 */
	for (i = 0; i < invcount; i++) {
		ret = m2p_remove_override(
			virt_to_page(unmap[i].host_addr), false);
		if (ret) {
			pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
				 (unsigned long)unmap[i].host_addr);
			continue;
		}
	}
}

static int xen_blkbk_map(struct blkif_request *req,
			 struct pending_req *pending_req,
			 struct seg_buf seg[])
{
	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int i;
	int nseg = req->nr_segments;
	int ret = 0;

	/*
	 * Fill out preq.nr_sects with proper amount of sectors, and set up
	 * map[..] with the PFN of the page in our domain with the
	 * corresponding grant reference for each page.
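	 * A segment whose grant fails to map is recorded with
	 * BLKBACK_INVALID_HANDLE so that xen_blkbk_unmap() will skip it.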
	 */
	for (i = 0; i < nseg; i++) {
		uint32_t flags;

		flags = GNTMAP_host_map;
		if (pending_req->operation != BLKIF_OP_READ)
			flags |= GNTMAP_readonly;
		gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags,
				  req->u.rw.seg[i].gref,
				  pending_req->blkif->domid);
	}

	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
	BUG_ON(ret);

	/*
	 * Now swizzle the MFN in our domain with the MFN from the other domain
	 * so that when we access vaddr(pending_req,i) it has the contents of
	 * the page from the other domain.
	 */
	for (i = 0; i < nseg; i++) {
		if (unlikely(map[i].status != 0)) {
			pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
			map[i].handle = BLKBACK_INVALID_HANDLE;
			ret |= 1;
		}

		pending_handle(pending_req, i) = map[i].handle;

		if (ret)
			continue;

		ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
			blkbk->pending_page(pending_req, i), NULL);
		if (ret) {
			pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
				 (unsigned long)map[i].dev_bus_addr, ret);
			/* We could switch over to GNTTABOP_copy */
			continue;
		}

		seg[i].buf = map[i].dev_bus_addr |
			(req->u.rw.seg[i].first_sect << 9);
	}
	return ret;
}

static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req)
{
	int err = 0;
	int status = BLKIF_RSP_OKAY;
	struct block_device *bdev = blkif->vbd.bdev;

	if (blkif->blk_backend_type == BLKIF_BACKEND_PHY)
		/* just forward the discard request */
		err = blkdev_issue_discard(bdev,
				req->u.discard.sector_number,
				req->u.discard.nr_sectors,
				GFP_KERNEL, 0);
	else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) {
		/* punch a hole in the backing file */
		struct loop_device *lo = bdev->bd_disk->private_data;
		struct file *file = lo->lo_backing_file;

		if (file->f_op->fallocate)
			err = file->f_op->fallocate(file,
				FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE,
				req->u.discard.sector_number << 9,
				req->u.discard.nr_sectors << 9);
		else
			err = -EOPNOTSUPP;
	} else
		err = -EOPNOTSUPP;

	if (err == -EOPNOTSUPP) {
		pr_debug(DRV_PFX "discard op failed, not supported\n");
		status = BLKIF_RSP_EOPNOTSUPP;
	} else if (err)
		status = BLKIF_RSP_ERROR;

	make_response(blkif, req->id, req->operation, status);
}

static void xen_blk_drain_io(struct xen_blkif *blkif)
{
	atomic_set(&blkif->drain, 1);
	do {
		/* The initial value is one, and one refcnt taken at the
		 * start of the xen_blkif_schedule thread. */
		if (atomic_read(&blkif->refcnt) <= 2)
			break;
		wait_for_completion_interruptible_timeout(
				&blkif->drain_complete, HZ);

		if (!atomic_read(&blkif->drain))
			break;
	} while (!kthread_should_stop());
	atomic_set(&blkif->drain, 0);
}

/*
 * Completion callback on the bio's. Called as bio->bi_end_io().
 */

static void __end_block_io_op(struct pending_req *pending_req, int error)
{
	/* An error fails the entire request.
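	 * -EOPNOTSUPP on a flush or write barrier is reported as
	 * BLKIF_RSP_EOPNOTSUPP so the frontend can distinguish
	 * "unsupported" from a real I/O error.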
	 */
	if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) &&
	    (error == -EOPNOTSUPP)) {
		pr_debug(DRV_PFX "flush diskcache op failed, not supported\n");
		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
	} else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) &&
		   (error == -EOPNOTSUPP)) {
		pr_debug(DRV_PFX "write barrier op failed, not supported\n");
		xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0);
		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
	} else if (error) {
		pr_debug(DRV_PFX "Buffer not up-to-date at end of operation,"
			 " error=%d\n", error);
		pending_req->status = BLKIF_RSP_ERROR;
	}

	/*
	 * If all of the bio's have completed it is time to unmap
	 * the grant references associated with 'request' and provide
	 * the proper response on the ring.
	 */
	if (atomic_dec_and_test(&pending_req->pendcnt)) {
		xen_blkbk_unmap(pending_req);
		make_response(pending_req->blkif, pending_req->id,
			      pending_req->operation, pending_req->status);
		xen_blkif_put(pending_req->blkif);
		if (atomic_read(&pending_req->blkif->refcnt) <= 2) {
			if (atomic_read(&pending_req->blkif->drain))
				complete(&pending_req->blkif->drain_complete);
		}
		free_req(pending_req);
	}
}

/*
 * bio callback.
 */
static void end_block_io_op(struct bio *bio, int error)
{
	__end_block_io_op(bio->bi_private, error);
	bio_put(bio);
}



/*
 * Function to copy a 'struct blkif_request' (which has the sectors we
 * want, the number of them, grant references, etc) from the ring buffer,
 * and transmute it to the block API to hand it over to the proper block disk.
 */
static int
__do_block_io_op(struct xen_blkif *blkif)
{
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	struct blkif_request req;
	struct pending_req *pending_req;
	RING_IDX rc, rp;
	int more_to_do = 0;

	rc = blk_rings->common.req_cons;
	rp = blk_rings->common.sring->req_prod;
	rmb(); /* Ensure we see queued requests up to 'rp'. */

	while (rc != rp) {

		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
			break;

		if (kthread_should_stop()) {
			more_to_do = 1;
			break;
		}

		pending_req = alloc_req();
		if (NULL == pending_req) {
			blkif->st_oo_req++;
			more_to_do = 1;
			break;
		}

		switch (blkif->blk_protocol) {
		case BLKIF_PROTOCOL_NATIVE:
			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
			break;
		case BLKIF_PROTOCOL_X86_32:
			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
			break;
		case BLKIF_PROTOCOL_X86_64:
			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
			break;
		default:
			BUG();
		}
		blk_rings->common.req_cons = ++rc; /* before make_response() */

		/* Apply all sanity checks to /private copy/ of request. */
		barrier();

		if (dispatch_rw_block_io(blkif, &req, pending_req))
			break;

		/* Yield point for this unbounded loop.
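		 * cond_resched() keeps this kthread from monopolizing the
		 * CPU when a guest keeps the ring full.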
		 */
		cond_resched();
	}

	return more_to_do;
}

static int
do_block_io_op(struct xen_blkif *blkif)
{
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	int more_to_do;

	do {
		more_to_do = __do_block_io_op(blkif);
		if (more_to_do)
			break;

		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
	} while (more_to_do);

	return more_to_do;
}
/*
 * Transmute the 'struct blkif_request' into a proper 'struct bio'
 * and call submit_bio() to pass it to the underlying storage.
 */
static int dispatch_rw_block_io(struct xen_blkif *blkif,
				struct blkif_request *req,
				struct pending_req *pending_req)
{
	struct phys_req preq;
	struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	unsigned int nseg;
	struct bio *bio = NULL;
	struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
	int i, nbio = 0;
	int operation;
	struct blk_plug plug;
	bool drain = false;

	switch (req->operation) {
	case BLKIF_OP_READ:
		blkif->st_rd_req++;
		operation = READ;
		break;
	case BLKIF_OP_WRITE:
		blkif->st_wr_req++;
		operation = WRITE_ODIRECT;
		break;
	case BLKIF_OP_WRITE_BARRIER:
		drain = true;
		/* fall through */
	case BLKIF_OP_FLUSH_DISKCACHE:
		blkif->st_f_req++;
		operation = WRITE_FLUSH;
		break;
	case BLKIF_OP_DISCARD:
		blkif->st_ds_req++;
		operation = REQ_DISCARD;
		break;
	default:
		operation = 0; /* make gcc happy */
		goto fail_response;
		break;
	}

	/* Check that the number of segments is sane. */
	nseg = req->nr_segments;
	if (unlikely(nseg == 0 && operation != WRITE_FLUSH &&
		     operation != REQ_DISCARD) ||
	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
			 nseg);
		/* Haven't submitted any bio's yet. */
		goto fail_response;
	}

	preq.dev           = req->handle;
	preq.sector_number = req->u.rw.sector_number;
	preq.nr_sects      = 0;

	pending_req->blkif     = blkif;
	pending_req->id        = req->id;
	pending_req->operation = req->operation;
	pending_req->status    = BLKIF_RSP_OKAY;
	pending_req->nr_pages  = nseg;

	for (i = 0; i < nseg; i++) {
		seg[i].nsec = req->u.rw.seg[i].last_sect -
			req->u.rw.seg[i].first_sect + 1;
		if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
			goto fail_response;
		preq.nr_sects += seg[i].nsec;

	}

	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
		pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n",
			 operation == READ ? "read" : "write",
			 preq.sector_number,
			 preq.sector_number + preq.nr_sects, preq.dev);
		goto fail_response;
	}

	/*
	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
	 * is set there.
	 */
	for (i = 0; i < nseg; i++) {
		if (((int)preq.sector_number|(int)seg[i].nsec) &
		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
			pr_debug(DRV_PFX "Misaligned I/O request from domain %d",
				 blkif->domid);
			goto fail_response;
		}
	}

	/* Wait on all outstanding I/O's and once that has been completed
	 * issue the WRITE_FLUSH.
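	 * (drain is only set for BLKIF_OP_WRITE_BARRIER, which is emulated
	 * here as a full drain followed by a WRITE_FLUSH.)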
	 */
	if (drain)
		xen_blk_drain_io(pending_req->blkif);

	/*
	 * If we have failed at this point, we need to undo the M2P override,
	 * set gnttab_set_unmap_op on all of the grant references and perform
	 * the hypercall to unmap the grants - that is all done in
	 * xen_blkbk_unmap.
	 */
	if (operation != REQ_DISCARD && xen_blkbk_map(req, pending_req, seg))
		goto fail_flush;

	/*
	 * This corresponding xen_blkif_put is done in __end_block_io_op, or
	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
	 */
	xen_blkif_get(blkif);

	for (i = 0; i < nseg; i++) {
		while ((bio == NULL) ||
		       (bio_add_page(bio,
				     blkbk->pending_page(pending_req, i),
				     seg[i].nsec << 9,
				     seg[i].buf & ~PAGE_MASK) == 0)) {

			bio = bio_alloc(GFP_KERNEL, nseg-i);
			if (unlikely(bio == NULL))
				goto fail_put_bio;

			biolist[nbio++] = bio;
			bio->bi_bdev    = preq.bdev;
			bio->bi_private = pending_req;
			bio->bi_end_io  = end_block_io_op;
			bio->bi_sector  = preq.sector_number;
		}

		preq.sector_number += seg[i].nsec;
	}

	/* This will be hit if the operation was a flush or discard. */
	if (!bio) {
		BUG_ON(operation != WRITE_FLUSH && operation != REQ_DISCARD);

		if (operation == WRITE_FLUSH) {
			bio = bio_alloc(GFP_KERNEL, 0);
			if (unlikely(bio == NULL))
				goto fail_put_bio;

			biolist[nbio++] = bio;
			bio->bi_bdev    = preq.bdev;
			bio->bi_private = pending_req;
			bio->bi_end_io  = end_block_io_op;
		} else if (operation == REQ_DISCARD) {
			xen_blk_discard(blkif, req);
			xen_blkif_put(blkif);
			free_req(pending_req);
			return 0;
		}
	}

	/*
	 * Set the pending count up front so that the last submit_bio does
	 * not have to call atomic_inc.
	 */
	atomic_set(&pending_req->pendcnt, nbio);

	/* Get a reference count for the disk queue and start sending I/O */
	blk_start_plug(&plug);

	for (i = 0; i < nbio; i++)
		submit_bio(operation, biolist[i]);

	/* Let the I/Os go.. */
	blk_finish_plug(&plug);

	if (operation == READ)
		blkif->st_rd_sect += preq.nr_sects;
	else if (operation & WRITE)
		blkif->st_wr_sect += preq.nr_sects;

	return 0;

 fail_flush:
	xen_blkbk_unmap(pending_req);
 fail_response:
	/* Haven't submitted any bio's yet. */
	make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
	free_req(pending_req);
	msleep(1); /* back off a bit */
	return -EIO;

 fail_put_bio:
	for (i = 0; i < nbio; i++)
		bio_put(biolist[i]);
	__end_block_io_op(pending_req, -EINVAL);
	msleep(1); /* back off a bit */
	return -EIO;
}



/*
 * Put a response on the ring on how the operation fared.
 */
static void make_response(struct xen_blkif *blkif, u64 id,
			  unsigned short op, int st)
{
	struct blkif_response resp;
	unsigned long flags;
	union blkif_back_rings *blk_rings = &blkif->blk_rings;
	int notify;

	resp.id        = id;
	resp.operation = op;
	resp.status    = st;

	spin_lock_irqsave(&blkif->blk_ring_lock, flags);
	/* Place on the response ring for the relevant domain.
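	 * The response layout depends on the negotiated protocol (native,
	 * x86_32 or x86_64 ABI), hence the per-protocol ring views below.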
	 */
	switch (blkif->blk_protocol) {
	case BLKIF_PROTOCOL_NATIVE:
		memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_32:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	case BLKIF_PROTOCOL_X86_64:
		memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt),
		       &resp, sizeof(resp));
		break;
	default:
		BUG();
	}
	blk_rings->common.rsp_prod_pvt++;
	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
	spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);
	if (notify)
		notify_remote_via_irq(blkif->irq);
}

static int __init xen_blkif_init(void)
{
	int i, mmap_pages;
	int rc = 0;

	if (!xen_pv_domain())
		return -ENODEV;

	blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
	if (!blkbk) {
		pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
		return -ENOMEM;
	}

	mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;

	blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) *
				      xen_blkif_reqs, GFP_KERNEL);
	blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
					       mmap_pages, GFP_KERNEL);
	blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) *
				       mmap_pages, GFP_KERNEL);

	if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
	    !blkbk->pending_pages) {
		rc = -ENOMEM;
		goto out_of_memory;
	}

	for (i = 0; i < mmap_pages; i++) {
		blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
		blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
		if (blkbk->pending_pages[i] == NULL) {
			rc = -ENOMEM;
			goto out_of_memory;
		}
	}
	rc = xen_blkif_interface_init();
	if (rc)
		goto failed_init;

	INIT_LIST_HEAD(&blkbk->pending_free);
	spin_lock_init(&blkbk->pending_free_lock);
	init_waitqueue_head(&blkbk->pending_free_wq);

	for (i = 0; i < xen_blkif_reqs; i++)
		list_add_tail(&blkbk->pending_reqs[i].free_list,
			      &blkbk->pending_free);

	rc = xen_blkif_xenbus_init();
	if (rc)
		goto failed_init;

	return 0;

 out_of_memory:
	pr_alert(DRV_PFX "%s: out of memory\n", __func__);
 failed_init:
	kfree(blkbk->pending_reqs);
	kfree(blkbk->pending_grant_handles);
	if (blkbk->pending_pages) {
		for (i = 0; i < mmap_pages; i++) {
			if (blkbk->pending_pages[i])
				__free_page(blkbk->pending_pages[i]);
		}
		kfree(blkbk->pending_pages);
	}
	kfree(blkbk);
	blkbk = NULL;
	return rc;
}

module_init(xen_blkif_init);

MODULE_LICENSE("Dual BSD/GPL");
MODULE_ALIAS("xen-backend:vbd");