1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Simple file system for zoned block devices exposing zones as files. 4 * 5 * Copyright (C) 2022 Western Digital Corporation or its affiliates. 6 */ 7 #include <linux/module.h> 8 #include <linux/pagemap.h> 9 #include <linux/iomap.h> 10 #include <linux/init.h> 11 #include <linux/slab.h> 12 #include <linux/blkdev.h> 13 #include <linux/statfs.h> 14 #include <linux/writeback.h> 15 #include <linux/quotaops.h> 16 #include <linux/seq_file.h> 17 #include <linux/parser.h> 18 #include <linux/uio.h> 19 #include <linux/mman.h> 20 #include <linux/sched/mm.h> 21 #include <linux/task_io_accounting_ops.h> 22 23 #include "zonefs.h" 24 25 #include "trace.h" 26 27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, 28 loff_t length, unsigned int flags, 29 struct iomap *iomap, struct iomap *srcmap) 30 { 31 struct zonefs_inode_info *zi = ZONEFS_I(inode); 32 struct zonefs_zone *z = zonefs_inode_zone(inode); 33 struct super_block *sb = inode->i_sb; 34 loff_t isize; 35 36 /* 37 * All blocks are always mapped below EOF. If reading past EOF, 38 * act as if there is a hole up to the file maximum size. 39 */ 40 mutex_lock(&zi->i_truncate_mutex); 41 iomap->bdev = inode->i_sb->s_bdev; 42 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 43 isize = i_size_read(inode); 44 if (iomap->offset >= isize) { 45 iomap->type = IOMAP_HOLE; 46 iomap->addr = IOMAP_NULL_ADDR; 47 iomap->length = length; 48 } else { 49 iomap->type = IOMAP_MAPPED; 50 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 51 iomap->length = isize - iomap->offset; 52 } 53 mutex_unlock(&zi->i_truncate_mutex); 54 55 trace_zonefs_iomap_begin(inode, iomap); 56 57 return 0; 58 } 59 60 static const struct iomap_ops zonefs_read_iomap_ops = { 61 .iomap_begin = zonefs_read_iomap_begin, 62 }; 63 64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, 65 loff_t length, unsigned int flags, 66 struct iomap *iomap, struct iomap *srcmap) 67 { 68 struct zonefs_inode_info *zi = ZONEFS_I(inode); 69 struct zonefs_zone *z = zonefs_inode_zone(inode); 70 struct super_block *sb = inode->i_sb; 71 loff_t isize; 72 73 /* All write I/Os should always be within the file maximum size */ 74 if (WARN_ON_ONCE(offset + length > z->z_capacity)) 75 return -EIO; 76 77 /* 78 * Sequential zones can only accept direct writes. This is already 79 * checked when writes are issued, so warn if we see a page writeback 80 * operation. 81 */ 82 if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) 83 return -EIO; 84 85 /* 86 * For conventional zones, all blocks are always mapped. For sequential 87 * zones, all blocks after always mapped below the inode size (zone 88 * write pointer) and unwriten beyond. 89 */ 90 mutex_lock(&zi->i_truncate_mutex); 91 iomap->bdev = inode->i_sb->s_bdev; 92 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 93 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 94 isize = i_size_read(inode); 95 if (iomap->offset >= isize) { 96 iomap->type = IOMAP_UNWRITTEN; 97 iomap->length = z->z_capacity - iomap->offset; 98 } else { 99 iomap->type = IOMAP_MAPPED; 100 iomap->length = isize - iomap->offset; 101 } 102 mutex_unlock(&zi->i_truncate_mutex); 103 104 trace_zonefs_iomap_begin(inode, iomap); 105 106 return 0; 107 } 108 109 static const struct iomap_ops zonefs_write_iomap_ops = { 110 .iomap_begin = zonefs_write_iomap_begin, 111 }; 112 113 static int zonefs_read_folio(struct file *unused, struct folio *folio) 114 { 115 return iomap_read_folio(folio, &zonefs_read_iomap_ops); 116 } 117 118 static void zonefs_readahead(struct readahead_control *rac) 119 { 120 iomap_readahead(rac, &zonefs_read_iomap_ops); 121 } 122 123 /* 124 * Map blocks for page writeback. This is used only on conventional zone files, 125 * which implies that the page range can only be within the fixed inode size. 126 */ 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, 128 struct inode *inode, loff_t offset) 129 { 130 struct zonefs_zone *z = zonefs_inode_zone(inode); 131 132 if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) 133 return -EIO; 134 if (WARN_ON_ONCE(offset >= i_size_read(inode))) 135 return -EIO; 136 137 /* If the mapping is already OK, nothing needs to be done */ 138 if (offset >= wpc->iomap.offset && 139 offset < wpc->iomap.offset + wpc->iomap.length) 140 return 0; 141 142 return zonefs_write_iomap_begin(inode, offset, 143 z->z_capacity - offset, 144 IOMAP_WRITE, &wpc->iomap, NULL); 145 } 146 147 static const struct iomap_writeback_ops zonefs_writeback_ops = { 148 .map_blocks = zonefs_write_map_blocks, 149 }; 150 151 static int zonefs_writepages(struct address_space *mapping, 152 struct writeback_control *wbc) 153 { 154 struct iomap_writepage_ctx wpc = { }; 155 156 return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); 157 } 158 159 static int zonefs_swap_activate(struct swap_info_struct *sis, 160 struct file *swap_file, sector_t *span) 161 { 162 struct inode *inode = file_inode(swap_file); 163 164 if (zonefs_inode_is_seq(inode)) { 165 zonefs_err(inode->i_sb, 166 "swap file: not a conventional zone file\n"); 167 return -EINVAL; 168 } 169 170 return iomap_swapfile_activate(sis, swap_file, span, 171 &zonefs_read_iomap_ops); 172 } 173 174 const struct address_space_operations zonefs_file_aops = { 175 .read_folio = zonefs_read_folio, 176 .readahead = zonefs_readahead, 177 .writepages = zonefs_writepages, 178 .dirty_folio = filemap_dirty_folio, 179 .release_folio = iomap_release_folio, 180 .invalidate_folio = iomap_invalidate_folio, 181 .migrate_folio = filemap_migrate_folio, 182 .is_partially_uptodate = iomap_is_partially_uptodate, 183 .error_remove_page = generic_error_remove_page, 184 .direct_IO = noop_direct_IO, 185 .swap_activate = zonefs_swap_activate, 186 }; 187 188 int zonefs_file_truncate(struct inode *inode, loff_t isize) 189 { 190 struct zonefs_inode_info *zi = ZONEFS_I(inode); 191 struct zonefs_zone *z = zonefs_inode_zone(inode); 192 loff_t old_isize; 193 enum req_op op; 194 int ret = 0; 195 196 /* 197 * Only sequential zone files can be truncated and truncation is allowed 198 * only down to a 0 size, which is equivalent to a zone reset, and to 199 * the maximum file size, which is equivalent to a zone finish. 200 */ 201 if (!zonefs_zone_is_seq(z)) 202 return -EPERM; 203 204 if (!isize) 205 op = REQ_OP_ZONE_RESET; 206 else if (isize == z->z_capacity) 207 op = REQ_OP_ZONE_FINISH; 208 else 209 return -EPERM; 210 211 inode_dio_wait(inode); 212 213 /* Serialize against page faults */ 214 filemap_invalidate_lock(inode->i_mapping); 215 216 /* Serialize against zonefs_iomap_begin() */ 217 mutex_lock(&zi->i_truncate_mutex); 218 219 old_isize = i_size_read(inode); 220 if (isize == old_isize) 221 goto unlock; 222 223 ret = zonefs_inode_zone_mgmt(inode, op); 224 if (ret) 225 goto unlock; 226 227 /* 228 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, 229 * take care of open zones. 230 */ 231 if (z->z_flags & ZONEFS_ZONE_OPEN) { 232 /* 233 * Truncating a zone to EMPTY or FULL is the equivalent of 234 * closing the zone. For a truncation to 0, we need to 235 * re-open the zone to ensure new writes can be processed. 236 * For a truncation to the maximum file size, the zone is 237 * closed and writes cannot be accepted anymore, so clear 238 * the open flag. 239 */ 240 if (!isize) 241 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 242 else 243 z->z_flags &= ~ZONEFS_ZONE_OPEN; 244 } 245 246 zonefs_update_stats(inode, isize); 247 truncate_setsize(inode, isize); 248 z->z_wpoffset = isize; 249 zonefs_inode_account_active(inode); 250 251 unlock: 252 mutex_unlock(&zi->i_truncate_mutex); 253 filemap_invalidate_unlock(inode->i_mapping); 254 255 return ret; 256 } 257 258 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, 259 int datasync) 260 { 261 struct inode *inode = file_inode(file); 262 int ret = 0; 263 264 if (unlikely(IS_IMMUTABLE(inode))) 265 return -EPERM; 266 267 /* 268 * Since only direct writes are allowed in sequential files, page cache 269 * flush is needed only for conventional zone files. 270 */ 271 if (zonefs_inode_is_cnv(inode)) 272 ret = file_write_and_wait_range(file, start, end); 273 if (!ret) 274 ret = blkdev_issue_flush(inode->i_sb->s_bdev); 275 276 if (ret) 277 zonefs_io_error(inode, true); 278 279 return ret; 280 } 281 282 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) 283 { 284 struct inode *inode = file_inode(vmf->vma->vm_file); 285 vm_fault_t ret; 286 287 if (unlikely(IS_IMMUTABLE(inode))) 288 return VM_FAULT_SIGBUS; 289 290 /* 291 * Sanity check: only conventional zone files can have shared 292 * writeable mappings. 293 */ 294 if (zonefs_inode_is_seq(inode)) 295 return VM_FAULT_NOPAGE; 296 297 sb_start_pagefault(inode->i_sb); 298 file_update_time(vmf->vma->vm_file); 299 300 /* Serialize against truncates */ 301 filemap_invalidate_lock_shared(inode->i_mapping); 302 ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); 303 filemap_invalidate_unlock_shared(inode->i_mapping); 304 305 sb_end_pagefault(inode->i_sb); 306 return ret; 307 } 308 309 static const struct vm_operations_struct zonefs_file_vm_ops = { 310 .fault = filemap_fault, 311 .map_pages = filemap_map_pages, 312 .page_mkwrite = zonefs_filemap_page_mkwrite, 313 }; 314 315 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) 316 { 317 /* 318 * Conventional zones accept random writes, so their files can support 319 * shared writable mappings. For sequential zone files, only read 320 * mappings are possible since there are no guarantees for write 321 * ordering between msync() and page cache writeback. 322 */ 323 if (zonefs_inode_is_seq(file_inode(file)) && 324 (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 325 return -EINVAL; 326 327 file_accessed(file); 328 vma->vm_ops = &zonefs_file_vm_ops; 329 330 return 0; 331 } 332 333 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) 334 { 335 loff_t isize = i_size_read(file_inode(file)); 336 337 /* 338 * Seeks are limited to below the zone size for conventional zones 339 * and below the zone write pointer for sequential zones. In both 340 * cases, this limit is the inode size. 341 */ 342 return generic_file_llseek_size(file, offset, whence, isize, isize); 343 } 344 345 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, 346 int error, unsigned int flags) 347 { 348 struct inode *inode = file_inode(iocb->ki_filp); 349 struct zonefs_inode_info *zi = ZONEFS_I(inode); 350 351 if (error) { 352 zonefs_io_error(inode, true); 353 return error; 354 } 355 356 if (size && zonefs_inode_is_seq(inode)) { 357 /* 358 * Note that we may be seeing completions out of order, 359 * but that is not a problem since a write completed 360 * successfully necessarily means that all preceding writes 361 * were also successful. So we can safely increase the inode 362 * size to the write end location. 363 */ 364 mutex_lock(&zi->i_truncate_mutex); 365 if (i_size_read(inode) < iocb->ki_pos + size) { 366 zonefs_update_stats(inode, iocb->ki_pos + size); 367 zonefs_i_size_write(inode, iocb->ki_pos + size); 368 } 369 mutex_unlock(&zi->i_truncate_mutex); 370 } 371 372 return 0; 373 } 374 375 static const struct iomap_dio_ops zonefs_write_dio_ops = { 376 .end_io = zonefs_file_write_dio_end_io, 377 }; 378 379 static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) 380 { 381 struct inode *inode = file_inode(iocb->ki_filp); 382 struct zonefs_zone *z = zonefs_inode_zone(inode); 383 struct block_device *bdev = inode->i_sb->s_bdev; 384 unsigned int max = bdev_max_zone_append_sectors(bdev); 385 pgoff_t start, end; 386 struct bio *bio; 387 ssize_t size = 0; 388 int nr_pages; 389 ssize_t ret; 390 391 max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize); 392 iov_iter_truncate(from, max); 393 394 /* 395 * If the inode block size (zone write granularity) is smaller than the 396 * page size, we may be appending data belonging to the last page of the 397 * inode straddling inode->i_size, with that page already cached due to 398 * a buffered read or readahead. So make sure to invalidate that page. 399 * This will always be a no-op for the case where the block size is 400 * equal to the page size. 401 */ 402 start = iocb->ki_pos >> PAGE_SHIFT; 403 end = (iocb->ki_pos + iov_iter_count(from) - 1) >> PAGE_SHIFT; 404 if (invalidate_inode_pages2_range(inode->i_mapping, start, end)) 405 return -EBUSY; 406 407 nr_pages = iov_iter_npages(from, BIO_MAX_VECS); 408 if (!nr_pages) 409 return 0; 410 411 bio = bio_alloc(bdev, nr_pages, 412 REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS); 413 bio->bi_iter.bi_sector = z->z_sector; 414 bio->bi_ioprio = iocb->ki_ioprio; 415 if (iocb_is_dsync(iocb)) 416 bio->bi_opf |= REQ_FUA; 417 418 ret = bio_iov_iter_get_pages(bio, from); 419 if (unlikely(ret)) 420 goto out_release; 421 422 size = bio->bi_iter.bi_size; 423 task_io_account_write(size); 424 425 if (iocb->ki_flags & IOCB_HIPRI) 426 bio_set_polled(bio, iocb); 427 428 ret = submit_bio_wait(bio); 429 430 /* 431 * If the file zone was written underneath the file system, the zone 432 * write pointer may not be where we expect it to be, but the zone 433 * append write can still succeed. So check manually that we wrote where 434 * we intended to, that is, at zi->i_wpoffset. 435 */ 436 if (!ret) { 437 sector_t wpsector = 438 z->z_sector + (z->z_wpoffset >> SECTOR_SHIFT); 439 440 if (bio->bi_iter.bi_sector != wpsector) { 441 zonefs_warn(inode->i_sb, 442 "Corrupted write pointer %llu for zone at %llu\n", 443 bio->bi_iter.bi_sector, z->z_sector); 444 ret = -EIO; 445 } 446 } 447 448 zonefs_file_write_dio_end_io(iocb, size, ret, 0); 449 trace_zonefs_file_dio_append(inode, size, ret); 450 451 out_release: 452 bio_release_pages(bio, false); 453 bio_put(bio); 454 455 if (ret >= 0) { 456 iocb->ki_pos += size; 457 return size; 458 } 459 460 return ret; 461 } 462 463 /* 464 * Do not exceed the LFS limits nor the file zone size. If pos is under the 465 * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 466 */ 467 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, 468 loff_t count) 469 { 470 struct inode *inode = file_inode(file); 471 struct zonefs_zone *z = zonefs_inode_zone(inode); 472 loff_t limit = rlimit(RLIMIT_FSIZE); 473 loff_t max_size = z->z_capacity; 474 475 if (limit != RLIM_INFINITY) { 476 if (pos >= limit) { 477 send_sig(SIGXFSZ, current, 0); 478 return -EFBIG; 479 } 480 count = min(count, limit - pos); 481 } 482 483 if (!(file->f_flags & O_LARGEFILE)) 484 max_size = min_t(loff_t, MAX_NON_LFS, max_size); 485 486 if (unlikely(pos >= max_size)) 487 return -EFBIG; 488 489 return min(count, max_size - pos); 490 } 491 492 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) 493 { 494 struct file *file = iocb->ki_filp; 495 struct inode *inode = file_inode(file); 496 struct zonefs_inode_info *zi = ZONEFS_I(inode); 497 struct zonefs_zone *z = zonefs_inode_zone(inode); 498 loff_t count; 499 500 if (IS_SWAPFILE(inode)) 501 return -ETXTBSY; 502 503 if (!iov_iter_count(from)) 504 return 0; 505 506 if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 507 return -EINVAL; 508 509 if (iocb->ki_flags & IOCB_APPEND) { 510 if (zonefs_zone_is_cnv(z)) 511 return -EINVAL; 512 mutex_lock(&zi->i_truncate_mutex); 513 iocb->ki_pos = z->z_wpoffset; 514 mutex_unlock(&zi->i_truncate_mutex); 515 } 516 517 count = zonefs_write_check_limits(file, iocb->ki_pos, 518 iov_iter_count(from)); 519 if (count < 0) 520 return count; 521 522 iov_iter_truncate(from, count); 523 return iov_iter_count(from); 524 } 525 526 /* 527 * Handle direct writes. For sequential zone files, this is the only possible 528 * write path. For these files, check that the user is issuing writes 529 * sequentially from the end of the file. This code assumes that the block layer 530 * delivers write requests to the device in sequential order. This is always the 531 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE 532 * elevator feature is being used (e.g. mq-deadline). The block layer always 533 * automatically select such an elevator for zoned block devices during the 534 * device initialization. 535 */ 536 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) 537 { 538 struct inode *inode = file_inode(iocb->ki_filp); 539 struct zonefs_inode_info *zi = ZONEFS_I(inode); 540 struct zonefs_zone *z = zonefs_inode_zone(inode); 541 struct super_block *sb = inode->i_sb; 542 bool sync = is_sync_kiocb(iocb); 543 bool append = false; 544 ssize_t ret, count; 545 546 /* 547 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT 548 * as this can cause write reordering (e.g. the first aio gets EAGAIN 549 * on the inode lock but the second goes through but is now unaligned). 550 */ 551 if (zonefs_zone_is_seq(z) && !sync && (iocb->ki_flags & IOCB_NOWAIT)) 552 return -EOPNOTSUPP; 553 554 if (iocb->ki_flags & IOCB_NOWAIT) { 555 if (!inode_trylock(inode)) 556 return -EAGAIN; 557 } else { 558 inode_lock(inode); 559 } 560 561 count = zonefs_write_checks(iocb, from); 562 if (count <= 0) { 563 ret = count; 564 goto inode_unlock; 565 } 566 567 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 568 ret = -EINVAL; 569 goto inode_unlock; 570 } 571 572 /* Enforce sequential writes (append only) in sequential zones */ 573 if (zonefs_zone_is_seq(z)) { 574 mutex_lock(&zi->i_truncate_mutex); 575 if (iocb->ki_pos != z->z_wpoffset) { 576 mutex_unlock(&zi->i_truncate_mutex); 577 ret = -EINVAL; 578 goto inode_unlock; 579 } 580 mutex_unlock(&zi->i_truncate_mutex); 581 append = sync; 582 } 583 584 if (append) { 585 ret = zonefs_file_dio_append(iocb, from); 586 } else { 587 /* 588 * iomap_dio_rw() may return ENOTBLK if there was an issue with 589 * page invalidation. Overwrite that error code with EBUSY to 590 * be consistent with zonefs_file_dio_append() return value for 591 * similar issues. 592 */ 593 ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, 594 &zonefs_write_dio_ops, 0, NULL, 0); 595 if (ret == -ENOTBLK) 596 ret = -EBUSY; 597 } 598 599 if (zonefs_zone_is_seq(z) && 600 (ret > 0 || ret == -EIOCBQUEUED)) { 601 if (ret > 0) 602 count = ret; 603 604 /* 605 * Update the zone write pointer offset assuming the write 606 * operation succeeded. If it did not, the error recovery path 607 * will correct it. Also do active seq file accounting. 608 */ 609 mutex_lock(&zi->i_truncate_mutex); 610 z->z_wpoffset += count; 611 zonefs_inode_account_active(inode); 612 mutex_unlock(&zi->i_truncate_mutex); 613 } 614 615 inode_unlock: 616 inode_unlock(inode); 617 618 return ret; 619 } 620 621 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, 622 struct iov_iter *from) 623 { 624 struct inode *inode = file_inode(iocb->ki_filp); 625 ssize_t ret; 626 627 /* 628 * Direct IO writes are mandatory for sequential zone files so that the 629 * write IO issuing order is preserved. 630 */ 631 if (zonefs_inode_is_seq(inode)) 632 return -EIO; 633 634 if (iocb->ki_flags & IOCB_NOWAIT) { 635 if (!inode_trylock(inode)) 636 return -EAGAIN; 637 } else { 638 inode_lock(inode); 639 } 640 641 ret = zonefs_write_checks(iocb, from); 642 if (ret <= 0) 643 goto inode_unlock; 644 645 ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops); 646 if (ret > 0) 647 iocb->ki_pos += ret; 648 else if (ret == -EIO) 649 zonefs_io_error(inode, true); 650 651 inode_unlock: 652 inode_unlock(inode); 653 if (ret > 0) 654 ret = generic_write_sync(iocb, ret); 655 656 return ret; 657 } 658 659 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 660 { 661 struct inode *inode = file_inode(iocb->ki_filp); 662 struct zonefs_zone *z = zonefs_inode_zone(inode); 663 664 if (unlikely(IS_IMMUTABLE(inode))) 665 return -EPERM; 666 667 if (sb_rdonly(inode->i_sb)) 668 return -EROFS; 669 670 /* Write operations beyond the zone capacity are not allowed */ 671 if (iocb->ki_pos >= z->z_capacity) 672 return -EFBIG; 673 674 if (iocb->ki_flags & IOCB_DIRECT) { 675 ssize_t ret = zonefs_file_dio_write(iocb, from); 676 677 if (ret != -ENOTBLK) 678 return ret; 679 } 680 681 return zonefs_file_buffered_write(iocb, from); 682 } 683 684 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, 685 int error, unsigned int flags) 686 { 687 if (error) { 688 zonefs_io_error(file_inode(iocb->ki_filp), false); 689 return error; 690 } 691 692 return 0; 693 } 694 695 static const struct iomap_dio_ops zonefs_read_dio_ops = { 696 .end_io = zonefs_file_read_dio_end_io, 697 }; 698 699 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 700 { 701 struct inode *inode = file_inode(iocb->ki_filp); 702 struct zonefs_inode_info *zi = ZONEFS_I(inode); 703 struct zonefs_zone *z = zonefs_inode_zone(inode); 704 struct super_block *sb = inode->i_sb; 705 loff_t isize; 706 ssize_t ret; 707 708 /* Offline zones cannot be read */ 709 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 710 return -EPERM; 711 712 if (iocb->ki_pos >= z->z_capacity) 713 return 0; 714 715 if (iocb->ki_flags & IOCB_NOWAIT) { 716 if (!inode_trylock_shared(inode)) 717 return -EAGAIN; 718 } else { 719 inode_lock_shared(inode); 720 } 721 722 /* Limit read operations to written data */ 723 mutex_lock(&zi->i_truncate_mutex); 724 isize = i_size_read(inode); 725 if (iocb->ki_pos >= isize) { 726 mutex_unlock(&zi->i_truncate_mutex); 727 ret = 0; 728 goto inode_unlock; 729 } 730 iov_iter_truncate(to, isize - iocb->ki_pos); 731 mutex_unlock(&zi->i_truncate_mutex); 732 733 if (iocb->ki_flags & IOCB_DIRECT) { 734 size_t count = iov_iter_count(to); 735 736 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 737 ret = -EINVAL; 738 goto inode_unlock; 739 } 740 file_accessed(iocb->ki_filp); 741 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, 742 &zonefs_read_dio_ops, 0, NULL, 0); 743 } else { 744 ret = generic_file_read_iter(iocb, to); 745 if (ret == -EIO) 746 zonefs_io_error(inode, false); 747 } 748 749 inode_unlock: 750 inode_unlock_shared(inode); 751 752 return ret; 753 } 754 755 /* 756 * Write open accounting is done only for sequential files. 757 */ 758 static inline bool zonefs_seq_file_need_wro(struct inode *inode, 759 struct file *file) 760 { 761 if (zonefs_inode_is_cnv(inode)) 762 return false; 763 764 if (!(file->f_mode & FMODE_WRITE)) 765 return false; 766 767 return true; 768 } 769 770 static int zonefs_seq_file_write_open(struct inode *inode) 771 { 772 struct zonefs_inode_info *zi = ZONEFS_I(inode); 773 struct zonefs_zone *z = zonefs_inode_zone(inode); 774 int ret = 0; 775 776 mutex_lock(&zi->i_truncate_mutex); 777 778 if (!zi->i_wr_refcnt) { 779 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 780 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); 781 782 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { 783 784 if (sbi->s_max_wro_seq_files 785 && wro > sbi->s_max_wro_seq_files) { 786 atomic_dec(&sbi->s_wro_seq_files); 787 ret = -EBUSY; 788 goto unlock; 789 } 790 791 if (i_size_read(inode) < z->z_capacity) { 792 ret = zonefs_inode_zone_mgmt(inode, 793 REQ_OP_ZONE_OPEN); 794 if (ret) { 795 atomic_dec(&sbi->s_wro_seq_files); 796 goto unlock; 797 } 798 z->z_flags |= ZONEFS_ZONE_OPEN; 799 zonefs_inode_account_active(inode); 800 } 801 } 802 } 803 804 zi->i_wr_refcnt++; 805 806 unlock: 807 mutex_unlock(&zi->i_truncate_mutex); 808 809 return ret; 810 } 811 812 static int zonefs_file_open(struct inode *inode, struct file *file) 813 { 814 int ret; 815 816 ret = generic_file_open(inode, file); 817 if (ret) 818 return ret; 819 820 if (zonefs_seq_file_need_wro(inode, file)) 821 return zonefs_seq_file_write_open(inode); 822 823 return 0; 824 } 825 826 static void zonefs_seq_file_write_close(struct inode *inode) 827 { 828 struct zonefs_inode_info *zi = ZONEFS_I(inode); 829 struct zonefs_zone *z = zonefs_inode_zone(inode); 830 struct super_block *sb = inode->i_sb; 831 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 832 int ret = 0; 833 834 mutex_lock(&zi->i_truncate_mutex); 835 836 zi->i_wr_refcnt--; 837 if (zi->i_wr_refcnt) 838 goto unlock; 839 840 /* 841 * The file zone may not be open anymore (e.g. the file was truncated to 842 * its maximum size or it was fully written). For this case, we only 843 * need to decrement the write open count. 844 */ 845 if (z->z_flags & ZONEFS_ZONE_OPEN) { 846 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); 847 if (ret) { 848 __zonefs_io_error(inode, false); 849 /* 850 * Leaving zones explicitly open may lead to a state 851 * where most zones cannot be written (zone resources 852 * exhausted). So take preventive action by remounting 853 * read-only. 854 */ 855 if (z->z_flags & ZONEFS_ZONE_OPEN && 856 !(sb->s_flags & SB_RDONLY)) { 857 zonefs_warn(sb, 858 "closing zone at %llu failed %d\n", 859 z->z_sector, ret); 860 zonefs_warn(sb, 861 "remounting filesystem read-only\n"); 862 sb->s_flags |= SB_RDONLY; 863 } 864 goto unlock; 865 } 866 867 z->z_flags &= ~ZONEFS_ZONE_OPEN; 868 zonefs_inode_account_active(inode); 869 } 870 871 atomic_dec(&sbi->s_wro_seq_files); 872 873 unlock: 874 mutex_unlock(&zi->i_truncate_mutex); 875 } 876 877 static int zonefs_file_release(struct inode *inode, struct file *file) 878 { 879 /* 880 * If we explicitly open a zone we must close it again as well, but the 881 * zone management operation can fail (either due to an IO error or as 882 * the zone has gone offline or read-only). Make sure we don't fail the 883 * close(2) for user-space. 884 */ 885 if (zonefs_seq_file_need_wro(inode, file)) 886 zonefs_seq_file_write_close(inode); 887 888 return 0; 889 } 890 891 const struct file_operations zonefs_file_operations = { 892 .open = zonefs_file_open, 893 .release = zonefs_file_release, 894 .fsync = zonefs_file_fsync, 895 .mmap = zonefs_file_mmap, 896 .llseek = zonefs_file_llseek, 897 .read_iter = zonefs_file_read_iter, 898 .write_iter = zonefs_file_write_iter, 899 .splice_read = generic_file_splice_read, 900 .splice_write = iter_file_splice_write, 901 .iopoll = iocb_bio_iopoll, 902 }; 903