// SPDX-License-Identifier: GPL-2.0
/*
 * Simple file system for zoned block devices exposing zones as files.
 *
 * Copyright (C) 2022 Western Digital Corporation or its affiliates.
 */
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/statfs.h>
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/task_io_accounting_ops.h>

#include "zonefs.h"

#include "trace.h"

/*
 * iomap_begin callback for reads: describe the mapping that starts at the
 * block-aligned position at or below @offset.
 *
 * The mapping is reported as a hole of @length bytes when the aligned offset
 * is at or beyond the inode size, and as a mapped extent ending at the inode
 * size otherwise. The inode size is sampled with i_truncate_mutex held.
 * @flags and @srcmap are not used. Always returns 0.
 */
static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
				   loff_t length, unsigned int flags,
				   struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/*
	 * All blocks are always mapped below EOF. If reading at or past EOF,
	 * report a hole covering the requested length.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->length = length;
	} else {
		iomap->type = IOMAP_MAPPED;
		/* The zone sector converted to bytes, plus the file offset */
		iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}

/* iomap operations used by the read, readahead and swap activation paths */
static const struct iomap_ops zonefs_read_iomap_ops = {
	.iomap_begin	= zonefs_read_iomap_begin,
};

/*
 * iomap_begin callback for writes: describe the mapping that starts at the
 * block-aligned position at or below @offset.
 *
 * Returns -EIO, with a one-time warning, if @offset + @length exceeds the
 * zone capacity or if the file is a sequential zone file and @flags does not
 * have IOMAP_DIRECT set. Otherwise returns 0 with the mapping reported as
 * unwritten up to the zone capacity when the aligned offset is at or beyond
 * the inode size, and as mapped up to the inode size when it is below.
 * @srcmap is not used.
 */
static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
				    loff_t length, unsigned int flags,
				    struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/* All write I/Os should always be within the file maximum size */
	if (WARN_ON_ONCE(offset + length > z->z_capacity))
		return -EIO;

	/*
	 * Sequential zones can only accept direct writes. This is already
	 * checked when writes are issued, so warn if we see a page writeback
	 * operation.
	 */
	if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
		return -EIO;

	/*
	 * For conventional zones, all blocks are always mapped. For sequential
	 * zones, all blocks are always mapped below the inode size (zone
	 * write pointer) and unwritten beyond.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	/* The zone sector converted to bytes, plus the file offset */
	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_UNWRITTEN;
		iomap->length = z->z_capacity - iomap->offset;
	} else {
		iomap->type = IOMAP_MAPPED;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}

/*
 * iomap operations used by the direct write, buffered write, page_mkwrite
 * and writeback mapping paths.
 */
static const struct iomap_ops zonefs_write_iomap_ops = {
	.iomap_begin	= zonefs_write_iomap_begin,
};

/*
 * read_folio address space operation: read @folio through iomap using the
 * read iomap operations. The file argument is not used.
 */
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
	return iomap_read_folio(folio, &zonefs_read_iomap_ops);
}

/*
 * readahead address space operation: hand @rac to iomap with the read iomap
 * operations.
 */
static void zonefs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &zonefs_read_iomap_ops);
}

/*
 * Map blocks for page writeback. This is used only on conventional zone files,
 * which implies that the page range can only be within the fixed inode size.
 *
 * Returns -EIO, with a one-time warning, for a sequential zone file or for
 * an @offset at or beyond the inode size. Returns 0 without remapping when
 * the mapping cached in @wpc already covers @offset. Otherwise returns the
 * result of zonefs_write_iomap_begin() for the range from @offset to the
 * zone capacity.
 */
static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
				   struct inode *inode, loff_t offset)
{
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
		return -EIO;
	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
		return -EIO;

	/* If the mapping is already OK, nothing needs to be done */
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	return zonefs_write_iomap_begin(inode, offset,
					z->z_capacity - offset,
					IOMAP_WRITE, &wpc->iomap, NULL);
}

/* Writeback operations: only block mapping is provided */
static const struct iomap_writeback_ops zonefs_writeback_ops = {
	.map_blocks		= zonefs_write_map_blocks,
};

/*
 * writepages address space operation: run iomap writeback with a
 * zero-initialized writepage context.
 */
static int zonefs_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
}

/*
 * swap_activate address space operation. Sequential zone files are rejected
 * with -EINVAL after logging an error. Other files are handed to
 * iomap_swapfile_activate() with the read iomap operations.
 */
static int zonefs_swap_activate(struct swap_info_struct *sis,
				struct file *swap_file, sector_t *span)
{
	struct inode *inode = file_inode(swap_file);

	if (zonefs_inode_is_seq(inode)) {
		zonefs_err(inode->i_sb,
			   "swap file: not a conventional zone file\n");
		return -EINVAL;
	}

	return iomap_swapfile_activate(sis, swap_file, span,
				       &zonefs_read_iomap_ops);
}

/* Address space operations for zone files */
const struct address_space_operations zonefs_file_aops = {
	.read_folio		= zonefs_read_folio,
	.readahead		= zonefs_readahead,
	.writepages		= zonefs_writepages,
	.dirty_folio		= iomap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.migrate_folio		= filemap_migrate_folio,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= zonefs_swap_activate,
};

/*
 * Truncate a sequential zone file to @isize.
 *
 * Only two sizes are accepted: 0, which issues REQ_OP_ZONE_RESET, and the
 * zone capacity, which issues REQ_OP_ZONE_FINISH. Returns -EPERM if the file
 * is not a sequential zone file or for any other size, 0 if the file already
 * has the requested size, and otherwise the result of the zone management
 * operations.
 */
int zonefs_file_truncate(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t old_isize;
	enum req_op op;
	int ret = 0;

	/*
	 * Only sequential zone files can be truncated and truncation is allowed
	 * only down to a 0 size, which is equivalent to a zone reset, and to
	 * the maximum file size, which is equivalent to a zone finish.
	 */
	if (!zonefs_zone_is_seq(z))
		return -EPERM;

	if (!isize)
		op = REQ_OP_ZONE_RESET;
	else if (isize == z->z_capacity)
		op = REQ_OP_ZONE_FINISH;
	else
		return -EPERM;

	inode_dio_wait(inode);

	/* Serialize against page faults */
	filemap_invalidate_lock(inode->i_mapping);

	/*
	 * Serialize against zonefs_read_iomap_begin() and
	 * zonefs_write_iomap_begin(), which sample the inode size under
	 * this mutex.
	 */
	mutex_lock(&zi->i_truncate_mutex);

	/* Nothing to do if the file already has the requested size */
	old_isize = i_size_read(inode);
	if (isize == old_isize)
		goto unlock;

	ret = zonefs_inode_zone_mgmt(inode, op);
	if (ret)
		goto unlock;

	/*
	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
	 * take care of open zones.
	 */
	if (z->z_flags & ZONEFS_ZONE_OPEN) {
		/*
		 * Truncating a zone to EMPTY or FULL is the equivalent of
		 * closing the zone. For a truncation to 0, we need to
		 * re-open the zone to ensure new writes can be processed.
		 * For a truncation to the maximum file size, the zone is
		 * closed and writes cannot be accepted anymore, so clear
		 * the open flag.
		 *
		 * NOTE(review): a failure of the re-open is returned to the
		 * caller, but the size and statistics updates below are still
		 * applied - confirm this is intended.
		 */
		if (!isize)
			ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
		else
			z->z_flags &= ~ZONEFS_ZONE_OPEN;
	}

	zonefs_update_stats(inode, isize);
	truncate_setsize(inode, isize);
	z->z_wpoffset = isize;
	zonefs_inode_account_active(inode);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
	filemap_invalidate_unlock(inode->i_mapping);

	return ret;
}

/*
 * fsync file operation. Returns -EPERM for an immutable inode. Otherwise
 * writes back and waits on the [@start, @end] range for conventional zone
 * files, then issues a flush to the block device. On any failure,
 * zonefs_io_error() is called and the error is returned. @datasync is not
 * used.
 */
static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
			     int datasync)
{
	struct inode *inode = file_inode(file);
	int ret = 0;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	/*
	 * Since only direct writes are allowed in sequential files, page cache
	 * flush is needed only for conventional zone files.
	 */
	if (zonefs_inode_is_cnv(inode))
		ret = file_write_and_wait_range(file, start, end);
	if (!ret)
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);

	if (ret)
		zonefs_io_error(inode, true);

	return ret;
}

/*
 * page_mkwrite VM operation. Returns VM_FAULT_SIGBUS for an immutable inode
 * and VM_FAULT_NOPAGE for a sequential zone file. Otherwise updates the file
 * times and returns the result of iomap_page_mkwrite(), called with the
 * mapping invalidate lock held shared.
 */
static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return VM_FAULT_SIGBUS;

	/*
	 * Sanity check: only conventional zone files can have shared
	 * writeable mappings.
	 */
	if (zonefs_inode_is_seq(inode))
		return VM_FAULT_NOPAGE;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/* Serialize against truncates */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

/* VM operations installed on mappings by zonefs_file_mmap() */
static const struct vm_operations_struct zonefs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= zonefs_filemap_page_mkwrite,
};

/*
 * mmap file operation. Returns -EINVAL for a mapping of a sequential zone
 * file that has both VM_SHARED and VM_MAYWRITE set. Otherwise marks the file
 * accessed, installs zonefs_file_vm_ops on @vma and returns 0.
 */
static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * Conventional zones accept random writes, so their files can support
	 * shared writable mappings. For sequential zone files, only read
	 * mappings are possible since there are no guarantees for write
	 * ordering between msync() and page cache writeback.
	 */
	if (zonefs_inode_is_seq(file_inode(file)) &&
	    (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;

	file_accessed(file);
	vma->vm_ops = &zonefs_file_vm_ops;

	return 0;
}

/*
 * llseek file operation: call generic_file_llseek_size() with the current
 * inode size passed for both of its size arguments.
 */
static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t isize = i_size_read(file_inode(file));

	/*
	 * Seeks are limited to below the zone size for conventional zones
	 * and below the zone write pointer for sequential zones. In both
	 * cases, this limit is the inode size.
	 */
	return generic_file_llseek_size(file, offset, whence, isize, isize);
}

/*
 * Direct write completion callback.
 *
 * On @error, zonefs_io_error() is called for asynchronous iocbs only and
 * @error is returned. On success, for a non-empty write to a sequential zone
 * file, the inode size and the statistics are advanced to the end of the
 * write if that is beyond the current inode size, under i_truncate_mutex.
 * @flags is not used. Returns 0 on success.
 */
static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
					int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (error) {
		/*
		 * For Sync IOs, error recovery is called from
		 * zonefs_file_dio_write().
		 */
		if (!is_sync_kiocb(iocb))
			zonefs_io_error(inode, true);
		return error;
	}

	if (size && zonefs_inode_is_seq(inode)) {
		/*
		 * Note that we may be seeing completions out of order,
		 * but that is not a problem since a write completed
		 * successfully necessarily means that all preceding writes
		 * were also successful. So we can safely increase the inode
		 * size to the write end location.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		if (i_size_read(inode) < iocb->ki_pos + size) {
			zonefs_update_stats(inode, iocb->ki_pos + size);
			zonefs_i_size_write(inode, iocb->ki_pos + size);
		}
		mutex_unlock(&zi->i_truncate_mutex);
	}

	return 0;
}

/* Direct IO operations passed to iomap_dio_rw() for writes */
static const struct iomap_dio_ops zonefs_write_dio_ops = {
	.end_io		= zonefs_file_write_dio_end_io,
};

/*
 * Do not exceed the LFS limits nor the file zone size. If pos is under the
 * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 *
 * Three limits are applied in turn: RLIMIT_FSIZE (SIGXFSZ is sent to the
 * current task when @pos is at or beyond it), MAX_NON_LFS when the file was
 * not opened with O_LARGEFILE, and the zone capacity. Returns @count,
 * possibly reduced so that the access ends at the applicable limit, or
 * -EFBIG.
 */
static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
					loff_t count)
{
	struct inode *inode = file_inode(file);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t limit = rlimit(RLIMIT_FSIZE);
	loff_t max_size = z->z_capacity;

	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		count = min(count, limit - pos);
	}

	if (!(file->f_flags & O_LARGEFILE))
		max_size = min_t(loff_t, MAX_NON_LFS, max_size);

	if (unlikely(pos >= max_size))
		return -EFBIG;

	return min(count, max_size - pos);
}

/*
 * Common checks for the direct and buffered write paths.
 *
 * For IOCB_APPEND on a sequential zone file, iocb->ki_pos is set to the zone
 * write pointer offset. @from is truncated to the size allowed by
 * zonefs_write_check_limits().
 *
 * Returns the number of bytes left in @from, 0 for an empty write, or a
 * negative error code: -ETXTBSY for a swap file, -EINVAL for IOCB_NOWAIT
 * without IOCB_DIRECT or for IOCB_APPEND on a conventional zone file, or the
 * error returned by zonefs_write_check_limits().
 */
static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t count;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EINVAL;

	if (iocb->ki_flags & IOCB_APPEND) {
		if (zonefs_zone_is_cnv(z))
			return -EINVAL;
		/* Append writes start at the zone write pointer offset */
		mutex_lock(&zi->i_truncate_mutex);
		iocb->ki_pos = z->z_wpoffset;
		mutex_unlock(&zi->i_truncate_mutex);
	}

	count = zonefs_write_check_limits(file, iocb->ki_pos,
					  iov_iter_count(from));
	if (count < 0)
		return count;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}

/*
 * Handle direct writes. For sequential zone files, this is the only possible
 * write path. For these files, check that the user is issuing writes
 * sequentially from the end of the file. This code assumes that the block layer
 * delivers write requests to the device in sequential order. This is always the
 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 * elevator feature is being used (e.g. mq-deadline). The block layer always
 * automatically select such an elevator for zoned block devices during the
 * device initialization.
 *
 * The inode lock is held for the whole call. Returns the result of
 * iomap_dio_rw() (with -ENOTBLK replaced by -EBUSY), or:
 *  -EOPNOTSUPP for an asynchronous IOCB_NOWAIT write to a sequential zone file,
 *  -EAGAIN if IOCB_NOWAIT is set and the inode lock cannot be taken,
 *  -EINVAL for a position or length not aligned to the block size, or for a
 *   sequential zone file write not starting at the zone write pointer offset,
 *  -EIO for a partially completed write to a sequential zone file,
 *  or the value returned by zonefs_write_checks() when it is 0 or negative.
 */
static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	ssize_t ret, count;

	/*
	 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
	 * as this can cause write reordering (e.g. the first aio gets EAGAIN
	 * on the inode lock but the second goes through but is now unaligned).
	 */
	if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	count = zonefs_write_checks(iocb, from);
	if (count <= 0) {
		ret = count;
		goto inode_unlock;
	}

	/* Position and length must both be aligned to the block size */
	if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
		ret = -EINVAL;
		goto inode_unlock;
	}

	/* Enforce sequential writes (append only) in sequential zones */
	if (zonefs_zone_is_seq(z)) {
		mutex_lock(&zi->i_truncate_mutex);
		if (iocb->ki_pos != z->z_wpoffset) {
			mutex_unlock(&zi->i_truncate_mutex);
			ret = -EINVAL;
			goto inode_unlock;
		}
		/*
		 * Advance the zone write pointer offset. This assumes that the
		 * IO will succeed, which is OK to do because we do not allow
		 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
		 * fails, the error path will correct the write pointer offset.
		 */
		z->z_wpoffset += count;
		zonefs_inode_account_active(inode);
		mutex_unlock(&zi->i_truncate_mutex);
	}

	/*
	 * iomap_dio_rw() may return ENOTBLK if there was an issue with
	 * page invalidation. Overwrite that error code with EBUSY so that
	 * the user can make sense of the error.
	 */
	ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
			   &zonefs_write_dio_ops, 0, NULL, 0);
	if (ret == -ENOTBLK)
		ret = -EBUSY;

	/*
	 * For a failed IO or partial completion, trigger error recovery
	 * to update the zone write pointer offset to a correct value.
	 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
	 * have executed error recovery if the IO already completed when we
	 * reach here. However, we cannot know that and execute error recovery
	 * again (that will not change anything).
	 */
	if (zonefs_zone_is_seq(z)) {
		/* A short write to a sequential zone file is an error */
		if (ret > 0 && ret != count)
			ret = -EIO;
		/* -EIOCBQUEUED is not treated as a failure here */
		if (ret < 0 && ret != -EIOCBQUEUED)
			zonefs_io_error(inode, true);
	}

inode_unlock:
	inode_unlock(inode);

	return ret;
}

/*
 * Handle buffered writes, which are accepted only for files that are not
 * sequential zone files.
 *
 * Returns -EIO for a sequential zone file and -EAGAIN if IOCB_NOWAIT is set
 * and the inode lock cannot be taken. Otherwise returns the value of
 * zonefs_write_checks() when it is 0 or negative, or the result of
 * iomap_file_buffered_write(), passed through generic_write_sync() when it
 * is positive. zonefs_io_error() is called if the buffered write returns
 * -EIO.
 */
static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/*
	 * Direct IO writes are mandatory for sequential zone files so that the
	 * write IO issuing order is preserved.
	 */
	if (zonefs_inode_is_seq(inode))
		return -EIO;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = zonefs_write_checks(iocb, from);
	if (ret <= 0)
		goto inode_unlock;

	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
	if (ret == -EIO)
		zonefs_io_error(inode, true);

inode_unlock:
	inode_unlock(inode);
	/* generic_write_sync() is called after the inode lock is dropped */
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);

	return ret;
}

/*
 * write_iter file operation: dispatch to the direct or buffered write path.
 *
 * Returns -EPERM for an immutable inode, -EROFS for a read-only super block
 * and -EFBIG when the write position is at or beyond the zone capacity.
 * Otherwise returns the result of the selected write path.
 */
static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	if (sb_rdonly(inode->i_sb))
		return -EROFS;

	/* Write operations beyond the zone capacity are not allowed */
	if (iocb->ki_pos >= z->z_capacity)
		return -EFBIG;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ssize_t ret = zonefs_file_dio_write(iocb, from);

		/*
		 * NOTE(review): zonefs_file_dio_write() replaces -ENOTBLK
		 * from iomap_dio_rw() with -EBUSY, so the fall through to the
		 * buffered path below does not appear reachable from the code
		 * in this file - confirm.
		 */
		if (ret != -ENOTBLK)
			return ret;
	}

	return zonefs_file_buffered_write(iocb, from);
}

/*
 * Direct read completion callback. On @error, zonefs_io_error() is called
 * and @error is returned. Otherwise returns 0. @size and @flags are not used.
 */
static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
				       int error, unsigned int flags)
{
	if (error) {
		zonefs_io_error(file_inode(iocb->ki_filp), false);
		return error;
	}

	return 0;
}

/* Direct IO operations passed to iomap_dio_rw() for reads */
static const struct iomap_dio_ops zonefs_read_dio_ops = {
	.end_io			= zonefs_file_read_dio_end_io,
};

/*
 * read_iter file operation. Reads are limited to the data below the inode
 * size, and the inode lock is held shared for the duration of the read.
 *
 * Returns -EPERM for an immutable inode with no permission bits set, 0 when
 * the read position is at or beyond the zone capacity or the inode size,
 * -EAGAIN if IOCB_NOWAIT is set and the inode lock cannot be taken, and
 * -EINVAL for a direct read whose position or length is not aligned to the
 * block size. Otherwise returns the result of iomap_dio_rw() for direct
 * reads or generic_file_read_iter() for buffered reads.
 */
static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;
	ssize_t ret;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	if (iocb->ki_pos >= z->z_capacity)
		return 0;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (iocb->ki_pos >= isize) {
		mutex_unlock(&zi->i_truncate_mutex);
		ret = 0;
		goto inode_unlock;
	}
	iov_iter_truncate(to, isize - iocb->ki_pos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (iocb->ki_flags & IOCB_DIRECT) {
		size_t count = iov_iter_count(to);

		/* Position and length must both be aligned to the block size */
		if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
			ret = -EINVAL;
			goto inode_unlock;
		}
		file_accessed(iocb->ki_filp);
		ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
				   &zonefs_read_dio_ops, 0, NULL, 0);
	} else {
		ret = generic_file_read_iter(iocb, to);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

inode_unlock:
	inode_unlock_shared(inode);

	return ret;
}

/*
 * splice_read file operation. The length is limited to the data below the
 * inode size, and the inode lock is held shared for the duration of the
 * splice.
 *
 * Returns -EPERM for an immutable inode with no permission bits set, 0 when
 * *@ppos is at or beyond the zone capacity or the inode size, and otherwise
 * the result of filemap_splice_read(). zonefs_io_error() is called if that
 * result is -EIO.
 */
static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t isize;
	ssize_t ret = 0;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	if (*ppos >= z->z_capacity)
		return 0;

	inode_lock_shared(inode);

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (*ppos >= isize)
		len = 0;
	else
		len = min_t(loff_t, len, isize - *ppos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (len > 0) {
		ret = filemap_splice_read(in, ppos, pipe, len, flags);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

	inode_unlock_shared(inode);
	return ret;
}

/*
 * Write open accounting is done only for sequential files.
 *
 * Returns true if @inode is not a conventional zone file and @file is open
 * for writing (FMODE_WRITE), false otherwise.
 */
static inline bool zonefs_seq_file_need_wro(struct inode *inode,
					    struct file *file)
{
	if (zonefs_inode_is_cnv(inode))
		return false;

	if (!(file->f_mode & FMODE_WRITE))
		return false;

	return true;
}

/*
 * Account a write open of a sequential zone file, under i_truncate_mutex.
 *
 * On the first write open of the inode (i_wr_refcnt is 0), the super block
 * count of write-open sequential files is incremented. If the
 * ZONEFS_MNTOPT_EXPLICIT_OPEN mount option is set, the count is also checked
 * against s_max_wro_seq_files (when non-zero), and the zone is opened if the
 * file size is below the zone capacity.
 *
 * Returns 0 on success, -EBUSY if the write-open limit is exceeded, or the
 * error of the zone open operation. On failure, the super block count is
 * restored and i_wr_refcnt is left unchanged.
 */
static int zonefs_seq_file_write_open(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	if (!zi->i_wr_refcnt) {
		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
		unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);

		if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {

			/* A limit of 0 disables the check */
			if (sbi->s_max_wro_seq_files
			    && wro > sbi->s_max_wro_seq_files) {
				atomic_dec(&sbi->s_wro_seq_files);
				ret = -EBUSY;
				goto unlock;
			}

			/* No zone open is issued for a file at zone capacity */
			if (i_size_read(inode) < z->z_capacity) {
				ret = zonefs_inode_zone_mgmt(inode,
							     REQ_OP_ZONE_OPEN);
				if (ret) {
					atomic_dec(&sbi->s_wro_seq_files);
					goto unlock;
				}
				z->z_flags |= ZONEFS_ZONE_OPEN;
				zonefs_inode_account_active(inode);
			}
		}
	}

	zi->i_wr_refcnt++;

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}

/*
 * open file operation. Sets FMODE_CAN_ODIRECT on @file, calls
 * generic_file_open() and, for a sequential zone file opened for writing,
 * performs the write open accounting. Returns 0 on success or the error of
 * the failing step.
 */
static int zonefs_file_open(struct inode *inode, struct file *file)
{
	int ret;

	file->f_mode |= FMODE_CAN_ODIRECT;
	ret = generic_file_open(inode, file);
	if (ret)
		return ret;

	if (zonefs_seq_file_need_wro(inode, file))
		return zonefs_seq_file_write_open(inode);

	return 0;
}

/*
 * Undo the write open accounting of a sequential zone file, under
 * i_truncate_mutex.
 *
 * i_wr_refcnt is decremented, and nothing more is done while it is still
 * non-zero. Otherwise, a zone with ZONEFS_ZONE_OPEN set is closed, and the
 * super block count of write-open sequential files is decremented. Failures
 * are not reported to the caller.
 */
static void zonefs_seq_file_write_close(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	zi->i_wr_refcnt--;
	if (zi->i_wr_refcnt)
		goto unlock;

	/*
	 * The file zone may not be open anymore (e.g. the file was truncated to
	 * its maximum size or it was fully written). For this case, we only
	 * need to decrement the write open count.
	 */
	if (z->z_flags & ZONEFS_ZONE_OPEN) {
		ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret) {
			__zonefs_io_error(inode, false);
			/*
			 * Leaving zones explicitly open may lead to a state
			 * where most zones cannot be written (zone resources
			 * exhausted). So take preventive action by remounting
			 * read-only.
			 */
			if (z->z_flags & ZONEFS_ZONE_OPEN &&
			    !(sb->s_flags & SB_RDONLY)) {
				zonefs_warn(sb,
					"closing zone at %llu failed %d\n",
					z->z_sector, ret);
				zonefs_warn(sb,
					"remounting filesystem read-only\n");
				sb->s_flags |= SB_RDONLY;
			}
			/*
			 * NOTE(review): this path skips the atomic_dec() of
			 * s_wro_seq_files below - confirm this is intended.
			 */
			goto unlock;
		}

		z->z_flags &= ~ZONEFS_ZONE_OPEN;
		zonefs_inode_account_active(inode);
	}

	atomic_dec(&sbi->s_wro_seq_files);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
}

/*
 * release file operation: undo the write open accounting for a sequential
 * zone file that was opened for writing. Always returns 0.
 */
static int zonefs_file_release(struct inode *inode, struct file *file)
{
	/*
	 * If we explicitly open a zone we must close it again as well, but the
	 * zone management operation can fail (either due to an IO error or as
	 * the zone has gone offline or read-only). Make sure we don't fail the
	 * close(2) for user-space.
	 */
	if (zonefs_seq_file_need_wro(inode, file))
		zonefs_seq_file_write_close(inode);

	return 0;
}

/* File operations for zone files */
const struct file_operations zonefs_file_operations = {
	.open		= zonefs_file_open,
	.release	= zonefs_file_release,
	.fsync		= zonefs_file_fsync,
	.mmap		= zonefs_file_mmap,
	.llseek		= zonefs_file_llseek,
	.read_iter	= zonefs_file_read_iter,
	.write_iter	= zonefs_file_write_iter,
	.splice_read	= zonefs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
};