1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Simple file system for zoned block devices exposing zones as files. 4 * 5 * Copyright (C) 2022 Western Digital Corporation or its affiliates. 6 */ 7 #include <linux/module.h> 8 #include <linux/pagemap.h> 9 #include <linux/iomap.h> 10 #include <linux/init.h> 11 #include <linux/slab.h> 12 #include <linux/blkdev.h> 13 #include <linux/statfs.h> 14 #include <linux/writeback.h> 15 #include <linux/quotaops.h> 16 #include <linux/seq_file.h> 17 #include <linux/parser.h> 18 #include <linux/uio.h> 19 #include <linux/mman.h> 20 #include <linux/sched/mm.h> 21 #include <linux/task_io_accounting_ops.h> 22 23 #include "zonefs.h" 24 25 #include "trace.h" 26 27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, 28 loff_t length, unsigned int flags, 29 struct iomap *iomap, struct iomap *srcmap) 30 { 31 struct zonefs_inode_info *zi = ZONEFS_I(inode); 32 struct zonefs_zone *z = zonefs_inode_zone(inode); 33 struct super_block *sb = inode->i_sb; 34 loff_t isize; 35 36 /* 37 * All blocks are always mapped below EOF. If reading past EOF, 38 * act as if there is a hole up to the file maximum size. 39 */ 40 mutex_lock(&zi->i_truncate_mutex); 41 iomap->bdev = inode->i_sb->s_bdev; 42 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 43 isize = i_size_read(inode); 44 if (iomap->offset >= isize) { 45 iomap->type = IOMAP_HOLE; 46 iomap->addr = IOMAP_NULL_ADDR; 47 iomap->length = length; 48 } else { 49 iomap->type = IOMAP_MAPPED; 50 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 51 iomap->length = isize - iomap->offset; 52 } 53 mutex_unlock(&zi->i_truncate_mutex); 54 55 trace_zonefs_iomap_begin(inode, iomap); 56 57 return 0; 58 } 59 60 static const struct iomap_ops zonefs_read_iomap_ops = { 61 .iomap_begin = zonefs_read_iomap_begin, 62 }; 63 64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, 65 loff_t length, unsigned int flags, 66 struct iomap *iomap, struct iomap *srcmap) 67 { 68 struct zonefs_inode_info *zi = ZONEFS_I(inode); 69 struct zonefs_zone *z = zonefs_inode_zone(inode); 70 struct super_block *sb = inode->i_sb; 71 loff_t isize; 72 73 /* All write I/Os should always be within the file maximum size */ 74 if (WARN_ON_ONCE(offset + length > z->z_capacity)) 75 return -EIO; 76 77 /* 78 * Sequential zones can only accept direct writes. This is already 79 * checked when writes are issued, so warn if we see a page writeback 80 * operation. 81 */ 82 if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) 83 return -EIO; 84 85 /* 86 * For conventional zones, all blocks are always mapped. For sequential 87 * zones, all blocks after always mapped below the inode size (zone 88 * write pointer) and unwriten beyond. 89 */ 90 mutex_lock(&zi->i_truncate_mutex); 91 iomap->bdev = inode->i_sb->s_bdev; 92 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 93 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 94 isize = i_size_read(inode); 95 if (iomap->offset >= isize) { 96 iomap->type = IOMAP_UNWRITTEN; 97 iomap->length = z->z_capacity - iomap->offset; 98 } else { 99 iomap->type = IOMAP_MAPPED; 100 iomap->length = isize - iomap->offset; 101 } 102 mutex_unlock(&zi->i_truncate_mutex); 103 104 trace_zonefs_iomap_begin(inode, iomap); 105 106 return 0; 107 } 108 109 static const struct iomap_ops zonefs_write_iomap_ops = { 110 .iomap_begin = zonefs_write_iomap_begin, 111 }; 112 113 static int zonefs_read_folio(struct file *unused, struct folio *folio) 114 { 115 return iomap_read_folio(folio, &zonefs_read_iomap_ops); 116 } 117 118 static void zonefs_readahead(struct readahead_control *rac) 119 { 120 iomap_readahead(rac, &zonefs_read_iomap_ops); 121 } 122 123 /* 124 * Map blocks for page writeback. This is used only on conventional zone files, 125 * which implies that the page range can only be within the fixed inode size. 126 */ 127 static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc, 128 struct folio *folio, u64 offset, unsigned len, u64 end_pos) 129 { 130 struct zonefs_zone *z = zonefs_inode_zone(wpc->inode); 131 132 if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) 133 return -EIO; 134 if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode))) 135 return -EIO; 136 137 /* If the mapping is already OK, nothing needs to be done */ 138 if (offset < wpc->iomap.offset || 139 offset >= wpc->iomap.offset + wpc->iomap.length) { 140 int error; 141 142 error = zonefs_write_iomap_begin(wpc->inode, offset, 143 z->z_capacity - offset, IOMAP_WRITE, 144 &wpc->iomap, NULL); 145 if (error) 146 return error; 147 } 148 149 return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); 150 } 151 152 static const struct iomap_writeback_ops zonefs_writeback_ops = { 153 .writeback_range = zonefs_writeback_range, 154 }; 155 156 static int zonefs_writepages(struct address_space *mapping, 157 struct writeback_control *wbc) 158 { 159 struct iomap_writepage_ctx wpc = { 160 .inode = mapping->host, 161 .wbc = wbc, 162 .ops = &zonefs_writeback_ops, 163 }; 164 165 return iomap_writepages(&wpc); 166 } 167 168 static int zonefs_swap_activate(struct swap_info_struct *sis, 169 struct file *swap_file, sector_t *span) 170 { 171 struct inode *inode = file_inode(swap_file); 172 173 if (zonefs_inode_is_seq(inode)) { 174 zonefs_err(inode->i_sb, 175 "swap file: not a conventional zone file\n"); 176 return -EINVAL; 177 } 178 179 return iomap_swapfile_activate(sis, swap_file, span, 180 &zonefs_read_iomap_ops); 181 } 182 183 const struct address_space_operations zonefs_file_aops = { 184 .read_folio = zonefs_read_folio, 185 .readahead = zonefs_readahead, 186 .writepages = zonefs_writepages, 187 .dirty_folio = iomap_dirty_folio, 188 .release_folio = iomap_release_folio, 189 .invalidate_folio = iomap_invalidate_folio, 190 .migrate_folio = filemap_migrate_folio, 191 .is_partially_uptodate = iomap_is_partially_uptodate, 192 .error_remove_folio = generic_error_remove_folio, 193 .swap_activate = zonefs_swap_activate, 194 }; 195 196 int zonefs_file_truncate(struct inode *inode, loff_t isize) 197 { 198 struct zonefs_inode_info *zi = ZONEFS_I(inode); 199 struct zonefs_zone *z = zonefs_inode_zone(inode); 200 loff_t old_isize; 201 enum req_op op; 202 int ret = 0; 203 204 /* 205 * Only sequential zone files can be truncated and truncation is allowed 206 * only down to a 0 size, which is equivalent to a zone reset, and to 207 * the maximum file size, which is equivalent to a zone finish. 208 */ 209 if (!zonefs_zone_is_seq(z)) 210 return -EPERM; 211 212 if (!isize) 213 op = REQ_OP_ZONE_RESET; 214 else if (isize == z->z_capacity) 215 op = REQ_OP_ZONE_FINISH; 216 else 217 return -EPERM; 218 219 inode_dio_wait(inode); 220 221 /* Serialize against page faults */ 222 filemap_invalidate_lock(inode->i_mapping); 223 224 /* Serialize against zonefs_iomap_begin() */ 225 mutex_lock(&zi->i_truncate_mutex); 226 227 old_isize = i_size_read(inode); 228 if (isize == old_isize) 229 goto unlock; 230 231 ret = zonefs_inode_zone_mgmt(inode, op); 232 if (ret) 233 goto unlock; 234 235 /* 236 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, 237 * take care of open zones. 238 */ 239 if (z->z_flags & ZONEFS_ZONE_OPEN) { 240 /* 241 * Truncating a zone to EMPTY or FULL is the equivalent of 242 * closing the zone. For a truncation to 0, we need to 243 * re-open the zone to ensure new writes can be processed. 244 * For a truncation to the maximum file size, the zone is 245 * closed and writes cannot be accepted anymore, so clear 246 * the open flag. 247 */ 248 if (!isize) 249 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 250 else 251 z->z_flags &= ~ZONEFS_ZONE_OPEN; 252 } 253 254 zonefs_update_stats(inode, isize); 255 truncate_setsize(inode, isize); 256 z->z_wpoffset = isize; 257 zonefs_inode_account_active(inode); 258 259 unlock: 260 mutex_unlock(&zi->i_truncate_mutex); 261 filemap_invalidate_unlock(inode->i_mapping); 262 263 return ret; 264 } 265 266 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, 267 int datasync) 268 { 269 struct inode *inode = file_inode(file); 270 int ret = 0; 271 272 if (unlikely(IS_IMMUTABLE(inode))) 273 return -EPERM; 274 275 /* 276 * Since only direct writes are allowed in sequential files, page cache 277 * flush is needed only for conventional zone files. 278 */ 279 if (zonefs_inode_is_cnv(inode)) 280 ret = file_write_and_wait_range(file, start, end); 281 if (!ret) 282 ret = blkdev_issue_flush(inode->i_sb->s_bdev); 283 284 if (ret) 285 zonefs_io_error(inode, true); 286 287 return ret; 288 } 289 290 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) 291 { 292 struct inode *inode = file_inode(vmf->vma->vm_file); 293 vm_fault_t ret; 294 295 if (unlikely(IS_IMMUTABLE(inode))) 296 return VM_FAULT_SIGBUS; 297 298 /* 299 * Sanity check: only conventional zone files can have shared 300 * writeable mappings. 301 */ 302 if (zonefs_inode_is_seq(inode)) 303 return VM_FAULT_NOPAGE; 304 305 sb_start_pagefault(inode->i_sb); 306 file_update_time(vmf->vma->vm_file); 307 308 /* Serialize against truncates */ 309 filemap_invalidate_lock_shared(inode->i_mapping); 310 ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); 311 filemap_invalidate_unlock_shared(inode->i_mapping); 312 313 sb_end_pagefault(inode->i_sb); 314 return ret; 315 } 316 317 static const struct vm_operations_struct zonefs_file_vm_ops = { 318 .fault = filemap_fault, 319 .map_pages = filemap_map_pages, 320 .page_mkwrite = zonefs_filemap_page_mkwrite, 321 }; 322 323 static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma) 324 { 325 /* 326 * Conventional zones accept random writes, so their files can support 327 * shared writable mappings. For sequential zone files, only read 328 * mappings are possible since there are no guarantees for write 329 * ordering between msync() and page cache writeback. 330 */ 331 if (zonefs_inode_is_seq(file_inode(file)) && 332 (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) 333 return -EINVAL; 334 335 file_accessed(file); 336 vma->vm_ops = &zonefs_file_vm_ops; 337 338 return 0; 339 } 340 341 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) 342 { 343 loff_t isize = i_size_read(file_inode(file)); 344 345 /* 346 * Seeks are limited to below the zone size for conventional zones 347 * and below the zone write pointer for sequential zones. In both 348 * cases, this limit is the inode size. 349 */ 350 return generic_file_llseek_size(file, offset, whence, isize, isize); 351 } 352 353 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, 354 int error, unsigned int flags) 355 { 356 struct inode *inode = file_inode(iocb->ki_filp); 357 struct zonefs_inode_info *zi = ZONEFS_I(inode); 358 359 if (error) { 360 /* 361 * For Sync IOs, error recovery is called from 362 * zonefs_file_dio_write(). 363 */ 364 if (!is_sync_kiocb(iocb)) 365 zonefs_io_error(inode, true); 366 return error; 367 } 368 369 if (size && zonefs_inode_is_seq(inode)) { 370 /* 371 * Note that we may be seeing completions out of order, 372 * but that is not a problem since a write completed 373 * successfully necessarily means that all preceding writes 374 * were also successful. So we can safely increase the inode 375 * size to the write end location. 376 */ 377 mutex_lock(&zi->i_truncate_mutex); 378 if (i_size_read(inode) < iocb->ki_pos + size) { 379 zonefs_update_stats(inode, iocb->ki_pos + size); 380 zonefs_i_size_write(inode, iocb->ki_pos + size); 381 } 382 mutex_unlock(&zi->i_truncate_mutex); 383 } 384 385 return 0; 386 } 387 388 static const struct iomap_dio_ops zonefs_write_dio_ops = { 389 .end_io = zonefs_file_write_dio_end_io, 390 }; 391 392 /* 393 * Do not exceed the LFS limits nor the file zone size. If pos is under the 394 * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 395 */ 396 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, 397 loff_t count) 398 { 399 struct inode *inode = file_inode(file); 400 struct zonefs_zone *z = zonefs_inode_zone(inode); 401 loff_t limit = rlimit(RLIMIT_FSIZE); 402 loff_t max_size = z->z_capacity; 403 404 if (limit != RLIM_INFINITY) { 405 if (pos >= limit) { 406 send_sig(SIGXFSZ, current, 0); 407 return -EFBIG; 408 } 409 count = min(count, limit - pos); 410 } 411 412 if (!(file->f_flags & O_LARGEFILE)) 413 max_size = min_t(loff_t, MAX_NON_LFS, max_size); 414 415 if (unlikely(pos >= max_size)) 416 return -EFBIG; 417 418 return min(count, max_size - pos); 419 } 420 421 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) 422 { 423 struct file *file = iocb->ki_filp; 424 struct inode *inode = file_inode(file); 425 struct zonefs_inode_info *zi = ZONEFS_I(inode); 426 struct zonefs_zone *z = zonefs_inode_zone(inode); 427 loff_t count; 428 429 if (IS_SWAPFILE(inode)) 430 return -ETXTBSY; 431 432 if (!iov_iter_count(from)) 433 return 0; 434 435 if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 436 return -EINVAL; 437 438 if (iocb->ki_flags & IOCB_APPEND) { 439 if (zonefs_zone_is_cnv(z)) 440 return -EINVAL; 441 mutex_lock(&zi->i_truncate_mutex); 442 iocb->ki_pos = z->z_wpoffset; 443 mutex_unlock(&zi->i_truncate_mutex); 444 } 445 446 count = zonefs_write_check_limits(file, iocb->ki_pos, 447 iov_iter_count(from)); 448 if (count < 0) 449 return count; 450 451 iov_iter_truncate(from, count); 452 return iov_iter_count(from); 453 } 454 455 /* 456 * Handle direct writes. For sequential zone files, this is the only possible 457 * write path. For these files, check that the user is issuing writes 458 * sequentially from the end of the file. This code assumes that the block layer 459 * delivers write requests to the device in sequential order. This is always the 460 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE 461 * elevator feature is being used (e.g. mq-deadline). The block layer always 462 * automatically select such an elevator for zoned block devices during the 463 * device initialization. 464 */ 465 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) 466 { 467 struct inode *inode = file_inode(iocb->ki_filp); 468 struct zonefs_inode_info *zi = ZONEFS_I(inode); 469 struct zonefs_zone *z = zonefs_inode_zone(inode); 470 struct super_block *sb = inode->i_sb; 471 ssize_t ret, count; 472 473 /* 474 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT 475 * as this can cause write reordering (e.g. the first aio gets EAGAIN 476 * on the inode lock but the second goes through but is now unaligned). 477 */ 478 if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) && 479 (iocb->ki_flags & IOCB_NOWAIT)) 480 return -EOPNOTSUPP; 481 482 if (iocb->ki_flags & IOCB_NOWAIT) { 483 if (!inode_trylock(inode)) 484 return -EAGAIN; 485 } else { 486 inode_lock(inode); 487 } 488 489 count = zonefs_write_checks(iocb, from); 490 if (count <= 0) { 491 ret = count; 492 goto inode_unlock; 493 } 494 495 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 496 ret = -EINVAL; 497 goto inode_unlock; 498 } 499 500 /* Enforce sequential writes (append only) in sequential zones */ 501 if (zonefs_zone_is_seq(z)) { 502 mutex_lock(&zi->i_truncate_mutex); 503 if (iocb->ki_pos != z->z_wpoffset) { 504 mutex_unlock(&zi->i_truncate_mutex); 505 ret = -EINVAL; 506 goto inode_unlock; 507 } 508 /* 509 * Advance the zone write pointer offset. This assumes that the 510 * IO will succeed, which is OK to do because we do not allow 511 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO 512 * fails, the error path will correct the write pointer offset. 513 */ 514 z->z_wpoffset += count; 515 zonefs_inode_account_active(inode); 516 mutex_unlock(&zi->i_truncate_mutex); 517 } 518 519 /* 520 * iomap_dio_rw() may return ENOTBLK if there was an issue with 521 * page invalidation. Overwrite that error code with EBUSY so that 522 * the user can make sense of the error. 523 */ 524 ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, 525 &zonefs_write_dio_ops, 0, NULL, 0); 526 if (ret == -ENOTBLK) 527 ret = -EBUSY; 528 529 /* 530 * For a failed IO or partial completion, trigger error recovery 531 * to update the zone write pointer offset to a correct value. 532 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already 533 * have executed error recovery if the IO already completed when we 534 * reach here. However, we cannot know that and execute error recovery 535 * again (that will not change anything). 536 */ 537 if (zonefs_zone_is_seq(z)) { 538 if (ret > 0 && ret != count) 539 ret = -EIO; 540 if (ret < 0 && ret != -EIOCBQUEUED) 541 zonefs_io_error(inode, true); 542 } 543 544 inode_unlock: 545 inode_unlock(inode); 546 547 return ret; 548 } 549 550 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, 551 struct iov_iter *from) 552 { 553 struct inode *inode = file_inode(iocb->ki_filp); 554 ssize_t ret; 555 556 /* 557 * Direct IO writes are mandatory for sequential zone files so that the 558 * write IO issuing order is preserved. 559 */ 560 if (zonefs_inode_is_seq(inode)) 561 return -EIO; 562 563 if (iocb->ki_flags & IOCB_NOWAIT) { 564 if (!inode_trylock(inode)) 565 return -EAGAIN; 566 } else { 567 inode_lock(inode); 568 } 569 570 ret = zonefs_write_checks(iocb, from); 571 if (ret <= 0) 572 goto inode_unlock; 573 574 ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL); 575 if (ret == -EIO) 576 zonefs_io_error(inode, true); 577 578 inode_unlock: 579 inode_unlock(inode); 580 if (ret > 0) 581 ret = generic_write_sync(iocb, ret); 582 583 return ret; 584 } 585 586 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 587 { 588 struct inode *inode = file_inode(iocb->ki_filp); 589 struct zonefs_zone *z = zonefs_inode_zone(inode); 590 591 if (unlikely(IS_IMMUTABLE(inode))) 592 return -EPERM; 593 594 if (sb_rdonly(inode->i_sb)) 595 return -EROFS; 596 597 /* Write operations beyond the zone capacity are not allowed */ 598 if (iocb->ki_pos >= z->z_capacity) 599 return -EFBIG; 600 601 if (iocb->ki_flags & IOCB_DIRECT) { 602 ssize_t ret = zonefs_file_dio_write(iocb, from); 603 604 if (ret != -ENOTBLK) 605 return ret; 606 } 607 608 return zonefs_file_buffered_write(iocb, from); 609 } 610 611 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, 612 int error, unsigned int flags) 613 { 614 if (error) { 615 zonefs_io_error(file_inode(iocb->ki_filp), false); 616 return error; 617 } 618 619 return 0; 620 } 621 622 static const struct iomap_dio_ops zonefs_read_dio_ops = { 623 .end_io = zonefs_file_read_dio_end_io, 624 }; 625 626 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 627 { 628 struct inode *inode = file_inode(iocb->ki_filp); 629 struct zonefs_inode_info *zi = ZONEFS_I(inode); 630 struct zonefs_zone *z = zonefs_inode_zone(inode); 631 struct super_block *sb = inode->i_sb; 632 loff_t isize; 633 ssize_t ret; 634 635 /* Offline zones cannot be read */ 636 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 637 return -EPERM; 638 639 if (iocb->ki_pos >= z->z_capacity) 640 return 0; 641 642 if (iocb->ki_flags & IOCB_NOWAIT) { 643 if (!inode_trylock_shared(inode)) 644 return -EAGAIN; 645 } else { 646 inode_lock_shared(inode); 647 } 648 649 /* Limit read operations to written data */ 650 mutex_lock(&zi->i_truncate_mutex); 651 isize = i_size_read(inode); 652 if (iocb->ki_pos >= isize) { 653 mutex_unlock(&zi->i_truncate_mutex); 654 ret = 0; 655 goto inode_unlock; 656 } 657 iov_iter_truncate(to, isize - iocb->ki_pos); 658 mutex_unlock(&zi->i_truncate_mutex); 659 660 if (iocb->ki_flags & IOCB_DIRECT) { 661 size_t count = iov_iter_count(to); 662 663 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 664 ret = -EINVAL; 665 goto inode_unlock; 666 } 667 file_accessed(iocb->ki_filp); 668 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, 669 &zonefs_read_dio_ops, 0, NULL, 0); 670 } else { 671 ret = generic_file_read_iter(iocb, to); 672 if (ret == -EIO) 673 zonefs_io_error(inode, false); 674 } 675 676 inode_unlock: 677 inode_unlock_shared(inode); 678 679 return ret; 680 } 681 682 static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos, 683 struct pipe_inode_info *pipe, 684 size_t len, unsigned int flags) 685 { 686 struct inode *inode = file_inode(in); 687 struct zonefs_inode_info *zi = ZONEFS_I(inode); 688 struct zonefs_zone *z = zonefs_inode_zone(inode); 689 loff_t isize; 690 ssize_t ret = 0; 691 692 /* Offline zones cannot be read */ 693 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 694 return -EPERM; 695 696 if (*ppos >= z->z_capacity) 697 return 0; 698 699 inode_lock_shared(inode); 700 701 /* Limit read operations to written data */ 702 mutex_lock(&zi->i_truncate_mutex); 703 isize = i_size_read(inode); 704 if (*ppos >= isize) 705 len = 0; 706 else 707 len = min_t(loff_t, len, isize - *ppos); 708 mutex_unlock(&zi->i_truncate_mutex); 709 710 if (len > 0) { 711 ret = filemap_splice_read(in, ppos, pipe, len, flags); 712 if (ret == -EIO) 713 zonefs_io_error(inode, false); 714 } 715 716 inode_unlock_shared(inode); 717 return ret; 718 } 719 720 /* 721 * Write open accounting is done only for sequential files. 722 */ 723 static inline bool zonefs_seq_file_need_wro(struct inode *inode, 724 struct file *file) 725 { 726 if (zonefs_inode_is_cnv(inode)) 727 return false; 728 729 if (!(file->f_mode & FMODE_WRITE)) 730 return false; 731 732 return true; 733 } 734 735 static int zonefs_seq_file_write_open(struct inode *inode) 736 { 737 struct zonefs_inode_info *zi = ZONEFS_I(inode); 738 struct zonefs_zone *z = zonefs_inode_zone(inode); 739 int ret = 0; 740 741 mutex_lock(&zi->i_truncate_mutex); 742 743 if (!zi->i_wr_refcnt) { 744 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 745 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); 746 747 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { 748 749 if (sbi->s_max_wro_seq_files 750 && wro > sbi->s_max_wro_seq_files) { 751 atomic_dec(&sbi->s_wro_seq_files); 752 ret = -EBUSY; 753 goto unlock; 754 } 755 756 if (i_size_read(inode) < z->z_capacity) { 757 ret = zonefs_inode_zone_mgmt(inode, 758 REQ_OP_ZONE_OPEN); 759 if (ret) { 760 atomic_dec(&sbi->s_wro_seq_files); 761 goto unlock; 762 } 763 z->z_flags |= ZONEFS_ZONE_OPEN; 764 zonefs_inode_account_active(inode); 765 } 766 } 767 } 768 769 zi->i_wr_refcnt++; 770 771 unlock: 772 mutex_unlock(&zi->i_truncate_mutex); 773 774 return ret; 775 } 776 777 static int zonefs_file_open(struct inode *inode, struct file *file) 778 { 779 int ret; 780 781 file->f_mode |= FMODE_CAN_ODIRECT; 782 ret = generic_file_open(inode, file); 783 if (ret) 784 return ret; 785 786 if (zonefs_seq_file_need_wro(inode, file)) 787 return zonefs_seq_file_write_open(inode); 788 789 return 0; 790 } 791 792 static void zonefs_seq_file_write_close(struct inode *inode) 793 { 794 struct zonefs_inode_info *zi = ZONEFS_I(inode); 795 struct zonefs_zone *z = zonefs_inode_zone(inode); 796 struct super_block *sb = inode->i_sb; 797 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 798 int ret = 0; 799 800 mutex_lock(&zi->i_truncate_mutex); 801 802 zi->i_wr_refcnt--; 803 if (zi->i_wr_refcnt) 804 goto unlock; 805 806 /* 807 * The file zone may not be open anymore (e.g. the file was truncated to 808 * its maximum size or it was fully written). For this case, we only 809 * need to decrement the write open count. 810 */ 811 if (z->z_flags & ZONEFS_ZONE_OPEN) { 812 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); 813 if (ret) { 814 __zonefs_io_error(inode, false); 815 /* 816 * Leaving zones explicitly open may lead to a state 817 * where most zones cannot be written (zone resources 818 * exhausted). So take preventive action by remounting 819 * read-only. 820 */ 821 if (z->z_flags & ZONEFS_ZONE_OPEN && 822 !(sb->s_flags & SB_RDONLY)) { 823 zonefs_warn(sb, 824 "closing zone at %llu failed %d\n", 825 z->z_sector, ret); 826 zonefs_warn(sb, 827 "remounting filesystem read-only\n"); 828 sb->s_flags |= SB_RDONLY; 829 } 830 goto unlock; 831 } 832 833 z->z_flags &= ~ZONEFS_ZONE_OPEN; 834 zonefs_inode_account_active(inode); 835 } 836 837 atomic_dec(&sbi->s_wro_seq_files); 838 839 unlock: 840 mutex_unlock(&zi->i_truncate_mutex); 841 } 842 843 static int zonefs_file_release(struct inode *inode, struct file *file) 844 { 845 /* 846 * If we explicitly open a zone we must close it again as well, but the 847 * zone management operation can fail (either due to an IO error or as 848 * the zone has gone offline or read-only). Make sure we don't fail the 849 * close(2) for user-space. 850 */ 851 if (zonefs_seq_file_need_wro(inode, file)) 852 zonefs_seq_file_write_close(inode); 853 854 return 0; 855 } 856 857 const struct file_operations zonefs_file_operations = { 858 .open = zonefs_file_open, 859 .release = zonefs_file_release, 860 .fsync = zonefs_file_fsync, 861 .mmap = zonefs_file_mmap, 862 .llseek = zonefs_file_llseek, 863 .read_iter = zonefs_file_read_iter, 864 .write_iter = zonefs_file_write_iter, 865 .splice_read = zonefs_file_splice_read, 866 .splice_write = iter_file_splice_write, 867 .iopoll = iocb_bio_iopoll, 868 }; 869