1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Simple file system for zoned block devices exposing zones as files. 4 * 5 * Copyright (C) 2022 Western Digital Corporation or its affiliates. 6 */ 7 #include <linux/module.h> 8 #include <linux/pagemap.h> 9 #include <linux/iomap.h> 10 #include <linux/init.h> 11 #include <linux/slab.h> 12 #include <linux/blkdev.h> 13 #include <linux/statfs.h> 14 #include <linux/writeback.h> 15 #include <linux/quotaops.h> 16 #include <linux/seq_file.h> 17 #include <linux/parser.h> 18 #include <linux/uio.h> 19 #include <linux/mman.h> 20 #include <linux/sched/mm.h> 21 #include <linux/task_io_accounting_ops.h> 22 23 #include "zonefs.h" 24 25 #include "trace.h" 26 27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, 28 loff_t length, unsigned int flags, 29 struct iomap *iomap, struct iomap *srcmap) 30 { 31 struct zonefs_inode_info *zi = ZONEFS_I(inode); 32 struct zonefs_zone *z = zonefs_inode_zone(inode); 33 struct super_block *sb = inode->i_sb; 34 loff_t isize; 35 36 /* 37 * All blocks are always mapped below EOF. If reading past EOF, 38 * act as if there is a hole up to the file maximum size. 39 */ 40 mutex_lock(&zi->i_truncate_mutex); 41 iomap->bdev = inode->i_sb->s_bdev; 42 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 43 isize = i_size_read(inode); 44 if (iomap->offset >= isize) { 45 iomap->type = IOMAP_HOLE; 46 iomap->addr = IOMAP_NULL_ADDR; 47 iomap->length = length; 48 } else { 49 iomap->type = IOMAP_MAPPED; 50 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 51 iomap->length = isize - iomap->offset; 52 } 53 mutex_unlock(&zi->i_truncate_mutex); 54 55 trace_zonefs_iomap_begin(inode, iomap); 56 57 return 0; 58 } 59 60 static const struct iomap_ops zonefs_read_iomap_ops = { 61 .iomap_begin = zonefs_read_iomap_begin, 62 }; 63 64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, 65 loff_t length, unsigned int flags, 66 struct iomap *iomap, struct iomap *srcmap) 67 { 68 struct zonefs_inode_info *zi = ZONEFS_I(inode); 69 struct zonefs_zone *z = zonefs_inode_zone(inode); 70 struct super_block *sb = inode->i_sb; 71 loff_t isize; 72 73 /* All write I/Os should always be within the file maximum size */ 74 if (WARN_ON_ONCE(offset + length > z->z_capacity)) 75 return -EIO; 76 77 /* 78 * Sequential zones can only accept direct writes. This is already 79 * checked when writes are issued, so warn if we see a page writeback 80 * operation. 81 */ 82 if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) 83 return -EIO; 84 85 /* 86 * For conventional zones, all blocks are always mapped. For sequential 87 * zones, all blocks after always mapped below the inode size (zone 88 * write pointer) and unwritten beyond. 89 */ 90 mutex_lock(&zi->i_truncate_mutex); 91 iomap->bdev = inode->i_sb->s_bdev; 92 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 93 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 94 isize = i_size_read(inode); 95 if (iomap->offset >= isize) { 96 iomap->type = IOMAP_UNWRITTEN; 97 iomap->length = z->z_capacity - iomap->offset; 98 } else { 99 iomap->type = IOMAP_MAPPED; 100 iomap->length = isize - iomap->offset; 101 } 102 mutex_unlock(&zi->i_truncate_mutex); 103 104 trace_zonefs_iomap_begin(inode, iomap); 105 106 return 0; 107 } 108 109 static const struct iomap_ops zonefs_write_iomap_ops = { 110 .iomap_begin = zonefs_write_iomap_begin, 111 }; 112 113 static int zonefs_read_folio(struct file *unused, struct folio *folio) 114 { 115 iomap_bio_read_folio(folio, &zonefs_read_iomap_ops); 116 return 0; 117 } 118 119 static void zonefs_readahead(struct readahead_control *rac) 120 { 121 iomap_bio_readahead(rac, &zonefs_read_iomap_ops); 122 } 123 124 /* 125 * Map blocks for page writeback. This is used only on conventional zone files, 126 * which implies that the page range can only be within the fixed inode size. 127 */ 128 static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc, 129 struct folio *folio, u64 offset, unsigned len, u64 end_pos) 130 { 131 struct zonefs_zone *z = zonefs_inode_zone(wpc->inode); 132 133 if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) 134 return -EIO; 135 if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode))) 136 return -EIO; 137 138 /* If the mapping is already OK, nothing needs to be done */ 139 if (offset < wpc->iomap.offset || 140 offset >= wpc->iomap.offset + wpc->iomap.length) { 141 int error; 142 143 error = zonefs_write_iomap_begin(wpc->inode, offset, 144 z->z_capacity - offset, IOMAP_WRITE, 145 &wpc->iomap, NULL); 146 if (error) 147 return error; 148 } 149 150 return iomap_add_to_ioend(wpc, folio, offset, end_pos, len); 151 } 152 153 static const struct iomap_writeback_ops zonefs_writeback_ops = { 154 .writeback_range = zonefs_writeback_range, 155 .writeback_submit = iomap_ioend_writeback_submit, 156 }; 157 158 static int zonefs_writepages(struct address_space *mapping, 159 struct writeback_control *wbc) 160 { 161 struct iomap_writepage_ctx wpc = { 162 .inode = mapping->host, 163 .wbc = wbc, 164 .ops = &zonefs_writeback_ops, 165 }; 166 167 return iomap_writepages(&wpc); 168 } 169 170 static int zonefs_swap_activate(struct swap_info_struct *sis, 171 struct file *swap_file, sector_t *span) 172 { 173 struct inode *inode = file_inode(swap_file); 174 175 if (zonefs_inode_is_seq(inode)) { 176 zonefs_err(inode->i_sb, 177 "swap file: not a conventional zone file\n"); 178 return -EINVAL; 179 } 180 181 return iomap_swapfile_activate(sis, swap_file, span, 182 &zonefs_read_iomap_ops); 183 } 184 185 const struct address_space_operations zonefs_file_aops = { 186 .read_folio = zonefs_read_folio, 187 .readahead = zonefs_readahead, 188 .writepages = zonefs_writepages, 189 .dirty_folio = iomap_dirty_folio, 190 .release_folio = iomap_release_folio, 191 .invalidate_folio = iomap_invalidate_folio, 192 .migrate_folio = filemap_migrate_folio, 193 .is_partially_uptodate = iomap_is_partially_uptodate, 194 .error_remove_folio = generic_error_remove_folio, 195 .swap_activate = zonefs_swap_activate, 196 }; 197 198 int zonefs_file_truncate(struct inode *inode, loff_t isize) 199 { 200 struct zonefs_inode_info *zi = ZONEFS_I(inode); 201 struct zonefs_zone *z = zonefs_inode_zone(inode); 202 loff_t old_isize; 203 enum req_op op; 204 int ret = 0; 205 206 /* 207 * Only sequential zone files can be truncated and truncation is allowed 208 * only down to a 0 size, which is equivalent to a zone reset, and to 209 * the maximum file size, which is equivalent to a zone finish. 210 */ 211 if (!zonefs_zone_is_seq(z)) 212 return -EPERM; 213 214 if (!isize) 215 op = REQ_OP_ZONE_RESET; 216 else if (isize == z->z_capacity) 217 op = REQ_OP_ZONE_FINISH; 218 else 219 return -EPERM; 220 221 inode_dio_wait(inode); 222 223 /* Serialize against page faults */ 224 filemap_invalidate_lock(inode->i_mapping); 225 226 /* Serialize against zonefs_iomap_begin() */ 227 mutex_lock(&zi->i_truncate_mutex); 228 229 old_isize = i_size_read(inode); 230 if (isize == old_isize) 231 goto unlock; 232 233 ret = zonefs_inode_zone_mgmt(inode, op); 234 if (ret) 235 goto unlock; 236 237 /* 238 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, 239 * take care of open zones. 240 */ 241 if (z->z_flags & ZONEFS_ZONE_OPEN) { 242 /* 243 * Truncating a zone to EMPTY or FULL is the equivalent of 244 * closing the zone. For a truncation to 0, we need to 245 * re-open the zone to ensure new writes can be processed. 246 * For a truncation to the maximum file size, the zone is 247 * closed and writes cannot be accepted anymore, so clear 248 * the open flag. 249 */ 250 if (!isize) 251 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 252 else 253 z->z_flags &= ~ZONEFS_ZONE_OPEN; 254 } 255 256 zonefs_update_stats(inode, isize); 257 truncate_setsize(inode, isize); 258 z->z_wpoffset = isize; 259 zonefs_inode_account_active(inode); 260 261 unlock: 262 mutex_unlock(&zi->i_truncate_mutex); 263 filemap_invalidate_unlock(inode->i_mapping); 264 265 return ret; 266 } 267 268 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, 269 int datasync) 270 { 271 struct inode *inode = file_inode(file); 272 int ret = 0; 273 274 if (unlikely(IS_IMMUTABLE(inode))) 275 return -EPERM; 276 277 /* 278 * Since only direct writes are allowed in sequential files, page cache 279 * flush is needed only for conventional zone files. 280 */ 281 if (zonefs_inode_is_cnv(inode)) 282 ret = file_write_and_wait_range(file, start, end); 283 if (!ret) 284 ret = blkdev_issue_flush(inode->i_sb->s_bdev); 285 286 if (ret) 287 zonefs_io_error(inode, true); 288 289 return ret; 290 } 291 292 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) 293 { 294 struct inode *inode = file_inode(vmf->vma->vm_file); 295 vm_fault_t ret; 296 297 if (unlikely(IS_IMMUTABLE(inode))) 298 return VM_FAULT_SIGBUS; 299 300 /* 301 * Sanity check: only conventional zone files can have shared 302 * writeable mappings. 303 */ 304 if (zonefs_inode_is_seq(inode)) 305 return VM_FAULT_NOPAGE; 306 307 sb_start_pagefault(inode->i_sb); 308 file_update_time(vmf->vma->vm_file); 309 310 /* Serialize against truncates */ 311 filemap_invalidate_lock_shared(inode->i_mapping); 312 ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); 313 filemap_invalidate_unlock_shared(inode->i_mapping); 314 315 sb_end_pagefault(inode->i_sb); 316 return ret; 317 } 318 319 static const struct vm_operations_struct zonefs_file_vm_ops = { 320 .fault = filemap_fault, 321 .map_pages = filemap_map_pages, 322 .page_mkwrite = zonefs_filemap_page_mkwrite, 323 }; 324 325 static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) 326 { 327 struct file *file = desc->file; 328 329 /* 330 * Conventional zones accept random writes, so their files can support 331 * shared writable mappings. For sequential zone files, only read 332 * mappings are possible since there are no guarantees for write 333 * ordering between msync() and page cache writeback. 334 */ 335 if (zonefs_inode_is_seq(file_inode(file)) && 336 vma_desc_test_flags(desc, VMA_SHARED_BIT) && 337 vma_desc_test_flags(desc, VMA_MAYWRITE_BIT)) 338 return -EINVAL; 339 340 file_accessed(file); 341 desc->vm_ops = &zonefs_file_vm_ops; 342 343 return 0; 344 } 345 346 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) 347 { 348 loff_t isize = i_size_read(file_inode(file)); 349 350 /* 351 * Seeks are limited to below the zone size for conventional zones 352 * and below the zone write pointer for sequential zones. In both 353 * cases, this limit is the inode size. 354 */ 355 return generic_file_llseek_size(file, offset, whence, isize, isize); 356 } 357 358 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, 359 int error, unsigned int flags) 360 { 361 struct inode *inode = file_inode(iocb->ki_filp); 362 struct zonefs_inode_info *zi = ZONEFS_I(inode); 363 364 if (error) { 365 /* 366 * For Sync IOs, error recovery is called from 367 * zonefs_file_dio_write(). 368 */ 369 if (!is_sync_kiocb(iocb)) 370 zonefs_io_error(inode, true); 371 return error; 372 } 373 374 if (size && zonefs_inode_is_seq(inode)) { 375 /* 376 * Note that we may be seeing completions out of order, 377 * but that is not a problem since a write completed 378 * successfully necessarily means that all preceding writes 379 * were also successful. So we can safely increase the inode 380 * size to the write end location. 381 */ 382 mutex_lock(&zi->i_truncate_mutex); 383 if (i_size_read(inode) < iocb->ki_pos + size) { 384 zonefs_update_stats(inode, iocb->ki_pos + size); 385 zonefs_i_size_write(inode, iocb->ki_pos + size); 386 } 387 mutex_unlock(&zi->i_truncate_mutex); 388 } 389 390 return 0; 391 } 392 393 static const struct iomap_dio_ops zonefs_write_dio_ops = { 394 .end_io = zonefs_file_write_dio_end_io, 395 }; 396 397 /* 398 * Do not exceed the LFS limits nor the file zone size. If pos is under the 399 * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 400 */ 401 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, 402 loff_t count) 403 { 404 struct inode *inode = file_inode(file); 405 struct zonefs_zone *z = zonefs_inode_zone(inode); 406 loff_t limit = rlimit(RLIMIT_FSIZE); 407 loff_t max_size = z->z_capacity; 408 409 if (limit != RLIM_INFINITY) { 410 if (pos >= limit) { 411 send_sig(SIGXFSZ, current, 0); 412 return -EFBIG; 413 } 414 count = min(count, limit - pos); 415 } 416 417 if (!(file->f_flags & O_LARGEFILE)) 418 max_size = min_t(loff_t, MAX_NON_LFS, max_size); 419 420 if (unlikely(pos >= max_size)) 421 return -EFBIG; 422 423 return min(count, max_size - pos); 424 } 425 426 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) 427 { 428 struct file *file = iocb->ki_filp; 429 struct inode *inode = file_inode(file); 430 struct zonefs_inode_info *zi = ZONEFS_I(inode); 431 struct zonefs_zone *z = zonefs_inode_zone(inode); 432 loff_t count; 433 434 if (IS_SWAPFILE(inode)) 435 return -ETXTBSY; 436 437 if (!iov_iter_count(from)) 438 return 0; 439 440 if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 441 return -EINVAL; 442 443 if (iocb->ki_flags & IOCB_APPEND) { 444 if (zonefs_zone_is_cnv(z)) 445 return -EINVAL; 446 mutex_lock(&zi->i_truncate_mutex); 447 iocb->ki_pos = z->z_wpoffset; 448 mutex_unlock(&zi->i_truncate_mutex); 449 } 450 451 count = zonefs_write_check_limits(file, iocb->ki_pos, 452 iov_iter_count(from)); 453 if (count < 0) 454 return count; 455 456 iov_iter_truncate(from, count); 457 return iov_iter_count(from); 458 } 459 460 /* 461 * Handle direct writes. For sequential zone files, this is the only possible 462 * write path. For these files, check that the user is issuing writes 463 * sequentially from the end of the file. This code assumes that the block layer 464 * delivers write requests to the device in sequential order. This is always the 465 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE 466 * elevator feature is being used (e.g. mq-deadline). The block layer always 467 * automatically select such an elevator for zoned block devices during the 468 * device initialization. 469 */ 470 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) 471 { 472 struct inode *inode = file_inode(iocb->ki_filp); 473 struct zonefs_inode_info *zi = ZONEFS_I(inode); 474 struct zonefs_zone *z = zonefs_inode_zone(inode); 475 struct super_block *sb = inode->i_sb; 476 ssize_t ret, count; 477 478 /* 479 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT 480 * as this can cause write reordering (e.g. the first aio gets EAGAIN 481 * on the inode lock but the second goes through but is now unaligned). 482 */ 483 if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) && 484 (iocb->ki_flags & IOCB_NOWAIT)) 485 return -EOPNOTSUPP; 486 487 if (iocb->ki_flags & IOCB_NOWAIT) { 488 if (!inode_trylock(inode)) 489 return -EAGAIN; 490 } else { 491 inode_lock(inode); 492 } 493 494 count = zonefs_write_checks(iocb, from); 495 if (count <= 0) { 496 ret = count; 497 goto inode_unlock; 498 } 499 500 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 501 ret = -EINVAL; 502 goto inode_unlock; 503 } 504 505 /* Enforce sequential writes (append only) in sequential zones */ 506 if (zonefs_zone_is_seq(z)) { 507 mutex_lock(&zi->i_truncate_mutex); 508 if (iocb->ki_pos != z->z_wpoffset) { 509 mutex_unlock(&zi->i_truncate_mutex); 510 ret = -EINVAL; 511 goto inode_unlock; 512 } 513 /* 514 * Advance the zone write pointer offset. This assumes that the 515 * IO will succeed, which is OK to do because we do not allow 516 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO 517 * fails, the error path will correct the write pointer offset. 518 */ 519 z->z_wpoffset += count; 520 zonefs_inode_account_active(inode); 521 mutex_unlock(&zi->i_truncate_mutex); 522 } 523 524 /* 525 * iomap_dio_rw() may return ENOTBLK if there was an issue with 526 * page invalidation. Overwrite that error code with EBUSY so that 527 * the user can make sense of the error. 528 */ 529 ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, 530 &zonefs_write_dio_ops, 0, NULL, 0); 531 if (ret == -ENOTBLK) 532 ret = -EBUSY; 533 534 /* 535 * For a failed IO or partial completion, trigger error recovery 536 * to update the zone write pointer offset to a correct value. 537 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already 538 * have executed error recovery if the IO already completed when we 539 * reach here. However, we cannot know that and execute error recovery 540 * again (that will not change anything). 541 */ 542 if (zonefs_zone_is_seq(z)) { 543 if (ret > 0 && ret != count) 544 ret = -EIO; 545 if (ret < 0 && ret != -EIOCBQUEUED) 546 zonefs_io_error(inode, true); 547 } 548 549 inode_unlock: 550 inode_unlock(inode); 551 552 return ret; 553 } 554 555 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, 556 struct iov_iter *from) 557 { 558 struct inode *inode = file_inode(iocb->ki_filp); 559 ssize_t ret; 560 561 /* 562 * Direct IO writes are mandatory for sequential zone files so that the 563 * write IO issuing order is preserved. 564 */ 565 if (zonefs_inode_is_seq(inode)) 566 return -EIO; 567 568 if (iocb->ki_flags & IOCB_NOWAIT) { 569 if (!inode_trylock(inode)) 570 return -EAGAIN; 571 } else { 572 inode_lock(inode); 573 } 574 575 ret = zonefs_write_checks(iocb, from); 576 if (ret <= 0) 577 goto inode_unlock; 578 579 ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, 580 NULL, NULL); 581 if (ret == -EIO) 582 zonefs_io_error(inode, true); 583 584 inode_unlock: 585 inode_unlock(inode); 586 if (ret > 0) 587 ret = generic_write_sync(iocb, ret); 588 589 return ret; 590 } 591 592 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 593 { 594 struct inode *inode = file_inode(iocb->ki_filp); 595 struct zonefs_zone *z = zonefs_inode_zone(inode); 596 597 if (unlikely(IS_IMMUTABLE(inode))) 598 return -EPERM; 599 600 if (sb_rdonly(inode->i_sb)) 601 return -EROFS; 602 603 /* Write operations beyond the zone capacity are not allowed */ 604 if (iocb->ki_pos >= z->z_capacity) 605 return -EFBIG; 606 607 if (iocb->ki_flags & IOCB_DIRECT) { 608 ssize_t ret = zonefs_file_dio_write(iocb, from); 609 610 if (ret != -ENOTBLK) 611 return ret; 612 } 613 614 return zonefs_file_buffered_write(iocb, from); 615 } 616 617 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, 618 int error, unsigned int flags) 619 { 620 if (error) { 621 zonefs_io_error(file_inode(iocb->ki_filp), false); 622 return error; 623 } 624 625 return 0; 626 } 627 628 static const struct iomap_dio_ops zonefs_read_dio_ops = { 629 .end_io = zonefs_file_read_dio_end_io, 630 }; 631 632 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 633 { 634 struct inode *inode = file_inode(iocb->ki_filp); 635 struct zonefs_inode_info *zi = ZONEFS_I(inode); 636 struct zonefs_zone *z = zonefs_inode_zone(inode); 637 struct super_block *sb = inode->i_sb; 638 loff_t isize; 639 ssize_t ret; 640 641 /* Offline zones cannot be read */ 642 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 643 return -EPERM; 644 645 if (iocb->ki_pos >= z->z_capacity) 646 return 0; 647 648 if (iocb->ki_flags & IOCB_NOWAIT) { 649 if (!inode_trylock_shared(inode)) 650 return -EAGAIN; 651 } else { 652 inode_lock_shared(inode); 653 } 654 655 /* Limit read operations to written data */ 656 mutex_lock(&zi->i_truncate_mutex); 657 isize = i_size_read(inode); 658 if (iocb->ki_pos >= isize) { 659 mutex_unlock(&zi->i_truncate_mutex); 660 ret = 0; 661 goto inode_unlock; 662 } 663 iov_iter_truncate(to, isize - iocb->ki_pos); 664 mutex_unlock(&zi->i_truncate_mutex); 665 666 if (iocb->ki_flags & IOCB_DIRECT) { 667 size_t count = iov_iter_count(to); 668 669 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 670 ret = -EINVAL; 671 goto inode_unlock; 672 } 673 file_accessed(iocb->ki_filp); 674 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, 675 &zonefs_read_dio_ops, 0, NULL, 0); 676 } else { 677 ret = generic_file_read_iter(iocb, to); 678 if (ret == -EIO) 679 zonefs_io_error(inode, false); 680 } 681 682 inode_unlock: 683 inode_unlock_shared(inode); 684 685 return ret; 686 } 687 688 static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos, 689 struct pipe_inode_info *pipe, 690 size_t len, unsigned int flags) 691 { 692 struct inode *inode = file_inode(in); 693 struct zonefs_inode_info *zi = ZONEFS_I(inode); 694 struct zonefs_zone *z = zonefs_inode_zone(inode); 695 loff_t isize; 696 ssize_t ret = 0; 697 698 /* Offline zones cannot be read */ 699 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 700 return -EPERM; 701 702 if (*ppos >= z->z_capacity) 703 return 0; 704 705 inode_lock_shared(inode); 706 707 /* Limit read operations to written data */ 708 mutex_lock(&zi->i_truncate_mutex); 709 isize = i_size_read(inode); 710 if (*ppos >= isize) 711 len = 0; 712 else 713 len = min_t(loff_t, len, isize - *ppos); 714 mutex_unlock(&zi->i_truncate_mutex); 715 716 if (len > 0) { 717 ret = filemap_splice_read(in, ppos, pipe, len, flags); 718 if (ret == -EIO) 719 zonefs_io_error(inode, false); 720 } 721 722 inode_unlock_shared(inode); 723 return ret; 724 } 725 726 /* 727 * Write open accounting is done only for sequential files. 728 */ 729 static inline bool zonefs_seq_file_need_wro(struct inode *inode, 730 struct file *file) 731 { 732 if (zonefs_inode_is_cnv(inode)) 733 return false; 734 735 if (!(file->f_mode & FMODE_WRITE)) 736 return false; 737 738 return true; 739 } 740 741 static int zonefs_seq_file_write_open(struct inode *inode) 742 { 743 struct zonefs_inode_info *zi = ZONEFS_I(inode); 744 struct zonefs_zone *z = zonefs_inode_zone(inode); 745 int ret = 0; 746 747 mutex_lock(&zi->i_truncate_mutex); 748 749 if (!zi->i_wr_refcnt) { 750 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 751 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); 752 753 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { 754 755 if (sbi->s_max_wro_seq_files 756 && wro > sbi->s_max_wro_seq_files) { 757 atomic_dec(&sbi->s_wro_seq_files); 758 ret = -EBUSY; 759 goto unlock; 760 } 761 762 if (i_size_read(inode) < z->z_capacity) { 763 ret = zonefs_inode_zone_mgmt(inode, 764 REQ_OP_ZONE_OPEN); 765 if (ret) { 766 atomic_dec(&sbi->s_wro_seq_files); 767 goto unlock; 768 } 769 z->z_flags |= ZONEFS_ZONE_OPEN; 770 zonefs_inode_account_active(inode); 771 } 772 } 773 } 774 775 zi->i_wr_refcnt++; 776 777 unlock: 778 mutex_unlock(&zi->i_truncate_mutex); 779 780 return ret; 781 } 782 783 static int zonefs_file_open(struct inode *inode, struct file *file) 784 { 785 int ret; 786 787 file->f_mode |= FMODE_CAN_ODIRECT; 788 ret = generic_file_open(inode, file); 789 if (ret) 790 return ret; 791 792 if (zonefs_seq_file_need_wro(inode, file)) 793 return zonefs_seq_file_write_open(inode); 794 795 return 0; 796 } 797 798 static void zonefs_seq_file_write_close(struct inode *inode) 799 { 800 struct zonefs_inode_info *zi = ZONEFS_I(inode); 801 struct zonefs_zone *z = zonefs_inode_zone(inode); 802 struct super_block *sb = inode->i_sb; 803 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 804 int ret = 0; 805 806 mutex_lock(&zi->i_truncate_mutex); 807 808 zi->i_wr_refcnt--; 809 if (zi->i_wr_refcnt) 810 goto unlock; 811 812 /* 813 * The file zone may not be open anymore (e.g. the file was truncated to 814 * its maximum size or it was fully written). For this case, we only 815 * need to decrement the write open count. 816 */ 817 if (z->z_flags & ZONEFS_ZONE_OPEN) { 818 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); 819 if (ret) { 820 __zonefs_io_error(inode, false); 821 /* 822 * Leaving zones explicitly open may lead to a state 823 * where most zones cannot be written (zone resources 824 * exhausted). So take preventive action by remounting 825 * read-only. 826 */ 827 if (z->z_flags & ZONEFS_ZONE_OPEN && 828 !(sb->s_flags & SB_RDONLY)) { 829 zonefs_warn(sb, 830 "closing zone at %llu failed %d\n", 831 z->z_sector, ret); 832 zonefs_warn(sb, 833 "remounting filesystem read-only\n"); 834 sb->s_flags |= SB_RDONLY; 835 } 836 goto unlock; 837 } 838 839 z->z_flags &= ~ZONEFS_ZONE_OPEN; 840 zonefs_inode_account_active(inode); 841 } 842 843 atomic_dec(&sbi->s_wro_seq_files); 844 845 unlock: 846 mutex_unlock(&zi->i_truncate_mutex); 847 } 848 849 static int zonefs_file_release(struct inode *inode, struct file *file) 850 { 851 /* 852 * If we explicitly open a zone we must close it again as well, but the 853 * zone management operation can fail (either due to an IO error or as 854 * the zone has gone offline or read-only). Make sure we don't fail the 855 * close(2) for user-space. 856 */ 857 if (zonefs_seq_file_need_wro(inode, file)) 858 zonefs_seq_file_write_close(inode); 859 860 return 0; 861 } 862 863 const struct file_operations zonefs_file_operations = { 864 .open = zonefs_file_open, 865 .release = zonefs_file_release, 866 .fsync = zonefs_file_fsync, 867 .mmap_prepare = zonefs_file_mmap_prepare, 868 .llseek = zonefs_file_llseek, 869 .read_iter = zonefs_file_read_iter, 870 .write_iter = zonefs_file_write_iter, 871 .splice_read = zonefs_file_splice_read, 872 .splice_write = iter_file_splice_write, 873 .iopoll = iocb_bio_iopoll, 874 }; 875