// SPDX-License-Identifier: GPL-2.0
/*
 * Simple file system for zoned block devices exposing zones as files.
 *
 * Copyright (C) 2022 Western Digital Corporation or its affiliates.
 */
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/statfs.h>
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/task_io_accounting_ops.h>

#include "zonefs.h"

#include "trace.h"

/*
 * iomap_begin operation for reads. Fill @iomap for the block-aligned range
 * starting at @offset: a hole of @length bytes if the aligned offset is at or
 * beyond the inode size, otherwise a mapped extent running up to the inode
 * size. @flags and @srcmap are not used. Always returns 0.
 */
static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
				   loff_t length, unsigned int flags,
				   struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/*
	 * All blocks are always mapped below EOF. If reading past EOF,
	 * act as if there is a hole up to the file maximum size.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_HOLE;
		iomap->addr = IOMAP_NULL_ADDR;
		iomap->length = length;
	} else {
		iomap->type = IOMAP_MAPPED;
		/* Device byte address: zone start plus offset in the file */
		iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}

static const struct iomap_ops zonefs_read_iomap_ops = {
	.iomap_begin	= zonefs_read_iomap_begin,
};

/*
 * iomap_begin operation for writes. Fill @iomap for the block-aligned range
 * starting at @offset: an unwritten extent up to the zone capacity if the
 * aligned offset is at or beyond the inode size, otherwise a mapped extent
 * up to the inode size. @srcmap is not used.
 *
 * Returns 0 on success, or -EIO (with a one-time warning) if the range
 * exceeds the zone capacity or if a non-direct write targets a sequential
 * zone file.
 */
static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
				    loff_t length, unsigned int flags,
				    struct iomap *iomap, struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/* All write I/Os should always be within the file maximum size */
	if (WARN_ON_ONCE(offset + length > z->z_capacity))
		return -EIO;

	/*
	 * Sequential zones can only accept direct writes. This is already
	 * checked when writes are issued, so warn if we see a page writeback
	 * operation.
	 */
	if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT)))
		return -EIO;

	/*
	 * For conventional zones, all blocks are always mapped. For sequential
	 * zones, all blocks are always mapped below the inode size (zone
	 * write pointer) and unwritten beyond.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset;
	isize = i_size_read(inode);
	if (iomap->offset >= isize) {
		iomap->type = IOMAP_UNWRITTEN;
		iomap->length = z->z_capacity - iomap->offset;
	} else {
		iomap->type = IOMAP_MAPPED;
		iomap->length = isize - iomap->offset;
	}
	mutex_unlock(&zi->i_truncate_mutex);

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}

static const struct iomap_ops zonefs_write_iomap_ops = {
	.iomap_begin	= zonefs_write_iomap_begin,
};

/* Read a folio through iomap using the read mapping operations. */
static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
	return iomap_read_folio(folio, &zonefs_read_iomap_ops);
}

/* Readahead through iomap using the read mapping operations. */
static void zonefs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &zonefs_read_iomap_ops);
}

/*
 * Map blocks for page writeback. This is used only on conventional zone files,
 * which implies that the page range can only be within the fixed inode size.
 *
 * Returns -EIO (with a one-time warning) for a sequential zone file or for an
 * offset at or beyond the inode size, the error from
 * zonefs_write_iomap_begin() if remapping fails, and otherwise the result of
 * iomap_add_to_ioend().
 */
static ssize_t zonefs_writeback_range(struct iomap_writepage_ctx *wpc,
				      struct folio *folio, u64 offset,
				      unsigned len, u64 end_pos)
{
	struct zonefs_zone *z = zonefs_inode_zone(wpc->inode);

	if (WARN_ON_ONCE(zonefs_zone_is_seq(z)))
		return -EIO;
	if (WARN_ON_ONCE(offset >= i_size_read(wpc->inode)))
		return -EIO;

	/* If the mapping is already OK, nothing needs to be done */
	if (offset < wpc->iomap.offset ||
	    offset >= wpc->iomap.offset + wpc->iomap.length) {
		int error;

		/* Remap from offset to the end of the zone capacity */
		error = zonefs_write_iomap_begin(wpc->inode, offset,
						 z->z_capacity - offset,
						 IOMAP_WRITE, &wpc->iomap, NULL);
		if (error)
			return error;
	}

	return iomap_add_to_ioend(wpc, folio, offset, end_pos, len);
}

static const struct iomap_writeback_ops zonefs_writeback_ops = {
	.writeback_range	= zonefs_writeback_range,
	.writeback_submit	= iomap_ioend_writeback_submit,
};

/* Write back dirty pages of @mapping through iomap. */
static int zonefs_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = {
		.inode		= mapping->host,
		.wbc		= wbc,
		.ops		= &zonefs_writeback_ops,
	};

	return iomap_writepages(&wpc);
}

/*
 * Activate @swap_file as a swap file. Sequential zone files are rejected
 * with -EINVAL; other files are handed to iomap_swapfile_activate().
 */
static int zonefs_swap_activate(struct swap_info_struct *sis,
				struct file *swap_file, sector_t *span)
{
	struct inode *inode = file_inode(swap_file);

	if (zonefs_inode_is_seq(inode)) {
		zonefs_err(inode->i_sb,
			   "swap file: not a conventional zone file\n");
		return -EINVAL;
	}

	return iomap_swapfile_activate(sis, swap_file, span,
				       &zonefs_read_iomap_ops);
}

const struct address_space_operations zonefs_file_aops = {
	.read_folio		= zonefs_read_folio,
	.readahead		= zonefs_readahead,
	.writepages		= zonefs_writepages,
	.dirty_folio		= iomap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.migrate_folio		= filemap_migrate_folio,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_folio	= generic_error_remove_folio,
	.swap_activate		= zonefs_swap_activate,
};

/*
 * Truncate a sequential zone file to @isize, which must be either 0 (zone
 * reset) or the zone capacity (zone finish).
 *
 * Returns -EPERM for a non-sequential zone file or any other @isize, 0 if
 * the file already has the requested size, and otherwise the result of the
 * zone management operation(s).
 */
int zonefs_file_truncate(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t old_isize;
	enum req_op op;
	int ret = 0;

	/*
	 * Only sequential zone files can be truncated and truncation is allowed
	 * only down to a 0 size, which is equivalent to a zone reset, and to
	 * the maximum file size, which is equivalent to a zone finish.
	 */
	if (!zonefs_zone_is_seq(z))
		return -EPERM;

	if (!isize)
		op = REQ_OP_ZONE_RESET;
	else if (isize == z->z_capacity)
		op = REQ_OP_ZONE_FINISH;
	else
		return -EPERM;

	inode_dio_wait(inode);

	/* Serialize against page faults */
	filemap_invalidate_lock(inode->i_mapping);

	/* Serialize against zonefs_{read,write}_iomap_begin() */
	mutex_lock(&zi->i_truncate_mutex);

	old_isize = i_size_read(inode);
	if (isize == old_isize)
		goto unlock;

	ret = zonefs_inode_zone_mgmt(inode, op);
	if (ret)
		goto unlock;

	/*
	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
	 * take care of open zones.
	 */
	if (z->z_flags & ZONEFS_ZONE_OPEN) {
		/*
		 * Truncating a zone to EMPTY or FULL is the equivalent of
		 * closing the zone. For a truncation to 0, we need to
		 * re-open the zone to ensure new writes can be processed.
		 * For a truncation to the maximum file size, the zone is
		 * closed and writes cannot be accepted anymore, so clear
		 * the open flag.
		 */
		if (!isize)
			ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
		else
			z->z_flags &= ~ZONEFS_ZONE_OPEN;
	}

	/*
	 * Note: the size, write pointer offset and accounting below are
	 * updated even if the re-open above failed; that error is still
	 * returned to the caller through ret.
	 */
	zonefs_update_stats(inode, isize);
	truncate_setsize(inode, isize);
	z->z_wpoffset = isize;
	zonefs_inode_account_active(inode);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
	filemap_invalidate_unlock(inode->i_mapping);

	return ret;
}

/*
 * fsync operation. Returns -EPERM for an immutable inode. Otherwise, flush
 * the page cache range for conventional zone files and then issue a flush to
 * the block device. Any failure is reported through zonefs_io_error() and
 * returned to the caller.
 */
static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
			     int datasync)
{
	struct inode *inode = file_inode(file);
	int ret = 0;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	/*
	 * Since only direct writes are allowed in sequential files, page cache
	 * flush is needed only for conventional zone files.
	 */
	if (zonefs_inode_is_cnv(inode))
		ret = file_write_and_wait_range(file, start, end);
	if (!ret)
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);

	if (ret)
		zonefs_io_error(inode, true);

	return ret;
}

/*
 * page_mkwrite handler. Returns VM_FAULT_SIGBUS for an immutable inode and
 * VM_FAULT_NOPAGE for a sequential zone file; otherwise returns the result
 * of iomap_page_mkwrite() called with the invalidate lock held shared.
 */
static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	vm_fault_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return VM_FAULT_SIGBUS;

	/*
	 * Sanity check: only conventional zone files can have shared
	 * writeable mappings.
	 */
	if (zonefs_inode_is_seq(inode))
		return VM_FAULT_NOPAGE;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/* Serialize against truncates */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static const struct vm_operations_struct zonefs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= zonefs_filemap_page_mkwrite,
};

/*
 * mmap_prepare operation. Returns -EINVAL for a shared, potentially writable
 * mapping of a sequential zone file; otherwise marks the file accessed,
 * installs the zonefs vm operations and returns 0.
 */
static int zonefs_file_mmap_prepare(struct vm_area_desc *desc)
{
	struct file *file = desc->file;

	/*
	 * Conventional zones accept random writes, so their files can support
	 * shared writable mappings. For sequential zone files, only read
	 * mappings are possible since there are no guarantees for write
	 * ordering between msync() and page cache writeback.
	 */
	if (zonefs_inode_is_seq(file_inode(file)) &&
	    (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE))
		return -EINVAL;

	file_accessed(file);
	desc->vm_ops = &zonefs_file_vm_ops;

	return 0;
}

/* llseek operation: the inode size is passed as both maxsize and eof. */
static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t isize = i_size_read(file_inode(file));

	/*
	 * Seeks are limited to below the zone size for conventional zones
	 * and below the zone write pointer for sequential zones. In both
	 * cases, this limit is the inode size.
	 */
	return generic_file_llseek_size(file, offset, whence, isize, isize);
}

/*
 * Direct write completion handler. On error, trigger error recovery for
 * asynchronous IOs and return the error. On success for a sequential zone
 * file, grow the inode size to the end of the completed write if that is
 * beyond the current size. Returns 0 on success.
 */
static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
					int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (error) {
		/*
		 * For Sync IOs, error recovery is called from
		 * zonefs_file_dio_write().
		 */
		if (!is_sync_kiocb(iocb))
			zonefs_io_error(inode, true);
		return error;
	}

	if (size && zonefs_inode_is_seq(inode)) {
		/*
		 * Note that we may be seeing completions out of order,
		 * but that is not a problem since a write completed
		 * successfully necessarily means that all preceding writes
		 * were also successful. So we can safely increase the inode
		 * size to the write end location.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		if (i_size_read(inode) < iocb->ki_pos + size) {
			zonefs_update_stats(inode, iocb->ki_pos + size);
			zonefs_i_size_write(inode, iocb->ki_pos + size);
		}
		mutex_unlock(&zi->i_truncate_mutex);
	}

	return 0;
}

static const struct iomap_dio_ops zonefs_write_dio_ops = {
	.end_io		= zonefs_file_write_dio_end_io,
};

/*
 * Do not exceed the LFS limits nor the file zone size. If pos is under the
 * limit it becomes a short access. If it exceeds the limit, return -EFBIG.
 *
 * Returns the number of bytes that may be written at @pos (at most @count).
 * SIGXFSZ is sent to the current task when @pos is at or beyond the
 * RLIMIT_FSIZE limit.
 */
static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
					loff_t count)
{
	struct inode *inode = file_inode(file);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t limit = rlimit(RLIMIT_FSIZE);
	loff_t max_size = z->z_capacity;

	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		count = min(count, limit - pos);
	}

	/* Without O_LARGEFILE, also cap the file size at MAX_NON_LFS */
	if (!(file->f_flags & O_LARGEFILE))
		max_size = min_t(loff_t, MAX_NON_LFS, max_size);

	if (unlikely(pos >= max_size))
		return -EFBIG;

	return min(count, max_size - pos);
}

/*
 * Common checks for direct and buffered writes.
 *
 * Returns -ETXTBSY for a swap file, 0 if there is nothing to write, -EINVAL
 * for IOCB_NOWAIT without IOCB_DIRECT or for IOCB_APPEND on a conventional
 * zone file, or a negative error from zonefs_write_check_limits(). For
 * IOCB_APPEND on a sequential zone file, the write position is set to the
 * zone write pointer offset. On success, @from is truncated to the allowed
 * size and the resulting byte count is returned.
 */
static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t count;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EINVAL;

	if (iocb->ki_flags & IOCB_APPEND) {
		if (zonefs_zone_is_cnv(z))
			return -EINVAL;
		mutex_lock(&zi->i_truncate_mutex);
		iocb->ki_pos = z->z_wpoffset;
		mutex_unlock(&zi->i_truncate_mutex);
	}

	count = zonefs_write_check_limits(file, iocb->ki_pos,
					  iov_iter_count(from));
	if (count < 0)
		return count;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}

/*
 * Handle direct writes. For sequential zone files, this is the only possible
 * write path. For these files, check that the user is issuing writes
 * sequentially from the end of the file. This code assumes that the block layer
 * delivers write requests to the device in sequential order. This is always the
 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 * elevator feature is being used (e.g. mq-deadline). The block layer always
 * automatically selects such an elevator for zoned block devices during the
 * device initialization.
 */
static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	ssize_t ret, count;

	/*
	 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
	 * as this can cause write reordering (e.g. the first aio gets EAGAIN
	 * on the inode lock but the second goes through but is now unaligned).
	 */
	if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	count = zonefs_write_checks(iocb, from);
	if (count <= 0) {
		ret = count;
		goto inode_unlock;
	}

	/* Both the position and the size must be block size aligned */
	if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
		ret = -EINVAL;
		goto inode_unlock;
	}

	/* Enforce sequential writes (append only) in sequential zones */
	if (zonefs_zone_is_seq(z)) {
		mutex_lock(&zi->i_truncate_mutex);
		if (iocb->ki_pos != z->z_wpoffset) {
			mutex_unlock(&zi->i_truncate_mutex);
			ret = -EINVAL;
			goto inode_unlock;
		}
		/*
		 * Advance the zone write pointer offset. This assumes that the
		 * IO will succeed, which is OK to do because we do not allow
		 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
		 * fails, the error path will correct the write pointer offset.
		 */
		z->z_wpoffset += count;
		zonefs_inode_account_active(inode);
		mutex_unlock(&zi->i_truncate_mutex);
	}

	/*
	 * iomap_dio_rw() may return ENOTBLK if there was an issue with
	 * page invalidation. Overwrite that error code with EBUSY so that
	 * the user can make sense of the error.
	 */
	ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
			   &zonefs_write_dio_ops, 0, NULL, 0);
	if (ret == -ENOTBLK)
		ret = -EBUSY;

	/*
	 * For a failed IO or partial completion, trigger error recovery
	 * to update the zone write pointer offset to a correct value.
	 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
	 * have executed error recovery if the IO already completed when we
	 * reach here. However, we cannot know that and execute error recovery
	 * again (that will not change anything).
	 */
	if (zonefs_zone_is_seq(z)) {
		if (ret > 0 && ret != count)
			ret = -EIO;
		if (ret < 0 && ret != -EIOCBQUEUED)
			zonefs_io_error(inode, true);
	}

inode_unlock:
	inode_unlock(inode);

	return ret;
}

/*
 * Handle buffered writes. Returns -EIO for a sequential zone file and
 * -EAGAIN if IOCB_NOWAIT is set and the inode lock cannot be taken
 * immediately. An -EIO result from the buffered write triggers
 * zonefs_io_error(). A successful write is passed to generic_write_sync()
 * after the inode lock is released.
 */
static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	/*
	 * Direct IO writes are mandatory for sequential zone files so that the
	 * write IO issuing order is preserved.
	 */
	if (zonefs_inode_is_seq(inode))
		return -EIO;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = zonefs_write_checks(iocb, from);
	if (ret <= 0)
		goto inode_unlock;

	ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops,
					NULL, NULL);
	if (ret == -EIO)
		zonefs_io_error(inode, true);

inode_unlock:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);

	return ret;
}

/*
 * write_iter operation. Returns -EPERM for an immutable inode, -EROFS for a
 * read-only superblock and -EFBIG for a position at or beyond the zone
 * capacity. IOCB_DIRECT writes go through zonefs_file_dio_write(); all other
 * writes, and direct writes that returned -ENOTBLK, go through
 * zonefs_file_buffered_write().
 */
static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_zone *z = zonefs_inode_zone(inode);

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	if (sb_rdonly(inode->i_sb))
		return -EROFS;

	/* Write operations beyond the zone capacity are not allowed */
	if (iocb->ki_pos >= z->z_capacity)
		return -EFBIG;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ssize_t ret = zonefs_file_dio_write(iocb, from);

		if (ret != -ENOTBLK)
			return ret;
	}

	return zonefs_file_buffered_write(iocb, from);
}

/*
 * Direct read completion handler: on error, report it through
 * zonefs_io_error() and return it; otherwise return 0.
 */
static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
				       int error, unsigned int flags)
{
	if (error) {
		zonefs_io_error(file_inode(iocb->ki_filp), false);
		return error;
	}

	return 0;
}

static const struct iomap_dio_ops zonefs_read_dio_ops = {
	.end_io		= zonefs_file_read_dio_end_io,
};

/*
 * read_iter operation. Reads are limited to the inode size and return 0 at
 * or beyond it (or beyond the zone capacity). Returns -EPERM for an
 * immutable inode with no permission bits set, -EAGAIN if IOCB_NOWAIT is set
 * and the shared inode lock cannot be taken immediately, and -EINVAL for a
 * direct read that is not block size aligned.
 */
static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;
	ssize_t ret;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	if (iocb->ki_pos >= z->z_capacity)
		return 0;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (iocb->ki_pos >= isize) {
		mutex_unlock(&zi->i_truncate_mutex);
		ret = 0;
		goto inode_unlock;
	}
	iov_iter_truncate(to, isize - iocb->ki_pos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (iocb->ki_flags & IOCB_DIRECT) {
		size_t count = iov_iter_count(to);

		/* Both the position and the size must be block size aligned */
		if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
			ret = -EINVAL;
			goto inode_unlock;
		}
		file_accessed(iocb->ki_filp);
		ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
				   &zonefs_read_dio_ops, 0, NULL, 0);
	} else {
		ret = generic_file_read_iter(iocb, to);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

inode_unlock:
	inode_unlock_shared(inode);

	return ret;
}

/*
 * splice_read operation. As for zonefs_file_read_iter(), the length is
 * limited to the data below the inode size; 0 is returned when there is
 * nothing to read. Returns -EPERM for an immutable inode with no permission
 * bits set. An -EIO result from filemap_splice_read() triggers
 * zonefs_io_error().
 */
static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t len, unsigned int flags)
{
	struct inode *inode = file_inode(in);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	loff_t isize;
	ssize_t ret = 0;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	if (*ppos >= z->z_capacity)
		return 0;

	inode_lock_shared(inode);

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (*ppos >= isize)
		len = 0;
	else
		len = min_t(loff_t, len, isize - *ppos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (len > 0) {
		ret = filemap_splice_read(in, ppos, pipe, len, flags);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

	inode_unlock_shared(inode);
	return ret;
}

/*
 * Write open accounting is done only for sequential files.
 */
static inline bool zonefs_seq_file_need_wro(struct inode *inode,
					    struct file *file)
{
	if (zonefs_inode_is_cnv(inode))
		return false;

	if (!(file->f_mode & FMODE_WRITE))
		return false;

	return true;
}

/*
 * Account for a write open of a sequential zone file. On the first write
 * open of the inode, the count of write-open sequential files is
 * incremented. With the ZONEFS_MNTOPT_EXPLICIT_OPEN mount option, the open
 * fails with -EBUSY if that count exceeds a non-zero s_max_wro_seq_files,
 * and the zone is explicitly opened if the file is not full.
 *
 * Returns 0 on success, or a negative error code, in which case the
 * write-open count increment is undone and the inode write reference count
 * is left unchanged.
 */
static int zonefs_seq_file_write_open(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	if (!zi->i_wr_refcnt) {
		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
		unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);

		if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {

			if (sbi->s_max_wro_seq_files
			    && wro > sbi->s_max_wro_seq_files) {
				atomic_dec(&sbi->s_wro_seq_files);
				ret = -EBUSY;
				goto unlock;
			}

			/* A full file cannot be written: no zone open needed */
			if (i_size_read(inode) < z->z_capacity) {
				ret = zonefs_inode_zone_mgmt(inode,
							     REQ_OP_ZONE_OPEN);
				if (ret) {
					atomic_dec(&sbi->s_wro_seq_files);
					goto unlock;
				}
				z->z_flags |= ZONEFS_ZONE_OPEN;
				zonefs_inode_account_active(inode);
			}
		}
	}

	zi->i_wr_refcnt++;

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}

/*
 * open operation. Allow O_DIRECT on the file, run generic_file_open() and,
 * for a sequential zone file opened for writing, do the write open
 * accounting. Returns 0 on success or a negative error code.
 */
static int zonefs_file_open(struct inode *inode, struct file *file)
{
	int ret;

	file->f_mode |= FMODE_CAN_ODIRECT;
	ret = generic_file_open(inode, file);
	if (ret)
		return ret;

	if (zonefs_seq_file_need_wro(inode, file))
		return zonefs_seq_file_write_open(inode);

	return 0;
}

/*
 * Undo the write open accounting of a sequential zone file. When the last
 * write reference of the inode is dropped, close the zone if it is
 * explicitly open and decrement the count of write-open sequential files.
 * If closing the zone fails, the count is not decremented and the file
 * system may be switched to read-only.
 */
static void zonefs_seq_file_write_close(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct zonefs_zone *z = zonefs_inode_zone(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	zi->i_wr_refcnt--;
	if (zi->i_wr_refcnt)
		goto unlock;

	/*
	 * The file zone may not be open anymore (e.g. the file was truncated to
	 * its maximum size or it was fully written). For this case, we only
	 * need to decrement the write open count.
	 */
	if (z->z_flags & ZONEFS_ZONE_OPEN) {
		ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret) {
			__zonefs_io_error(inode, false);
			/*
			 * Leaving zones explicitly open may lead to a state
			 * where most zones cannot be written (zone resources
			 * exhausted). So take preventive action by remounting
			 * read-only.
			 */
			if (z->z_flags & ZONEFS_ZONE_OPEN &&
			    !(sb->s_flags & SB_RDONLY)) {
				zonefs_warn(sb,
					"closing zone at %llu failed %d\n",
					z->z_sector, ret);
				zonefs_warn(sb,
					"remounting filesystem read-only\n");
				sb->s_flags |= SB_RDONLY;
			}
			goto unlock;
		}

		z->z_flags &= ~ZONEFS_ZONE_OPEN;
		zonefs_inode_account_active(inode);
	}

	atomic_dec(&sbi->s_wro_seq_files);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
}

/* release operation: undo write open accounting if needed; never fails. */
static int zonefs_file_release(struct inode *inode, struct file *file)
{
	/*
	 * If we explicitly open a zone we must close it again as well, but the
	 * zone management operation can fail (either due to an IO error or as
	 * the zone has gone offline or read-only). Make sure we don't fail the
	 * close(2) for user-space.
	 */
	if (zonefs_seq_file_need_wro(inode, file))
		zonefs_seq_file_write_close(inode);

	return 0;
}

const struct file_operations zonefs_file_operations = {
	.open		= zonefs_file_open,
	.release	= zonefs_file_release,
	.fsync		= zonefs_file_fsync,
	.mmap_prepare	= zonefs_file_mmap_prepare,
	.llseek		= zonefs_file_llseek,
	.read_iter	= zonefs_file_read_iter,
	.write_iter	= zonefs_file_write_iter,
	.splice_read	= zonefs_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
};