1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Simple file system for zoned block devices exposing zones as files. 4 * 5 * Copyright (C) 2022 Western Digital Corporation or its affiliates. 6 */ 7 #include <linux/module.h> 8 #include <linux/pagemap.h> 9 #include <linux/iomap.h> 10 #include <linux/init.h> 11 #include <linux/slab.h> 12 #include <linux/blkdev.h> 13 #include <linux/statfs.h> 14 #include <linux/writeback.h> 15 #include <linux/quotaops.h> 16 #include <linux/seq_file.h> 17 #include <linux/parser.h> 18 #include <linux/uio.h> 19 #include <linux/mman.h> 20 #include <linux/sched/mm.h> 21 #include <linux/task_io_accounting_ops.h> 22 23 #include "zonefs.h" 24 25 #include "trace.h" 26 27 static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset, 28 loff_t length, unsigned int flags, 29 struct iomap *iomap, struct iomap *srcmap) 30 { 31 struct zonefs_inode_info *zi = ZONEFS_I(inode); 32 struct zonefs_zone *z = zonefs_inode_zone(inode); 33 struct super_block *sb = inode->i_sb; 34 loff_t isize; 35 36 /* 37 * All blocks are always mapped below EOF. If reading past EOF, 38 * act as if there is a hole up to the file maximum size. 39 */ 40 mutex_lock(&zi->i_truncate_mutex); 41 iomap->bdev = inode->i_sb->s_bdev; 42 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 43 isize = i_size_read(inode); 44 if (iomap->offset >= isize) { 45 iomap->type = IOMAP_HOLE; 46 iomap->addr = IOMAP_NULL_ADDR; 47 iomap->length = length; 48 } else { 49 iomap->type = IOMAP_MAPPED; 50 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 51 iomap->length = isize - iomap->offset; 52 } 53 mutex_unlock(&zi->i_truncate_mutex); 54 55 trace_zonefs_iomap_begin(inode, iomap); 56 57 return 0; 58 } 59 60 static const struct iomap_ops zonefs_read_iomap_ops = { 61 .iomap_begin = zonefs_read_iomap_begin, 62 }; 63 64 static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, 65 loff_t length, unsigned int flags, 66 struct iomap *iomap, struct iomap *srcmap) 67 { 68 struct zonefs_inode_info *zi = ZONEFS_I(inode); 69 struct zonefs_zone *z = zonefs_inode_zone(inode); 70 struct super_block *sb = inode->i_sb; 71 loff_t isize; 72 73 /* All write I/Os should always be within the file maximum size */ 74 if (WARN_ON_ONCE(offset + length > z->z_capacity)) 75 return -EIO; 76 77 /* 78 * Sequential zones can only accept direct writes. This is already 79 * checked when writes are issued, so warn if we see a page writeback 80 * operation. 81 */ 82 if (WARN_ON_ONCE(zonefs_zone_is_seq(z) && !(flags & IOMAP_DIRECT))) 83 return -EIO; 84 85 /* 86 * For conventional zones, all blocks are always mapped. For sequential 87 * zones, all blocks after always mapped below the inode size (zone 88 * write pointer) and unwriten beyond. 89 */ 90 mutex_lock(&zi->i_truncate_mutex); 91 iomap->bdev = inode->i_sb->s_bdev; 92 iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize); 93 iomap->addr = (z->z_sector << SECTOR_SHIFT) + iomap->offset; 94 isize = i_size_read(inode); 95 if (iomap->offset >= isize) { 96 iomap->type = IOMAP_UNWRITTEN; 97 iomap->length = z->z_capacity - iomap->offset; 98 } else { 99 iomap->type = IOMAP_MAPPED; 100 iomap->length = isize - iomap->offset; 101 } 102 mutex_unlock(&zi->i_truncate_mutex); 103 104 trace_zonefs_iomap_begin(inode, iomap); 105 106 return 0; 107 } 108 109 static const struct iomap_ops zonefs_write_iomap_ops = { 110 .iomap_begin = zonefs_write_iomap_begin, 111 }; 112 113 static int zonefs_read_folio(struct file *unused, struct folio *folio) 114 { 115 return iomap_read_folio(folio, &zonefs_read_iomap_ops); 116 } 117 118 static void zonefs_readahead(struct readahead_control *rac) 119 { 120 iomap_readahead(rac, &zonefs_read_iomap_ops); 121 } 122 123 /* 124 * Map blocks for page writeback. This is used only on conventional zone files, 125 * which implies that the page range can only be within the fixed inode size. 126 */ 127 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc, 128 struct inode *inode, loff_t offset, 129 unsigned int len) 130 { 131 struct zonefs_zone *z = zonefs_inode_zone(inode); 132 133 if (WARN_ON_ONCE(zonefs_zone_is_seq(z))) 134 return -EIO; 135 if (WARN_ON_ONCE(offset >= i_size_read(inode))) 136 return -EIO; 137 138 /* If the mapping is already OK, nothing needs to be done */ 139 if (offset >= wpc->iomap.offset && 140 offset < wpc->iomap.offset + wpc->iomap.length) 141 return 0; 142 143 return zonefs_write_iomap_begin(inode, offset, 144 z->z_capacity - offset, 145 IOMAP_WRITE, &wpc->iomap, NULL); 146 } 147 148 static const struct iomap_writeback_ops zonefs_writeback_ops = { 149 .map_blocks = zonefs_write_map_blocks, 150 }; 151 152 static int zonefs_writepages(struct address_space *mapping, 153 struct writeback_control *wbc) 154 { 155 struct iomap_writepage_ctx wpc = { }; 156 157 return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops); 158 } 159 160 static int zonefs_swap_activate(struct swap_info_struct *sis, 161 struct file *swap_file, sector_t *span) 162 { 163 struct inode *inode = file_inode(swap_file); 164 165 if (zonefs_inode_is_seq(inode)) { 166 zonefs_err(inode->i_sb, 167 "swap file: not a conventional zone file\n"); 168 return -EINVAL; 169 } 170 171 return iomap_swapfile_activate(sis, swap_file, span, 172 &zonefs_read_iomap_ops); 173 } 174 175 const struct address_space_operations zonefs_file_aops = { 176 .read_folio = zonefs_read_folio, 177 .readahead = zonefs_readahead, 178 .writepages = zonefs_writepages, 179 .dirty_folio = iomap_dirty_folio, 180 .release_folio = iomap_release_folio, 181 .invalidate_folio = iomap_invalidate_folio, 182 .migrate_folio = filemap_migrate_folio, 183 .is_partially_uptodate = iomap_is_partially_uptodate, 184 .error_remove_folio = generic_error_remove_folio, 185 .swap_activate = zonefs_swap_activate, 186 }; 187 188 int zonefs_file_truncate(struct inode *inode, loff_t isize) 189 { 190 struct zonefs_inode_info *zi = ZONEFS_I(inode); 191 struct zonefs_zone *z = zonefs_inode_zone(inode); 192 loff_t old_isize; 193 enum req_op op; 194 int ret = 0; 195 196 /* 197 * Only sequential zone files can be truncated and truncation is allowed 198 * only down to a 0 size, which is equivalent to a zone reset, and to 199 * the maximum file size, which is equivalent to a zone finish. 200 */ 201 if (!zonefs_zone_is_seq(z)) 202 return -EPERM; 203 204 if (!isize) 205 op = REQ_OP_ZONE_RESET; 206 else if (isize == z->z_capacity) 207 op = REQ_OP_ZONE_FINISH; 208 else 209 return -EPERM; 210 211 inode_dio_wait(inode); 212 213 /* Serialize against page faults */ 214 filemap_invalidate_lock(inode->i_mapping); 215 216 /* Serialize against zonefs_iomap_begin() */ 217 mutex_lock(&zi->i_truncate_mutex); 218 219 old_isize = i_size_read(inode); 220 if (isize == old_isize) 221 goto unlock; 222 223 ret = zonefs_inode_zone_mgmt(inode, op); 224 if (ret) 225 goto unlock; 226 227 /* 228 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, 229 * take care of open zones. 230 */ 231 if (z->z_flags & ZONEFS_ZONE_OPEN) { 232 /* 233 * Truncating a zone to EMPTY or FULL is the equivalent of 234 * closing the zone. For a truncation to 0, we need to 235 * re-open the zone to ensure new writes can be processed. 236 * For a truncation to the maximum file size, the zone is 237 * closed and writes cannot be accepted anymore, so clear 238 * the open flag. 239 */ 240 if (!isize) 241 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_OPEN); 242 else 243 z->z_flags &= ~ZONEFS_ZONE_OPEN; 244 } 245 246 zonefs_update_stats(inode, isize); 247 truncate_setsize(inode, isize); 248 z->z_wpoffset = isize; 249 zonefs_inode_account_active(inode); 250 251 unlock: 252 mutex_unlock(&zi->i_truncate_mutex); 253 filemap_invalidate_unlock(inode->i_mapping); 254 255 return ret; 256 } 257 258 static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, 259 int datasync) 260 { 261 struct inode *inode = file_inode(file); 262 int ret = 0; 263 264 if (unlikely(IS_IMMUTABLE(inode))) 265 return -EPERM; 266 267 /* 268 * Since only direct writes are allowed in sequential files, page cache 269 * flush is needed only for conventional zone files. 270 */ 271 if (zonefs_inode_is_cnv(inode)) 272 ret = file_write_and_wait_range(file, start, end); 273 if (!ret) 274 ret = blkdev_issue_flush(inode->i_sb->s_bdev); 275 276 if (ret) 277 zonefs_io_error(inode, true); 278 279 return ret; 280 } 281 282 static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) 283 { 284 struct inode *inode = file_inode(vmf->vma->vm_file); 285 vm_fault_t ret; 286 287 if (unlikely(IS_IMMUTABLE(inode))) 288 return VM_FAULT_SIGBUS; 289 290 /* 291 * Sanity check: only conventional zone files can have shared 292 * writeable mappings. 293 */ 294 if (zonefs_inode_is_seq(inode)) 295 return VM_FAULT_NOPAGE; 296 297 sb_start_pagefault(inode->i_sb); 298 file_update_time(vmf->vma->vm_file); 299 300 /* Serialize against truncates */ 301 filemap_invalidate_lock_shared(inode->i_mapping); 302 ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); 303 filemap_invalidate_unlock_shared(inode->i_mapping); 304 305 sb_end_pagefault(inode->i_sb); 306 return ret; 307 } 308 309 static const struct vm_operations_struct zonefs_file_vm_ops = { 310 .fault = filemap_fault, 311 .map_pages = filemap_map_pages, 312 .page_mkwrite = zonefs_filemap_page_mkwrite, 313 }; 314 315 static int zonefs_file_mmap_prepare(struct vm_area_desc *desc) 316 { 317 struct file *file = desc->file; 318 319 /* 320 * Conventional zones accept random writes, so their files can support 321 * shared writable mappings. For sequential zone files, only read 322 * mappings are possible since there are no guarantees for write 323 * ordering between msync() and page cache writeback. 324 */ 325 if (zonefs_inode_is_seq(file_inode(file)) && 326 (desc->vm_flags & VM_SHARED) && (desc->vm_flags & VM_MAYWRITE)) 327 return -EINVAL; 328 329 file_accessed(file); 330 desc->vm_ops = &zonefs_file_vm_ops; 331 332 return 0; 333 } 334 335 static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence) 336 { 337 loff_t isize = i_size_read(file_inode(file)); 338 339 /* 340 * Seeks are limited to below the zone size for conventional zones 341 * and below the zone write pointer for sequential zones. In both 342 * cases, this limit is the inode size. 343 */ 344 return generic_file_llseek_size(file, offset, whence, isize, isize); 345 } 346 347 static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, 348 int error, unsigned int flags) 349 { 350 struct inode *inode = file_inode(iocb->ki_filp); 351 struct zonefs_inode_info *zi = ZONEFS_I(inode); 352 353 if (error) { 354 /* 355 * For Sync IOs, error recovery is called from 356 * zonefs_file_dio_write(). 357 */ 358 if (!is_sync_kiocb(iocb)) 359 zonefs_io_error(inode, true); 360 return error; 361 } 362 363 if (size && zonefs_inode_is_seq(inode)) { 364 /* 365 * Note that we may be seeing completions out of order, 366 * but that is not a problem since a write completed 367 * successfully necessarily means that all preceding writes 368 * were also successful. So we can safely increase the inode 369 * size to the write end location. 370 */ 371 mutex_lock(&zi->i_truncate_mutex); 372 if (i_size_read(inode) < iocb->ki_pos + size) { 373 zonefs_update_stats(inode, iocb->ki_pos + size); 374 zonefs_i_size_write(inode, iocb->ki_pos + size); 375 } 376 mutex_unlock(&zi->i_truncate_mutex); 377 } 378 379 return 0; 380 } 381 382 static const struct iomap_dio_ops zonefs_write_dio_ops = { 383 .end_io = zonefs_file_write_dio_end_io, 384 }; 385 386 /* 387 * Do not exceed the LFS limits nor the file zone size. If pos is under the 388 * limit it becomes a short access. If it exceeds the limit, return -EFBIG. 389 */ 390 static loff_t zonefs_write_check_limits(struct file *file, loff_t pos, 391 loff_t count) 392 { 393 struct inode *inode = file_inode(file); 394 struct zonefs_zone *z = zonefs_inode_zone(inode); 395 loff_t limit = rlimit(RLIMIT_FSIZE); 396 loff_t max_size = z->z_capacity; 397 398 if (limit != RLIM_INFINITY) { 399 if (pos >= limit) { 400 send_sig(SIGXFSZ, current, 0); 401 return -EFBIG; 402 } 403 count = min(count, limit - pos); 404 } 405 406 if (!(file->f_flags & O_LARGEFILE)) 407 max_size = min_t(loff_t, MAX_NON_LFS, max_size); 408 409 if (unlikely(pos >= max_size)) 410 return -EFBIG; 411 412 return min(count, max_size - pos); 413 } 414 415 static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from) 416 { 417 struct file *file = iocb->ki_filp; 418 struct inode *inode = file_inode(file); 419 struct zonefs_inode_info *zi = ZONEFS_I(inode); 420 struct zonefs_zone *z = zonefs_inode_zone(inode); 421 loff_t count; 422 423 if (IS_SWAPFILE(inode)) 424 return -ETXTBSY; 425 426 if (!iov_iter_count(from)) 427 return 0; 428 429 if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) 430 return -EINVAL; 431 432 if (iocb->ki_flags & IOCB_APPEND) { 433 if (zonefs_zone_is_cnv(z)) 434 return -EINVAL; 435 mutex_lock(&zi->i_truncate_mutex); 436 iocb->ki_pos = z->z_wpoffset; 437 mutex_unlock(&zi->i_truncate_mutex); 438 } 439 440 count = zonefs_write_check_limits(file, iocb->ki_pos, 441 iov_iter_count(from)); 442 if (count < 0) 443 return count; 444 445 iov_iter_truncate(from, count); 446 return iov_iter_count(from); 447 } 448 449 /* 450 * Handle direct writes. For sequential zone files, this is the only possible 451 * write path. For these files, check that the user is issuing writes 452 * sequentially from the end of the file. This code assumes that the block layer 453 * delivers write requests to the device in sequential order. This is always the 454 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE 455 * elevator feature is being used (e.g. mq-deadline). The block layer always 456 * automatically select such an elevator for zoned block devices during the 457 * device initialization. 458 */ 459 static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) 460 { 461 struct inode *inode = file_inode(iocb->ki_filp); 462 struct zonefs_inode_info *zi = ZONEFS_I(inode); 463 struct zonefs_zone *z = zonefs_inode_zone(inode); 464 struct super_block *sb = inode->i_sb; 465 ssize_t ret, count; 466 467 /* 468 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT 469 * as this can cause write reordering (e.g. the first aio gets EAGAIN 470 * on the inode lock but the second goes through but is now unaligned). 471 */ 472 if (zonefs_zone_is_seq(z) && !is_sync_kiocb(iocb) && 473 (iocb->ki_flags & IOCB_NOWAIT)) 474 return -EOPNOTSUPP; 475 476 if (iocb->ki_flags & IOCB_NOWAIT) { 477 if (!inode_trylock(inode)) 478 return -EAGAIN; 479 } else { 480 inode_lock(inode); 481 } 482 483 count = zonefs_write_checks(iocb, from); 484 if (count <= 0) { 485 ret = count; 486 goto inode_unlock; 487 } 488 489 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 490 ret = -EINVAL; 491 goto inode_unlock; 492 } 493 494 /* Enforce sequential writes (append only) in sequential zones */ 495 if (zonefs_zone_is_seq(z)) { 496 mutex_lock(&zi->i_truncate_mutex); 497 if (iocb->ki_pos != z->z_wpoffset) { 498 mutex_unlock(&zi->i_truncate_mutex); 499 ret = -EINVAL; 500 goto inode_unlock; 501 } 502 /* 503 * Advance the zone write pointer offset. This assumes that the 504 * IO will succeed, which is OK to do because we do not allow 505 * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO 506 * fails, the error path will correct the write pointer offset. 507 */ 508 z->z_wpoffset += count; 509 zonefs_inode_account_active(inode); 510 mutex_unlock(&zi->i_truncate_mutex); 511 } 512 513 /* 514 * iomap_dio_rw() may return ENOTBLK if there was an issue with 515 * page invalidation. Overwrite that error code with EBUSY so that 516 * the user can make sense of the error. 517 */ 518 ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops, 519 &zonefs_write_dio_ops, 0, NULL, 0); 520 if (ret == -ENOTBLK) 521 ret = -EBUSY; 522 523 /* 524 * For a failed IO or partial completion, trigger error recovery 525 * to update the zone write pointer offset to a correct value. 526 * For asynchronous IOs, zonefs_file_write_dio_end_io() may already 527 * have executed error recovery if the IO already completed when we 528 * reach here. However, we cannot know that and execute error recovery 529 * again (that will not change anything). 530 */ 531 if (zonefs_zone_is_seq(z)) { 532 if (ret > 0 && ret != count) 533 ret = -EIO; 534 if (ret < 0 && ret != -EIOCBQUEUED) 535 zonefs_io_error(inode, true); 536 } 537 538 inode_unlock: 539 inode_unlock(inode); 540 541 return ret; 542 } 543 544 static ssize_t zonefs_file_buffered_write(struct kiocb *iocb, 545 struct iov_iter *from) 546 { 547 struct inode *inode = file_inode(iocb->ki_filp); 548 ssize_t ret; 549 550 /* 551 * Direct IO writes are mandatory for sequential zone files so that the 552 * write IO issuing order is preserved. 553 */ 554 if (zonefs_inode_is_seq(inode)) 555 return -EIO; 556 557 if (iocb->ki_flags & IOCB_NOWAIT) { 558 if (!inode_trylock(inode)) 559 return -EAGAIN; 560 } else { 561 inode_lock(inode); 562 } 563 564 ret = zonefs_write_checks(iocb, from); 565 if (ret <= 0) 566 goto inode_unlock; 567 568 ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops, NULL); 569 if (ret == -EIO) 570 zonefs_io_error(inode, true); 571 572 inode_unlock: 573 inode_unlock(inode); 574 if (ret > 0) 575 ret = generic_write_sync(iocb, ret); 576 577 return ret; 578 } 579 580 static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) 581 { 582 struct inode *inode = file_inode(iocb->ki_filp); 583 struct zonefs_zone *z = zonefs_inode_zone(inode); 584 585 if (unlikely(IS_IMMUTABLE(inode))) 586 return -EPERM; 587 588 if (sb_rdonly(inode->i_sb)) 589 return -EROFS; 590 591 /* Write operations beyond the zone capacity are not allowed */ 592 if (iocb->ki_pos >= z->z_capacity) 593 return -EFBIG; 594 595 if (iocb->ki_flags & IOCB_DIRECT) { 596 ssize_t ret = zonefs_file_dio_write(iocb, from); 597 598 if (ret != -ENOTBLK) 599 return ret; 600 } 601 602 return zonefs_file_buffered_write(iocb, from); 603 } 604 605 static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size, 606 int error, unsigned int flags) 607 { 608 if (error) { 609 zonefs_io_error(file_inode(iocb->ki_filp), false); 610 return error; 611 } 612 613 return 0; 614 } 615 616 static const struct iomap_dio_ops zonefs_read_dio_ops = { 617 .end_io = zonefs_file_read_dio_end_io, 618 }; 619 620 static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) 621 { 622 struct inode *inode = file_inode(iocb->ki_filp); 623 struct zonefs_inode_info *zi = ZONEFS_I(inode); 624 struct zonefs_zone *z = zonefs_inode_zone(inode); 625 struct super_block *sb = inode->i_sb; 626 loff_t isize; 627 ssize_t ret; 628 629 /* Offline zones cannot be read */ 630 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 631 return -EPERM; 632 633 if (iocb->ki_pos >= z->z_capacity) 634 return 0; 635 636 if (iocb->ki_flags & IOCB_NOWAIT) { 637 if (!inode_trylock_shared(inode)) 638 return -EAGAIN; 639 } else { 640 inode_lock_shared(inode); 641 } 642 643 /* Limit read operations to written data */ 644 mutex_lock(&zi->i_truncate_mutex); 645 isize = i_size_read(inode); 646 if (iocb->ki_pos >= isize) { 647 mutex_unlock(&zi->i_truncate_mutex); 648 ret = 0; 649 goto inode_unlock; 650 } 651 iov_iter_truncate(to, isize - iocb->ki_pos); 652 mutex_unlock(&zi->i_truncate_mutex); 653 654 if (iocb->ki_flags & IOCB_DIRECT) { 655 size_t count = iov_iter_count(to); 656 657 if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) { 658 ret = -EINVAL; 659 goto inode_unlock; 660 } 661 file_accessed(iocb->ki_filp); 662 ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops, 663 &zonefs_read_dio_ops, 0, NULL, 0); 664 } else { 665 ret = generic_file_read_iter(iocb, to); 666 if (ret == -EIO) 667 zonefs_io_error(inode, false); 668 } 669 670 inode_unlock: 671 inode_unlock_shared(inode); 672 673 return ret; 674 } 675 676 static ssize_t zonefs_file_splice_read(struct file *in, loff_t *ppos, 677 struct pipe_inode_info *pipe, 678 size_t len, unsigned int flags) 679 { 680 struct inode *inode = file_inode(in); 681 struct zonefs_inode_info *zi = ZONEFS_I(inode); 682 struct zonefs_zone *z = zonefs_inode_zone(inode); 683 loff_t isize; 684 ssize_t ret = 0; 685 686 /* Offline zones cannot be read */ 687 if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777))) 688 return -EPERM; 689 690 if (*ppos >= z->z_capacity) 691 return 0; 692 693 inode_lock_shared(inode); 694 695 /* Limit read operations to written data */ 696 mutex_lock(&zi->i_truncate_mutex); 697 isize = i_size_read(inode); 698 if (*ppos >= isize) 699 len = 0; 700 else 701 len = min_t(loff_t, len, isize - *ppos); 702 mutex_unlock(&zi->i_truncate_mutex); 703 704 if (len > 0) { 705 ret = filemap_splice_read(in, ppos, pipe, len, flags); 706 if (ret == -EIO) 707 zonefs_io_error(inode, false); 708 } 709 710 inode_unlock_shared(inode); 711 return ret; 712 } 713 714 /* 715 * Write open accounting is done only for sequential files. 716 */ 717 static inline bool zonefs_seq_file_need_wro(struct inode *inode, 718 struct file *file) 719 { 720 if (zonefs_inode_is_cnv(inode)) 721 return false; 722 723 if (!(file->f_mode & FMODE_WRITE)) 724 return false; 725 726 return true; 727 } 728 729 static int zonefs_seq_file_write_open(struct inode *inode) 730 { 731 struct zonefs_inode_info *zi = ZONEFS_I(inode); 732 struct zonefs_zone *z = zonefs_inode_zone(inode); 733 int ret = 0; 734 735 mutex_lock(&zi->i_truncate_mutex); 736 737 if (!zi->i_wr_refcnt) { 738 struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); 739 unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files); 740 741 if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { 742 743 if (sbi->s_max_wro_seq_files 744 && wro > sbi->s_max_wro_seq_files) { 745 atomic_dec(&sbi->s_wro_seq_files); 746 ret = -EBUSY; 747 goto unlock; 748 } 749 750 if (i_size_read(inode) < z->z_capacity) { 751 ret = zonefs_inode_zone_mgmt(inode, 752 REQ_OP_ZONE_OPEN); 753 if (ret) { 754 atomic_dec(&sbi->s_wro_seq_files); 755 goto unlock; 756 } 757 z->z_flags |= ZONEFS_ZONE_OPEN; 758 zonefs_inode_account_active(inode); 759 } 760 } 761 } 762 763 zi->i_wr_refcnt++; 764 765 unlock: 766 mutex_unlock(&zi->i_truncate_mutex); 767 768 return ret; 769 } 770 771 static int zonefs_file_open(struct inode *inode, struct file *file) 772 { 773 int ret; 774 775 file->f_mode |= FMODE_CAN_ODIRECT; 776 ret = generic_file_open(inode, file); 777 if (ret) 778 return ret; 779 780 if (zonefs_seq_file_need_wro(inode, file)) 781 return zonefs_seq_file_write_open(inode); 782 783 return 0; 784 } 785 786 static void zonefs_seq_file_write_close(struct inode *inode) 787 { 788 struct zonefs_inode_info *zi = ZONEFS_I(inode); 789 struct zonefs_zone *z = zonefs_inode_zone(inode); 790 struct super_block *sb = inode->i_sb; 791 struct zonefs_sb_info *sbi = ZONEFS_SB(sb); 792 int ret = 0; 793 794 mutex_lock(&zi->i_truncate_mutex); 795 796 zi->i_wr_refcnt--; 797 if (zi->i_wr_refcnt) 798 goto unlock; 799 800 /* 801 * The file zone may not be open anymore (e.g. the file was truncated to 802 * its maximum size or it was fully written). For this case, we only 803 * need to decrement the write open count. 804 */ 805 if (z->z_flags & ZONEFS_ZONE_OPEN) { 806 ret = zonefs_inode_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); 807 if (ret) { 808 __zonefs_io_error(inode, false); 809 /* 810 * Leaving zones explicitly open may lead to a state 811 * where most zones cannot be written (zone resources 812 * exhausted). So take preventive action by remounting 813 * read-only. 814 */ 815 if (z->z_flags & ZONEFS_ZONE_OPEN && 816 !(sb->s_flags & SB_RDONLY)) { 817 zonefs_warn(sb, 818 "closing zone at %llu failed %d\n", 819 z->z_sector, ret); 820 zonefs_warn(sb, 821 "remounting filesystem read-only\n"); 822 sb->s_flags |= SB_RDONLY; 823 } 824 goto unlock; 825 } 826 827 z->z_flags &= ~ZONEFS_ZONE_OPEN; 828 zonefs_inode_account_active(inode); 829 } 830 831 atomic_dec(&sbi->s_wro_seq_files); 832 833 unlock: 834 mutex_unlock(&zi->i_truncate_mutex); 835 } 836 837 static int zonefs_file_release(struct inode *inode, struct file *file) 838 { 839 /* 840 * If we explicitly open a zone we must close it again as well, but the 841 * zone management operation can fail (either due to an IO error or as 842 * the zone has gone offline or read-only). Make sure we don't fail the 843 * close(2) for user-space. 844 */ 845 if (zonefs_seq_file_need_wro(inode, file)) 846 zonefs_seq_file_write_close(inode); 847 848 return 0; 849 } 850 851 const struct file_operations zonefs_file_operations = { 852 .open = zonefs_file_open, 853 .release = zonefs_file_release, 854 .fsync = zonefs_file_fsync, 855 .mmap_prepare = zonefs_file_mmap_prepare, 856 .llseek = zonefs_file_llseek, 857 .read_iter = zonefs_file_read_iter, 858 .write_iter = zonefs_file_write_iter, 859 .splice_read = zonefs_file_splice_read, 860 .splice_write = iter_file_splice_write, 861 .iopoll = iocb_bio_iopoll, 862 }; 863