// SPDX-License-Identifier: GPL-2.0
/*
 * Simple file system for zoned block devices exposing zones as files.
 *
 * Copyright (C) 2019 Western Digital Corporation or its affiliates.
 */
#include <linux/module.h>
#include <linux/pagemap.h>
#include <linux/magic.h>
#include <linux/iomap.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/statfs.h>
#include <linux/writeback.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/parser.h>
#include <linux/uio.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>
#include <linux/crc32.h>
#include <linux/task_io_accounting_ops.h>

#include "zonefs.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

/*
 * Manage the active zone count. Called with zi->i_truncate_mutex held.
 */
static void zonefs_account_active(struct inode *inode)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	lockdep_assert_held(&zi->i_truncate_mutex);

	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
		return;

	/*
	 * If the zone is active, that is, if it is explicitly open or
	 * partially written, check if it was already accounted as active.
	 */
	if ((zi->i_flags & ZONEFS_ZONE_OPEN) ||
	    (zi->i_wpoffset > 0 && zi->i_wpoffset < zi->i_max_size)) {
		if (!(zi->i_flags & ZONEFS_ZONE_ACTIVE)) {
			zi->i_flags |= ZONEFS_ZONE_ACTIVE;
			atomic_inc(&sbi->s_active_seq_files);
		}
		return;
	}

	/* The zone is not active. If it was, update the active count */
	if (zi->i_flags & ZONEFS_ZONE_ACTIVE) {
		zi->i_flags &= ~ZONEFS_ZONE_ACTIVE;
		atomic_dec(&sbi->s_active_seq_files);
	}
}

static inline int zonefs_zone_mgmt(struct inode *inode,
				   enum req_opf op)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret;

	lockdep_assert_held(&zi->i_truncate_mutex);

	/*
	 * With ZNS drives, closing an explicitly open zone that has not been
	 * written will change the zone state to "closed", that is, the zone
	 * will remain active. Since this can then cause failure of explicit
	 * open operation on other zones if the drive active zone resources
	 * are exceeded, make sure that the zone does not remain active by
	 * resetting it.
	 */
	if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
		op = REQ_OP_ZONE_RESET;

	trace_zonefs_zone_mgmt(inode, op);
	ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
			       zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
	if (ret) {
		zonefs_err(inode->i_sb,
			   "Zone management operation %s at %llu failed %d\n",
			   blk_op_str(op), zi->i_zsector, ret);
		return ret;
	}

	return 0;
}
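/*
 * Background (NVMe ZNS semantics): implicitly open, explicitly open and
 * closed zones all consume an active zone resource, while only open zones
 * consume an open zone resource. The s_active_seq_files and
 * s_wro_seq_files counters track these two resources at the file level.
 */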
static inline void zonefs_i_size_write(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	i_size_write(inode, isize);

	/*
	 * A full zone is no longer open/active and does not need
	 * explicit closing.
	 */
	if (isize >= zi->i_max_size) {
		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);

		if (zi->i_flags & ZONEFS_ZONE_ACTIVE)
			atomic_dec(&sbi->s_active_seq_files);
		zi->i_flags &= ~(ZONEFS_ZONE_OPEN | ZONEFS_ZONE_ACTIVE);
	}
}

static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
			      unsigned int flags, struct iomap *iomap,
			      struct iomap *srcmap)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;

	/* All I/Os should always be within the file maximum size */
	if (WARN_ON_ONCE(offset + length > zi->i_max_size))
		return -EIO;

	/*
	 * Sequential zones can only accept direct writes. This is already
	 * checked when writes are issued, so warn if we see a page writeback
	 * operation.
	 */
	if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
			 (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
		return -EIO;

	/*
	 * For conventional zones, all blocks are always mapped. For sequential
	 * zones, blocks are always mapped below the inode size (zone write
	 * pointer) and unwritten beyond it.
	 */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (offset >= isize)
		iomap->type = IOMAP_UNWRITTEN;
	else
		iomap->type = IOMAP_MAPPED;
	if (flags & IOMAP_WRITE)
		length = zi->i_max_size - offset;
	else
		length = min(length, isize - offset);
	mutex_unlock(&zi->i_truncate_mutex);

	iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
	iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset;
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;

	trace_zonefs_iomap_begin(inode, iomap);

	return 0;
}

static const struct iomap_ops zonefs_iomap_ops = {
	.iomap_begin	= zonefs_iomap_begin,
};

static int zonefs_read_folio(struct file *unused, struct folio *folio)
{
	return iomap_read_folio(folio, &zonefs_iomap_ops);
}

static void zonefs_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &zonefs_iomap_ops);
}
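/*
 * Note: a zonefs file maps linearly onto its backing zone, so
 * zonefs_iomap_begin() computes mappings directly from the zone start
 * sector and the file offset; no block allocation or mapping metadata
 * is involved.
 */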
/*
 * Map blocks for page writeback. This is used only on conventional zone files,
 * which implies that the page range can only be within the fixed inode size.
 */
static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
			     struct inode *inode, loff_t offset)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
		return -EIO;
	if (WARN_ON_ONCE(offset >= i_size_read(inode)))
		return -EIO;

	/* If the mapping is already OK, nothing needs to be done */
	if (offset >= wpc->iomap.offset &&
	    offset < wpc->iomap.offset + wpc->iomap.length)
		return 0;

	return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset,
				  IOMAP_WRITE, &wpc->iomap, NULL);
}

static const struct iomap_writeback_ops zonefs_writeback_ops = {
	.map_blocks		= zonefs_map_blocks,
};

static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepage(page, wbc, &wpc, &zonefs_writeback_ops);
}

static int zonefs_writepages(struct address_space *mapping,
			     struct writeback_control *wbc)
{
	struct iomap_writepage_ctx wpc = { };

	return iomap_writepages(mapping, wbc, &wpc, &zonefs_writeback_ops);
}

static int zonefs_swap_activate(struct swap_info_struct *sis,
				struct file *swap_file, sector_t *span)
{
	struct inode *inode = file_inode(swap_file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (zi->i_ztype != ZONEFS_ZTYPE_CNV) {
		zonefs_err(inode->i_sb,
			   "swap file: not a conventional zone file\n");
		return -EINVAL;
	}

	return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
}

static const struct address_space_operations zonefs_file_aops = {
	.read_folio		= zonefs_read_folio,
	.readahead		= zonefs_readahead,
	.writepage		= zonefs_writepage,
	.writepages		= zonefs_writepages,
	.dirty_folio		= filemap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.migratepage		= iomap_migrate_page,
	.is_partially_uptodate	= iomap_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.direct_IO		= noop_direct_IO,
	.swap_activate		= zonefs_swap_activate,
};

static void zonefs_update_stats(struct inode *inode, loff_t new_isize)
{
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	loff_t old_isize = i_size_read(inode);
	loff_t nr_blocks;

	if (new_isize == old_isize)
		return;

	spin_lock(&sbi->s_lock);

	/*
	 * This may be called for an update after an IO error.
	 * So beware of the values seen.
	 */
	if (new_isize < old_isize) {
		nr_blocks = (old_isize - new_isize) >> sb->s_blocksize_bits;
		if (sbi->s_used_blocks > nr_blocks)
			sbi->s_used_blocks -= nr_blocks;
		else
			sbi->s_used_blocks = 0;
	} else {
		sbi->s_used_blocks +=
			(new_isize - old_isize) >> sb->s_blocksize_bits;
		if (sbi->s_used_blocks > sbi->s_blocks)
			sbi->s_used_blocks = sbi->s_blocks;
	}

	spin_unlock(&sbi->s_lock);
}
/*
 * Check a zone condition and adjust its file inode access permissions for
 * offline and readonly zones. Return the inode size corresponding to the
 * amount of readable data in the zone.
 */
static loff_t zonefs_check_zone_condition(struct inode *inode,
					  struct blk_zone *zone, bool warn,
					  bool mount)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	switch (zone->cond) {
	case BLK_ZONE_COND_OFFLINE:
		/*
		 * Dead zone: make the inode immutable, disable all accesses
		 * and set the file size to 0 (zone wp set to zone start).
		 */
		if (warn)
			zonefs_warn(inode->i_sb, "inode %lu: offline zone\n",
				    inode->i_ino);
		inode->i_flags |= S_IMMUTABLE;
		inode->i_mode &= ~0777;
		zone->wp = zone->start;
		return 0;
	case BLK_ZONE_COND_READONLY:
		/*
		 * The write pointer of read-only zones is invalid. If such a
		 * zone is found during mount, the file size cannot be retrieved
		 * so we treat the zone as offline (mount == true case).
		 * Otherwise, keep the file size as it was when last updated
		 * so that the user can recover data. In both cases, writes are
		 * always disabled for the zone.
		 */
		if (warn)
			zonefs_warn(inode->i_sb, "inode %lu: read-only zone\n",
				    inode->i_ino);
		inode->i_flags |= S_IMMUTABLE;
		if (mount) {
			zone->cond = BLK_ZONE_COND_OFFLINE;
			inode->i_mode &= ~0777;
			zone->wp = zone->start;
			return 0;
		}
		inode->i_mode &= ~0222;
		return i_size_read(inode);
	case BLK_ZONE_COND_FULL:
		/* The write pointer of full zones is invalid. */
		return zi->i_max_size;
	default:
		if (zi->i_ztype == ZONEFS_ZTYPE_CNV)
			return zi->i_max_size;
		return (zone->wp - zone->start) << SECTOR_SHIFT;
	}
}

struct zonefs_ioerr_data {
	struct inode	*inode;
	bool		write;
};
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
			      void *data)
{
	struct zonefs_ioerr_data *err = data;
	struct inode *inode = err->inode;
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	loff_t isize, data_size;

	/*
	 * Check the zone condition: if the zone is not "bad" (offline or
	 * read-only), read errors are simply signaled to the IO issuer as long
	 * as there is no inconsistency between the inode size and the amount of
	 * data written in the zone (data_size).
	 */
	data_size = zonefs_check_zone_condition(inode, zone, true, false);
	isize = i_size_read(inode);
	if (zone->cond != BLK_ZONE_COND_OFFLINE &&
	    zone->cond != BLK_ZONE_COND_READONLY &&
	    !err->write && isize == data_size)
		return 0;

	/*
	 * At this point, we detected either a bad zone or an inconsistency
	 * between the inode size and the amount of data written in the zone.
	 * For the latter case, the cause may be a write IO error or an external
	 * action on the device. Two error patterns exist:
	 * 1) The inode size is lower than the amount of data in the zone:
	 *    a write operation partially failed and data was written at the end
	 *    of the file. This can happen in the case of a large direct IO
	 *    needing several BIOs and/or write requests to be processed.
	 * 2) The inode size is larger than the amount of data in the zone:
	 *    this can happen with a deferred write error with the use of the
	 *    device side write cache after getting successful write IO
	 *    completions. Other possibilities are (a) an external corruption,
	 *    e.g. an application reset the zone directly, or (b) the device
	 *    has a serious problem (e.g. firmware bug).
	 *
	 * In all cases, warn about inode size inconsistency and handle the
	 * IO error according to the zone condition and to the mount options.
	 */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && isize != data_size)
		zonefs_warn(sb, "inode %lu: invalid size %lld (should be %lld)\n",
			    inode->i_ino, isize, data_size);

	/*
	 * First handle bad zones signaled by hardware. The mount options
	 * errors=zone-ro and errors=zone-offline result in changing the
	 * zone condition to read-only and offline respectively, as if the
	 * condition was signaled by the hardware.
	 */
	if (zone->cond == BLK_ZONE_COND_OFFLINE ||
	    sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL) {
		zonefs_warn(sb, "inode %lu: read/write access disabled\n",
			    inode->i_ino);
		if (zone->cond != BLK_ZONE_COND_OFFLINE) {
			zone->cond = BLK_ZONE_COND_OFFLINE;
			data_size = zonefs_check_zone_condition(inode, zone,
								false, false);
		}
	} else if (zone->cond == BLK_ZONE_COND_READONLY ||
		   sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO) {
		zonefs_warn(sb, "inode %lu: write access disabled\n",
			    inode->i_ino);
		if (zone->cond != BLK_ZONE_COND_READONLY) {
			zone->cond = BLK_ZONE_COND_READONLY;
			data_size = zonefs_check_zone_condition(inode, zone,
								false, false);
		}
	}

	/*
	 * If the filesystem is mounted with the explicit-open mount option, we
	 * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to
	 * the read-only or offline condition, to avoid attempting an explicit
	 * close of the zone when the inode file is closed.
	 */
	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) &&
	    (zone->cond == BLK_ZONE_COND_OFFLINE ||
	     zone->cond == BLK_ZONE_COND_READONLY))
		zi->i_flags &= ~ZONEFS_ZONE_OPEN;

	/*
	 * If errors=remount-ro was specified, any error results in remounting
	 * the volume as read-only.
	 */
	if ((sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO) && !sb_rdonly(sb)) {
		zonefs_warn(sb, "remounting filesystem read-only\n");
		sb->s_flags |= SB_RDONLY;
	}

	/*
	 * Update block usage stats and the inode size to prevent access to
	 * invalid data.
	 */
	zonefs_update_stats(inode, data_size);
	zonefs_i_size_write(inode, data_size);
	zi->i_wpoffset = data_size;
	zonefs_account_active(inode);

	return 0;
}
/*
 * When a file IO error occurs, check the file zone to see if there is a change
 * in the zone condition (e.g. offline or read-only). For a failed write to a
 * sequential zone, the zone write pointer position must also be checked to
 * correct the file size and zonefs inode write pointer offset if needed
 * (these can be out of sync with the drive due to partial write failures).
 */
static void __zonefs_io_error(struct inode *inode, bool write)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	unsigned int noio_flag;
	unsigned int nr_zones =
		zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
	struct zonefs_ioerr_data err = {
		.inode = inode,
		.write = write,
	};
	int ret;

	/*
	 * Memory allocations in blkdev_report_zones() can trigger a memory
	 * reclaim which may in turn cause a recursion into zonefs as well as
	 * struct request allocations for the same device. The former case may
	 * end up in a deadlock on the inode truncate mutex, while the latter
	 * may prevent IO forward progress. Executing the report zones under
	 * the GFP_NOIO context avoids both problems.
	 */
	noio_flag = memalloc_noio_save();
	ret = blkdev_report_zones(sb->s_bdev, zi->i_zsector, nr_zones,
				  zonefs_io_error_cb, &err);
	if (ret != nr_zones)
		zonefs_err(sb, "Get inode %lu zone information failed %d\n",
			   inode->i_ino, ret);
	memalloc_noio_restore(noio_flag);
}

static void zonefs_io_error(struct inode *inode, bool write)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	mutex_lock(&zi->i_truncate_mutex);
	__zonefs_io_error(inode, write);
	mutex_unlock(&zi->i_truncate_mutex);
}

static int zonefs_file_truncate(struct inode *inode, loff_t isize)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	loff_t old_isize;
	enum req_opf op;
	int ret = 0;

	/*
	 * Only sequential zone files can be truncated and truncation is allowed
	 * only down to a 0 size, which is equivalent to a zone reset, and to
	 * the maximum file size, which is equivalent to a zone finish.
	 */
	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
		return -EPERM;

	if (!isize)
		op = REQ_OP_ZONE_RESET;
	else if (isize == zi->i_max_size)
		op = REQ_OP_ZONE_FINISH;
	else
		return -EPERM;

	inode_dio_wait(inode);

	/* Serialize against page faults */
	filemap_invalidate_lock(inode->i_mapping);

	/* Serialize against zonefs_iomap_begin() */
	mutex_lock(&zi->i_truncate_mutex);

	old_isize = i_size_read(inode);
	if (isize == old_isize)
		goto unlock;

	ret = zonefs_zone_mgmt(inode, op);
	if (ret)
		goto unlock;

	/*
	 * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set,
	 * take care of open zones.
	 */
	if (zi->i_flags & ZONEFS_ZONE_OPEN) {
		/*
		 * Truncating a zone to EMPTY or FULL is the equivalent of
		 * closing the zone. For a truncation to 0, we need to
		 * re-open the zone to ensure new writes can be processed.
		 * For a truncation to the maximum file size, the zone is
		 * closed and writes cannot be accepted anymore, so clear
		 * the open flag.
		 */
		if (!isize)
			ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
		else
			zi->i_flags &= ~ZONEFS_ZONE_OPEN;
	}

	zonefs_update_stats(inode, isize);
	truncate_setsize(inode, isize);
	zi->i_wpoffset = isize;
	zonefs_account_active(inode);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
	filemap_invalidate_unlock(inode->i_mapping);

	return ret;
}
static int zonefs_inode_setattr(struct user_namespace *mnt_userns,
				struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = d_inode(dentry);
	int ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	ret = setattr_prepare(&init_user_ns, dentry, iattr);
	if (ret)
		return ret;

	/*
	 * Since files and directories cannot be created nor deleted, do not
	 * allow setting any write attributes on the sub-directories grouping
	 * files by zone type.
	 */
	if ((iattr->ia_valid & ATTR_MODE) && S_ISDIR(inode->i_mode) &&
	    (iattr->ia_mode & 0222))
		return -EPERM;

	if (((iattr->ia_valid & ATTR_UID) &&
	     !uid_eq(iattr->ia_uid, inode->i_uid)) ||
	    ((iattr->ia_valid & ATTR_GID) &&
	     !gid_eq(iattr->ia_gid, inode->i_gid))) {
		ret = dquot_transfer(inode, iattr);
		if (ret)
			return ret;
	}

	if (iattr->ia_valid & ATTR_SIZE) {
		ret = zonefs_file_truncate(inode, iattr->ia_size);
		if (ret)
			return ret;
	}

	setattr_copy(&init_user_ns, inode, iattr);

	return 0;
}

static const struct inode_operations zonefs_file_inode_operations = {
	.setattr	= zonefs_inode_setattr,
};

static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end,
			     int datasync)
{
	struct inode *inode = file_inode(file);
	int ret = 0;

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	/*
	 * Since only direct writes are allowed in sequential files, page cache
	 * flush is needed only for conventional zone files.
	 */
	if (ZONEFS_I(inode)->i_ztype == ZONEFS_ZTYPE_CNV)
		ret = file_write_and_wait_range(file, start, end);
	if (!ret)
		ret = blkdev_issue_flush(inode->i_sb->s_bdev);

	if (ret)
		zonefs_io_error(inode, true);

	return ret;
}

static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	vm_fault_t ret;

	if (unlikely(IS_IMMUTABLE(inode)))
		return VM_FAULT_SIGBUS;

	/*
	 * Sanity check: only conventional zone files can have shared
	 * writable mappings.
	 */
	if (WARN_ON_ONCE(zi->i_ztype != ZONEFS_ZTYPE_CNV))
		return VM_FAULT_NOPAGE;

	sb_start_pagefault(inode->i_sb);
	file_update_time(vmf->vma->vm_file);

	/* Serialize against truncates */
	filemap_invalidate_lock_shared(inode->i_mapping);
	ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
	filemap_invalidate_unlock_shared(inode->i_mapping);

	sb_end_pagefault(inode->i_sb);
	return ret;
}

static const struct vm_operations_struct zonefs_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= zonefs_filemap_page_mkwrite,
};

static int zonefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * Conventional zones accept random writes, so their files can support
	 * shared writable mappings. For sequential zone files, only read
	 * mappings are possible since there are no guarantees for write
	 * ordering between msync() and page cache writeback.
	 */
	if (ZONEFS_I(file_inode(file))->i_ztype == ZONEFS_ZTYPE_SEQ &&
	    (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		return -EINVAL;

	file_accessed(file);
	vma->vm_ops = &zonefs_file_vm_ops;

	return 0;
}

static loff_t zonefs_file_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t isize = i_size_read(file_inode(file));

	/*
	 * Seeks are limited to below the zone size for conventional zones
	 * and below the zone write pointer for sequential zones. In both
	 * cases, this limit is the inode size.
	 */
	return generic_file_llseek_size(file, offset, whence, isize, isize);
}
static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
					int error, unsigned int flags)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (error) {
		zonefs_io_error(inode, true);
		return error;
	}

	if (size && zi->i_ztype != ZONEFS_ZTYPE_CNV) {
		/*
		 * Note that we may be seeing completions out of order,
		 * but that is not a problem since a write completed
		 * successfully necessarily means that all preceding writes
		 * were also successful. So we can safely increase the inode
		 * size to the write end location.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		if (i_size_read(inode) < iocb->ki_pos + size) {
			zonefs_update_stats(inode, iocb->ki_pos + size);
			zonefs_i_size_write(inode, iocb->ki_pos + size);
		}
		mutex_unlock(&zi->i_truncate_mutex);
	}

	return 0;
}

static const struct iomap_dio_ops zonefs_write_dio_ops = {
	.end_io			= zonefs_file_write_dio_end_io,
};

static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct block_device *bdev = inode->i_sb->s_bdev;
	unsigned int max = bdev_max_zone_append_sectors(bdev);
	struct bio *bio;
	ssize_t size;
	int nr_pages;
	ssize_t ret;

	max = ALIGN_DOWN(max << SECTOR_SHIFT, inode->i_sb->s_blocksize);
	iov_iter_truncate(from, max);

	nr_pages = iov_iter_npages(from, BIO_MAX_VECS);
	if (!nr_pages)
		return 0;

	bio = bio_alloc(bdev, nr_pages,
			REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
	bio->bi_iter.bi_sector = zi->i_zsector;
	bio->bi_ioprio = iocb->ki_ioprio;
	if (iocb->ki_flags & IOCB_DSYNC)
		bio->bi_opf |= REQ_FUA;

	ret = bio_iov_iter_get_pages(bio, from);
	if (unlikely(ret))
		goto out_release;

	size = bio->bi_iter.bi_size;
	task_io_account_write(size);

	if (iocb->ki_flags & IOCB_HIPRI)
		bio_set_polled(bio, iocb);

	ret = submit_bio_wait(bio);

	zonefs_file_write_dio_end_io(iocb, size, ret, 0);
	trace_zonefs_file_dio_append(inode, size, ret);

out_release:
	bio_release_pages(bio, false);
	bio_put(bio);

	if (ret >= 0) {
		iocb->ki_pos += size;
		return size;
	}

	return ret;
}
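/*
 * Note on the zone append path above: with REQ_OP_ZONE_APPEND, bi_sector
 * only identifies the target zone and the device itself writes the data
 * at the current zone write pointer. zonefs_file_dio_write() (below) only
 * uses this path after checking that iocb->ki_pos equals zi->i_wpoffset,
 * so the data is known to land at ki_pos and the position update above
 * is valid.
 */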
/*
 * Do not exceed the LFS limits nor the file zone size. If pos is below the
 * limit, the access count is shortened; if pos exceeds the limit, return
 * -EFBIG.
 */
static loff_t zonefs_write_check_limits(struct file *file, loff_t pos,
					loff_t count)
{
	struct inode *inode = file_inode(file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	loff_t limit = rlimit(RLIMIT_FSIZE);
	loff_t max_size = zi->i_max_size;

	if (limit != RLIM_INFINITY) {
		if (pos >= limit) {
			send_sig(SIGXFSZ, current, 0);
			return -EFBIG;
		}
		count = min(count, limit - pos);
	}

	if (!(file->f_flags & O_LARGEFILE))
		max_size = min_t(loff_t, MAX_NON_LFS, max_size);

	if (unlikely(pos >= max_size))
		return -EFBIG;

	return min(count, max_size - pos);
}

static ssize_t zonefs_write_checks(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file_inode(file);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	loff_t count;

	if (IS_SWAPFILE(inode))
		return -ETXTBSY;

	if (!iov_iter_count(from))
		return 0;

	if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
		return -EINVAL;

	if (iocb->ki_flags & IOCB_APPEND) {
		if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
			return -EINVAL;
		mutex_lock(&zi->i_truncate_mutex);
		iocb->ki_pos = zi->i_wpoffset;
		mutex_unlock(&zi->i_truncate_mutex);
	}

	count = zonefs_write_check_limits(file, iocb->ki_pos,
					  iov_iter_count(from));
	if (count < 0)
		return count;

	iov_iter_truncate(from, count);
	return iov_iter_count(from);
}

/*
 * Handle direct writes. For sequential zone files, this is the only possible
 * write path. For these files, check that the user is issuing writes
 * sequentially from the end of the file. This code assumes that the block layer
 * delivers write requests to the device in sequential order. This is always the
 * case if a block IO scheduler implementing the ELEVATOR_F_ZBD_SEQ_WRITE
 * elevator feature is being used (e.g. mq-deadline). The block layer always
 * automatically selects such an elevator for zoned block devices during the
 * device initialization.
 */
static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	bool sync = is_sync_kiocb(iocb);
	bool append = false;
	ssize_t ret, count;

	/*
	 * For async direct IOs to sequential zone files, refuse IOCB_NOWAIT
	 * as this can cause write reordering (e.g. the first aio gets EAGAIN
	 * on the inode lock but the second goes through and is now unaligned).
	 */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && !sync &&
	    (iocb->ki_flags & IOCB_NOWAIT))
		return -EOPNOTSUPP;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	count = zonefs_write_checks(iocb, from);
	if (count <= 0) {
		ret = count;
		goto inode_unlock;
	}

	if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
		ret = -EINVAL;
		goto inode_unlock;
	}

	/* Enforce sequential writes (append only) in sequential zones */
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ) {
		mutex_lock(&zi->i_truncate_mutex);
		if (iocb->ki_pos != zi->i_wpoffset) {
			mutex_unlock(&zi->i_truncate_mutex);
			ret = -EINVAL;
			goto inode_unlock;
		}
		mutex_unlock(&zi->i_truncate_mutex);
		append = sync;
	}

	if (append)
		ret = zonefs_file_dio_append(iocb, from);
	else
		ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
				   &zonefs_write_dio_ops, 0, NULL, 0);
	if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
	    (ret > 0 || ret == -EIOCBQUEUED)) {
		if (ret > 0)
			count = ret;

		/*
		 * Update the zone write pointer offset assuming the write
		 * operation succeeded. If it did not, the error recovery path
		 * will correct it. Also do active seq file accounting.
		 */
		mutex_lock(&zi->i_truncate_mutex);
		zi->i_wpoffset += count;
		zonefs_account_active(inode);
		mutex_unlock(&zi->i_truncate_mutex);
	}

inode_unlock:
	inode_unlock(inode);

	return ret;
}
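/*
 * For example (following the zonefs documentation), a sequential zone
 * file can only be written using direct IO, block aligned and at the
 * write pointer:
 *
 *   dd if=/dev/zero of=/mnt/seq/0 bs=4096 count=1 conv=notrunc oflag=direct
 */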
static ssize_t zonefs_file_buffered_write(struct kiocb *iocb,
					  struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	ssize_t ret;

	/*
	 * Direct IO writes are mandatory for sequential zone files so that the
	 * write IO issuing order is preserved.
	 */
	if (zi->i_ztype != ZONEFS_ZTYPE_CNV)
		return -EIO;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock(inode))
			return -EAGAIN;
	} else {
		inode_lock(inode);
	}

	ret = zonefs_write_checks(iocb, from);
	if (ret <= 0)
		goto inode_unlock;

	ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
	if (ret > 0)
		iocb->ki_pos += ret;
	else if (ret == -EIO)
		zonefs_io_error(inode, true);

inode_unlock:
	inode_unlock(inode);
	if (ret > 0)
		ret = generic_write_sync(iocb, ret);

	return ret;
}

static ssize_t zonefs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	if (unlikely(IS_IMMUTABLE(inode)))
		return -EPERM;

	if (sb_rdonly(inode->i_sb))
		return -EROFS;

	/* Write operations beyond the zone size are not allowed */
	if (iocb->ki_pos >= ZONEFS_I(inode)->i_max_size)
		return -EFBIG;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ssize_t ret = zonefs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	return zonefs_file_buffered_write(iocb, from);
}

static int zonefs_file_read_dio_end_io(struct kiocb *iocb, ssize_t size,
				       int error, unsigned int flags)
{
	if (error) {
		zonefs_io_error(file_inode(iocb->ki_filp), false);
		return error;
	}

	return 0;
}

static const struct iomap_dio_ops zonefs_read_dio_ops = {
	.end_io			= zonefs_file_read_dio_end_io,
};

static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	loff_t isize;
	ssize_t ret;

	/* Offline zones cannot be read */
	if (unlikely(IS_IMMUTABLE(inode) && !(inode->i_mode & 0777)))
		return -EPERM;

	if (iocb->ki_pos >= zi->i_max_size)
		return 0;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!inode_trylock_shared(inode))
			return -EAGAIN;
	} else {
		inode_lock_shared(inode);
	}

	/* Limit read operations to written data */
	mutex_lock(&zi->i_truncate_mutex);
	isize = i_size_read(inode);
	if (iocb->ki_pos >= isize) {
		mutex_unlock(&zi->i_truncate_mutex);
		ret = 0;
		goto inode_unlock;
	}
	iov_iter_truncate(to, isize - iocb->ki_pos);
	mutex_unlock(&zi->i_truncate_mutex);

	if (iocb->ki_flags & IOCB_DIRECT) {
		size_t count = iov_iter_count(to);

		if ((iocb->ki_pos | count) & (sb->s_blocksize - 1)) {
			ret = -EINVAL;
			goto inode_unlock;
		}
		file_accessed(iocb->ki_filp);
		ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
				   &zonefs_read_dio_ops, 0, NULL, 0);
	} else {
		ret = generic_file_read_iter(iocb, to);
		if (ret == -EIO)
			zonefs_io_error(inode, false);
	}

inode_unlock:
	inode_unlock_shared(inode);

	return ret;
}
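/*
 * Write open accounting: when the explicit-open mount option is used,
 * opening a sequential zone file for writing explicitly opens its zone,
 * and open(2) fails with -EBUSY once bdev_max_open_zones() files are
 * already open for writing, mirroring the device open zone resource
 * limit.
 */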
/*
 * Write open accounting is done only for sequential files.
 */
static inline bool zonefs_seq_file_need_wro(struct inode *inode,
					    struct file *file)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);

	if (zi->i_ztype != ZONEFS_ZTYPE_SEQ)
		return false;

	if (!(file->f_mode & FMODE_WRITE))
		return false;

	return true;
}

static int zonefs_seq_file_write_open(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	if (!zi->i_wr_refcnt) {
		struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb);
		unsigned int wro = atomic_inc_return(&sbi->s_wro_seq_files);

		if (sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {

			if (wro > sbi->s_max_wro_seq_files) {
				atomic_dec(&sbi->s_wro_seq_files);
				ret = -EBUSY;
				goto unlock;
			}

			if (i_size_read(inode) < zi->i_max_size) {
				ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
				if (ret) {
					atomic_dec(&sbi->s_wro_seq_files);
					goto unlock;
				}
				zi->i_flags |= ZONEFS_ZONE_OPEN;
				zonefs_account_active(inode);
			}
		}
	}

	zi->i_wr_refcnt++;

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}

static int zonefs_file_open(struct inode *inode, struct file *file)
{
	int ret;

	ret = generic_file_open(inode, file);
	if (ret)
		return ret;

	if (zonefs_seq_file_need_wro(inode, file))
		return zonefs_seq_file_write_open(inode);

	return 0;
}

static void zonefs_seq_file_write_close(struct inode *inode)
{
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	int ret = 0;

	mutex_lock(&zi->i_truncate_mutex);

	zi->i_wr_refcnt--;
	if (zi->i_wr_refcnt)
		goto unlock;

	/*
	 * The file zone may not be open anymore (e.g. the file was truncated to
	 * its maximum size or it was fully written). For this case, we only
	 * need to decrement the write open count.
	 */
	if (zi->i_flags & ZONEFS_ZONE_OPEN) {
		ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret) {
			__zonefs_io_error(inode, false);
			/*
			 * Leaving zones explicitly open may lead to a state
			 * where most zones cannot be written (zone resources
			 * exhausted). So take preventive action by remounting
			 * read-only.
			 */
			if (zi->i_flags & ZONEFS_ZONE_OPEN &&
			    !(sb->s_flags & SB_RDONLY)) {
				zonefs_warn(sb,
					"closing zone at %llu failed %d\n",
					zi->i_zsector, ret);
				zonefs_warn(sb,
					"remounting filesystem read-only\n");
				sb->s_flags |= SB_RDONLY;
			}
			goto unlock;
		}

		zi->i_flags &= ~ZONEFS_ZONE_OPEN;
		zonefs_account_active(inode);
	}

	atomic_dec(&sbi->s_wro_seq_files);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);
}
static int zonefs_file_release(struct inode *inode, struct file *file)
{
	/*
	 * If we explicitly open a zone we must close it again as well, but the
	 * zone management operation can fail (either due to an IO error or as
	 * the zone has gone offline or read-only). Make sure we don't fail the
	 * close(2) for user-space.
	 */
	if (zonefs_seq_file_need_wro(inode, file))
		zonefs_seq_file_write_close(inode);

	return 0;
}

static const struct file_operations zonefs_file_operations = {
	.open		= zonefs_file_open,
	.release	= zonefs_file_release,
	.fsync		= zonefs_file_fsync,
	.mmap		= zonefs_file_mmap,
	.llseek		= zonefs_file_llseek,
	.read_iter	= zonefs_file_read_iter,
	.write_iter	= zonefs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
};

static struct kmem_cache *zonefs_inode_cachep;

static struct inode *zonefs_alloc_inode(struct super_block *sb)
{
	struct zonefs_inode_info *zi;

	zi = alloc_inode_sb(sb, zonefs_inode_cachep, GFP_KERNEL);
	if (!zi)
		return NULL;

	inode_init_once(&zi->i_vnode);
	mutex_init(&zi->i_truncate_mutex);
	zi->i_wr_refcnt = 0;
	zi->i_flags = 0;

	return &zi->i_vnode;
}

static void zonefs_free_inode(struct inode *inode)
{
	kmem_cache_free(zonefs_inode_cachep, ZONEFS_I(inode));
}

/*
 * File system stat.
 */
static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct super_block *sb = dentry->d_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	enum zonefs_ztype t;

	buf->f_type = ZONEFS_MAGIC;
	buf->f_bsize = sb->s_blocksize;
	buf->f_namelen = ZONEFS_NAME_MAX;

	spin_lock(&sbi->s_lock);

	buf->f_blocks = sbi->s_blocks;
	if (WARN_ON(sbi->s_used_blocks > sbi->s_blocks))
		buf->f_bfree = 0;
	else
		buf->f_bfree = buf->f_blocks - sbi->s_used_blocks;
	buf->f_bavail = buf->f_bfree;

	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
		if (sbi->s_nr_files[t])
			buf->f_files += sbi->s_nr_files[t] + 1;
	}
	buf->f_ffree = 0;

	spin_unlock(&sbi->s_lock);

	buf->f_fsid = uuid_to_fsid(sbi->s_uuid.b);

	return 0;
}

enum {
	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
	Opt_explicit_open, Opt_err,
};

static const match_table_t tokens = {
	{ Opt_errors_ro,	"errors=remount-ro"},
	{ Opt_errors_zro,	"errors=zone-ro"},
	{ Opt_errors_zol,	"errors=zone-offline"},
	{ Opt_errors_repair,	"errors=repair"},
	{ Opt_explicit_open,	"explicit-open" },
	{ Opt_err,		NULL}
};

static int zonefs_parse_options(struct super_block *sb, char *options)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	substring_t args[MAX_OPT_ARGS];
	char *p;

	if (!options)
		return 0;

	while ((p = strsep(&options, ",")) != NULL) {
		int token;

		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_errors_ro:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
			break;
		case Opt_errors_zro:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
			break;
		case Opt_errors_zol:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
			break;
		case Opt_errors_repair:
			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
			break;
		case Opt_explicit_open:
			sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
			break;
		default:
			return -EINVAL;
		}
	}

	return 0;
}
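/*
 * For example, mounting with per-zone read-only error handling and
 * explicit zone open on write (the device name is illustrative):
 *
 *   mount -t zonefs -o errors=zone-ro,explicit-open /dev/nullb0 /mnt
 */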
static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(root->d_sb);

	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_RO)
		seq_puts(seq, ",errors=remount-ro");
	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZRO)
		seq_puts(seq, ",errors=zone-ro");
	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_ZOL)
		seq_puts(seq, ",errors=zone-offline");
	if (sbi->s_mount_opts & ZONEFS_MNTOPT_ERRORS_REPAIR)
		seq_puts(seq, ",errors=repair");

	return 0;
}

static int zonefs_remount(struct super_block *sb, int *flags, char *data)
{
	sync_filesystem(sb);

	return zonefs_parse_options(sb, data);
}

static const struct super_operations zonefs_sops = {
	.alloc_inode	= zonefs_alloc_inode,
	.free_inode	= zonefs_free_inode,
	.statfs		= zonefs_statfs,
	.remount_fs	= zonefs_remount,
	.show_options	= zonefs_show_options,
};

static const struct inode_operations zonefs_dir_inode_operations = {
	.lookup		= simple_lookup,
	.setattr	= zonefs_inode_setattr,
};

static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode,
				  enum zonefs_ztype type)
{
	struct super_block *sb = parent->i_sb;

	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1;
	inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555);
	inode->i_op = &zonefs_dir_inode_operations;
	inode->i_fop = &simple_dir_operations;
	set_nlink(inode, 2);
	inc_nlink(parent);
}
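/*
 * Inode numbering: zone files use their zone index (zone start sector
 * divided by the zone size) as inode number, while the root directory
 * and the zone group directories use numbers at and past the device
 * zone count, so file and directory inode numbers cannot collide.
 */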
static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
				  enum zonefs_ztype type)
{
	struct super_block *sb = inode->i_sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct zonefs_inode_info *zi = ZONEFS_I(inode);
	int ret = 0;

	inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
	inode->i_mode = S_IFREG | sbi->s_perm;

	zi->i_ztype = type;
	zi->i_zsector = zone->start;
	zi->i_zone_size = zone->len << SECTOR_SHIFT;

	zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
			       zone->capacity << SECTOR_SHIFT);
	zi->i_wpoffset = zonefs_check_zone_condition(inode, zone, true, true);

	inode->i_uid = sbi->s_uid;
	inode->i_gid = sbi->s_gid;
	inode->i_size = zi->i_wpoffset;
	inode->i_blocks = zi->i_max_size >> SECTOR_SHIFT;

	inode->i_op = &zonefs_file_inode_operations;
	inode->i_fop = &zonefs_file_operations;
	inode->i_mapping->a_ops = &zonefs_file_aops;

	sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
	sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
	sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;

	mutex_lock(&zi->i_truncate_mutex);

	/*
	 * For sequential zones, make sure that any open zone is closed first
	 * to ensure that the initial number of open zones is 0, in sync with
	 * the open zone accounting done when the mount option
	 * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
	 */
	if (type == ZONEFS_ZTYPE_SEQ &&
	    (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
	     zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
		ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
		if (ret)
			goto unlock;
	}

	zonefs_account_active(inode);

unlock:
	mutex_unlock(&zi->i_truncate_mutex);

	return ret;
}

static struct dentry *zonefs_create_inode(struct dentry *parent,
					const char *name, struct blk_zone *zone,
					enum zonefs_ztype type)
{
	struct inode *dir = d_inode(parent);
	struct dentry *dentry;
	struct inode *inode;
	int ret;

	dentry = d_alloc_name(parent, name);
	if (!dentry)
		return NULL;

	inode = new_inode(parent->d_sb);
	if (!inode)
		goto dput;

	inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
	if (zone) {
		ret = zonefs_init_file_inode(inode, zone, type);
		if (ret) {
			iput(inode);
			goto dput;
		}
	} else {
		zonefs_init_dir_inode(dir, inode, type);
	}

	d_add(dentry, inode);
	dir->i_size++;

	return dentry;

dput:
	dput(dentry);

	return NULL;
}

struct zonefs_zone_data {
	struct super_block	*sb;
	unsigned int		nr_zones[ZONEFS_ZTYPE_MAX];
	struct blk_zone		*zones;
};

/*
 * Create a zone group and populate it with zone files.
 */
static int zonefs_create_zgroup(struct zonefs_zone_data *zd,
				enum zonefs_ztype type)
{
	struct super_block *sb = zd->sb;
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct blk_zone *zone, *next, *end;
	const char *zgroup_name;
	char *file_name;
	struct dentry *dir;
	unsigned int n = 0;
	int ret;

	/* If the group is empty, there is nothing to do */
	if (!zd->nr_zones[type])
		return 0;

	file_name = kmalloc(ZONEFS_NAME_MAX, GFP_KERNEL);
	if (!file_name)
		return -ENOMEM;

	if (type == ZONEFS_ZTYPE_CNV)
		zgroup_name = "cnv";
	else
		zgroup_name = "seq";

	dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
	if (!dir) {
		ret = -ENOMEM;
		goto free;
	}

	/*
	 * The first zone contains the super block: skip it.
	 */
	end = zd->zones + blkdev_nr_zones(sb->s_bdev->bd_disk);
	for (zone = &zd->zones[1]; zone < end; zone = next) {

		next = zone + 1;
		if (zonefs_zone_type(zone) != type)
			continue;

		/*
		 * For conventional zones, contiguous zones can be aggregated
		 * together to form larger files. Note that this overwrites the
		 * length of the first zone of the set of contiguous zones
		 * aggregated together. If one offline or read-only zone is
		 * found, assume that all zones aggregated have the same
		 * condition.
		 */
		if (type == ZONEFS_ZTYPE_CNV &&
		    (sbi->s_features & ZONEFS_F_AGGRCNV)) {
			for (; next < end; next++) {
				if (zonefs_zone_type(next) != type)
					break;
				zone->len += next->len;
				zone->capacity += next->capacity;
				if (next->cond == BLK_ZONE_COND_READONLY &&
				    zone->cond != BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_READONLY;
				else if (next->cond == BLK_ZONE_COND_OFFLINE)
					zone->cond = BLK_ZONE_COND_OFFLINE;
			}
			if (zone->capacity != zone->len) {
				zonefs_err(sb, "Invalid conventional zone capacity\n");
				ret = -EINVAL;
				goto free;
			}
		}

		/*
		 * Use the file number within its group as the file name.
		 */
		snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
		if (!zonefs_create_inode(dir, file_name, zone, type)) {
			ret = -ENOMEM;
			goto free;
		}

		n++;
	}

	zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
		    zgroup_name, n, n > 1 ? "s" : "");

	sbi->s_nr_files[type] = n;
	ret = 0;

free:
	kfree(file_name);

	return ret;
}

static int zonefs_get_zone_info_cb(struct blk_zone *zone, unsigned int idx,
				   void *data)
{
	struct zonefs_zone_data *zd = data;

	/*
	 * Count the number of usable zones: the first zone at index 0 contains
	 * the super block and is ignored.
	 */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		zone->wp = zone->start + zone->len;
		if (idx)
			zd->nr_zones[ZONEFS_ZTYPE_CNV]++;
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		if (idx)
			zd->nr_zones[ZONEFS_ZTYPE_SEQ]++;
		break;
	default:
		zonefs_err(zd->sb, "Unsupported zone type 0x%x\n",
			   zone->type);
		return -EIO;
	}

	memcpy(&zd->zones[idx], zone, sizeof(struct blk_zone));

	return 0;
}

static int zonefs_get_zone_info(struct zonefs_zone_data *zd)
{
	struct block_device *bdev = zd->sb->s_bdev;
	int ret;

	zd->zones = kvcalloc(blkdev_nr_zones(bdev->bd_disk),
			     sizeof(struct blk_zone), GFP_KERNEL);
	if (!zd->zones)
		return -ENOMEM;

	/* Get zones information from the device */
	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				  zonefs_get_zone_info_cb, zd);
	if (ret < 0) {
		zonefs_err(zd->sb, "Zone report failed %d\n", ret);
		return ret;
	}

	if (ret != blkdev_nr_zones(bdev->bd_disk)) {
		zonefs_err(zd->sb, "Invalid zone report (%d/%u zones)\n",
			   ret, blkdev_nr_zones(bdev->bd_disk));
		return -EIO;
	}

	return 0;
}

static inline void zonefs_cleanup_zone_info(struct zonefs_zone_data *zd)
{
	kvfree(zd->zones);
}
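/*
 * The on-disk super block (struct zonefs_super, defined in zonefs.h)
 * occupies the start of the device first zone, which is excluded from
 * the zone groups above. All fields are little-endian and the structure
 * is protected by a crc32 checksum, both verified below.
 */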
/*
 * Read super block information from the device.
 */
static int zonefs_read_super(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
	struct zonefs_super *super;
	u32 crc, stored_crc;
	struct page *page;
	struct bio_vec bio_vec;
	struct bio bio;
	int ret;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = 0;
	bio_add_page(&bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(&bio);
	if (ret)
		goto free_page;

	super = kmap(page);

	ret = -EINVAL;
	if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
		goto unmap;

	stored_crc = le32_to_cpu(super->s_crc);
	super->s_crc = 0;
	crc = crc32(~0U, (unsigned char *)super, sizeof(struct zonefs_super));
	if (crc != stored_crc) {
		zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)",
			   crc, stored_crc);
		goto unmap;
	}

	sbi->s_features = le64_to_cpu(super->s_features);
	if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) {
		zonefs_err(sb, "Unknown features set 0x%llx\n",
			   sbi->s_features);
		goto unmap;
	}

	if (sbi->s_features & ZONEFS_F_UID) {
		sbi->s_uid = make_kuid(current_user_ns(),
				       le32_to_cpu(super->s_uid));
		if (!uid_valid(sbi->s_uid)) {
			zonefs_err(sb, "Invalid UID feature\n");
			goto unmap;
		}
	}

	if (sbi->s_features & ZONEFS_F_GID) {
		sbi->s_gid = make_kgid(current_user_ns(),
				       le32_to_cpu(super->s_gid));
		if (!gid_valid(sbi->s_gid)) {
			zonefs_err(sb, "Invalid GID feature\n");
			goto unmap;
		}
	}

	if (sbi->s_features & ZONEFS_F_PERM)
		sbi->s_perm = le32_to_cpu(super->s_perm);

	if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) {
		zonefs_err(sb, "Reserved area is being used\n");
		goto unmap;
	}

	import_uuid(&sbi->s_uuid, super->s_uuid);
	ret = 0;

unmap:
	kunmap(page);
free_page:
	__free_page(page);

	return ret;
}
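/*
 * Example resulting hierarchy for a device with both conventional and
 * sequential zones (file counts depend on the device):
 *
 *   /mnt/cnv/0 ... /mnt/cnv/<n>
 *   /mnt/seq/0 ... /mnt/seq/<m>
 */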
/*
 * Check that the device is zoned. If it is, get the list of zones and create
 * sub-directories and files according to the device zone configuration and
 * format options.
 */
static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
{
	struct zonefs_zone_data zd;
	struct zonefs_sb_info *sbi;
	struct inode *inode;
	enum zonefs_ztype t;
	int ret;

	if (!bdev_is_zoned(sb->s_bdev)) {
		zonefs_err(sb, "Not a zoned block device\n");
		return -EINVAL;
	}

	/*
	 * Initialize super block information: the maximum file size is updated
	 * when the zone files are created so that the format option
	 * ZONEFS_F_AGGRCNV which increases the maximum file size of a file
	 * beyond the zone size is taken into account.
	 */
	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
	if (!sbi)
		return -ENOMEM;

	spin_lock_init(&sbi->s_lock);
	sb->s_fs_info = sbi;
	sb->s_magic = ZONEFS_MAGIC;
	sb->s_maxbytes = 0;
	sb->s_op = &zonefs_sops;
	sb->s_time_gran = 1;

	/*
	 * The block size is set to the device zone write granularity to ensure
	 * that write operations are always aligned according to the device
	 * interface constraints.
	 */
	sb_set_blocksize(sb, bdev_zone_write_granularity(sb->s_bdev));
	sbi->s_zone_sectors_shift = ilog2(bdev_zone_sectors(sb->s_bdev));
	sbi->s_uid = GLOBAL_ROOT_UID;
	sbi->s_gid = GLOBAL_ROOT_GID;
	sbi->s_perm = 0640;
	sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;

	atomic_set(&sbi->s_wro_seq_files, 0);
	sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
	if (!sbi->s_max_wro_seq_files &&
	    sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
		zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
		sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
	}

	atomic_set(&sbi->s_active_seq_files, 0);
	sbi->s_max_active_seq_files = bdev_max_active_zones(sb->s_bdev);

	ret = zonefs_read_super(sb);
	if (ret)
		return ret;

	ret = zonefs_parse_options(sb, data);
	if (ret)
		return ret;

	memset(&zd, 0, sizeof(struct zonefs_zone_data));
	zd.sb = sb;
	ret = zonefs_get_zone_info(&zd);
	if (ret)
		goto cleanup;

	ret = zonefs_sysfs_register(sb);
	if (ret)
		goto cleanup;

	zonefs_info(sb, "Mounting %u zones",
		    blkdev_nr_zones(sb->s_bdev->bd_disk));

	/* Create root directory inode */
	ret = -ENOMEM;
	inode = new_inode(sb);
	if (!inode)
		goto cleanup;

	inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk);
	inode->i_mode = S_IFDIR | 0555;
	inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode);
	inode->i_op = &zonefs_dir_inode_operations;
	inode->i_fop = &simple_dir_operations;
	set_nlink(inode, 2);

	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		goto cleanup;

	/* Create and populate files in zone groups directories */
	for (t = 0; t < ZONEFS_ZTYPE_MAX; t++) {
		ret = zonefs_create_zgroup(&zd, t);
		if (ret)
			break;
	}

cleanup:
	zonefs_cleanup_zone_info(&zd);

	return ret;
}

static struct dentry *zonefs_mount(struct file_system_type *fs_type,
				   int flags, const char *dev_name, void *data)
{
	return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super);
}

static void zonefs_kill_super(struct super_block *sb)
{
	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);

	if (sb->s_root)
		d_genocide(sb->s_root);

	zonefs_sysfs_unregister(sb);
	kill_block_super(sb);
	kfree(sbi);
}

/*
 * File system definition and registration.
 */
static struct file_system_type zonefs_type = {
	.owner		= THIS_MODULE,
	.name		= "zonefs",
	.mount		= zonefs_mount,
	.kill_sb	= zonefs_kill_super,
	.fs_flags	= FS_REQUIRES_DEV,
};

static int __init zonefs_init_inodecache(void)
{
	zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache",
			sizeof(struct zonefs_inode_info), 0,
			(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
			NULL);
	if (zonefs_inode_cachep == NULL)
		return -ENOMEM;
	return 0;
}
static void zonefs_destroy_inodecache(void)
{
	/*
	 * Make sure all delayed rcu free inodes are flushed before we
	 * destroy the inode cache.
	 */
	rcu_barrier();
	kmem_cache_destroy(zonefs_inode_cachep);
}

static int __init zonefs_init(void)
{
	int ret;

	BUILD_BUG_ON(sizeof(struct zonefs_super) != ZONEFS_SUPER_SIZE);

	ret = zonefs_init_inodecache();
	if (ret)
		return ret;

	ret = register_filesystem(&zonefs_type);
	if (ret)
		goto destroy_inodecache;

	ret = zonefs_sysfs_init();
	if (ret)
		goto unregister_fs;

	return 0;

unregister_fs:
	unregister_filesystem(&zonefs_type);
destroy_inodecache:
	zonefs_destroy_inodecache();

	return ret;
}

static void __exit zonefs_exit(void)
{
	zonefs_sysfs_exit();
	zonefs_destroy_inodecache();
	unregister_filesystem(&zonefs_type);
}

MODULE_AUTHOR("Damien Le Moal");
MODULE_DESCRIPTION("Zone file system for zoned block devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_FS("zonefs");
module_init(zonefs_init);
module_exit(zonefs_exit);