1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * (C) 2001 Clemson University and The University of Chicago 4 * Copyright 2018 Omnibond Systems, L.L.C. 5 * 6 * See COPYING in top-level directory. 7 */ 8 9 /* 10 * Linux VFS file operations. 11 */ 12 13 #include "protocol.h" 14 #include "orangefs-kernel.h" 15 #include "orangefs-bufmap.h" 16 #include <linux/fs.h> 17 #include <linux/pagemap.h> 18 19 static int flush_racache(struct inode *inode) 20 { 21 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 22 struct orangefs_kernel_op_s *new_op; 23 int ret; 24 25 gossip_debug(GOSSIP_UTILS_DEBUG, 26 "%s: %pU: Handle is %pU | fs_id %d\n", __func__, 27 get_khandle_from_ino(inode), &orangefs_inode->refn.khandle, 28 orangefs_inode->refn.fs_id); 29 30 new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH); 31 if (!new_op) 32 return -ENOMEM; 33 new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn; 34 35 ret = service_operation(new_op, "orangefs_flush_racache", 36 get_interruptible_flag(inode)); 37 38 gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n", 39 __func__, ret); 40 41 op_release(new_op); 42 return ret; 43 } 44 45 /* 46 * Post and wait for the I/O upcall to finish 47 */ 48 ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, 49 loff_t *offset, struct iov_iter *iter, size_t total_size, 50 loff_t readahead_size, struct orangefs_write_range *wr, int *index_return) 51 { 52 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 53 struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; 54 struct orangefs_kernel_op_s *new_op = NULL; 55 int buffer_index = -1; 56 ssize_t ret; 57 size_t copy_amount; 58 59 new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); 60 if (!new_op) 61 return -ENOMEM; 62 63 /* synchronous I/O */ 64 new_op->upcall.req.io.readahead_size = readahead_size; 65 new_op->upcall.req.io.io_type = type; 66 new_op->upcall.req.io.refn = orangefs_inode->refn; 67 68 populate_shared_memory: 69 /* get a shared buffer index */ 70 
buffer_index = orangefs_bufmap_get(); 71 if (buffer_index < 0) { 72 ret = buffer_index; 73 gossip_debug(GOSSIP_FILE_DEBUG, 74 "%s: orangefs_bufmap_get failure (%zd)\n", 75 __func__, ret); 76 goto out; 77 } 78 gossip_debug(GOSSIP_FILE_DEBUG, 79 "%s(%pU): GET op %p -> buffer_index %d\n", 80 __func__, 81 handle, 82 new_op, 83 buffer_index); 84 85 new_op->uses_shared_memory = 1; 86 new_op->upcall.req.io.buf_index = buffer_index; 87 new_op->upcall.req.io.count = total_size; 88 new_op->upcall.req.io.offset = *offset; 89 if (type == ORANGEFS_IO_WRITE && wr) { 90 new_op->upcall.uid = from_kuid(&init_user_ns, wr->uid); 91 new_op->upcall.gid = from_kgid(&init_user_ns, wr->gid); 92 } 93 94 gossip_debug(GOSSIP_FILE_DEBUG, 95 "%s(%pU): offset: %llu total_size: %zd\n", 96 __func__, 97 handle, 98 llu(*offset), 99 total_size); 100 /* 101 * Stage 1: copy the buffers into client-core's address space 102 */ 103 if (type == ORANGEFS_IO_WRITE && total_size) { 104 ret = orangefs_bufmap_copy_from_iovec(iter, buffer_index, 105 total_size); 106 if (ret < 0) { 107 gossip_err("%s: Failed to copy-in buffers. Please make sure that the pvfs2-client is running. %ld\n", 108 __func__, (long)ret); 109 goto out; 110 } 111 } 112 113 gossip_debug(GOSSIP_FILE_DEBUG, 114 "%s(%pU): Calling post_io_request with tag (%llu)\n", 115 __func__, 116 handle, 117 llu(new_op->tag)); 118 119 /* Stage 2: Service the I/O operation */ 120 ret = service_operation(new_op, 121 type == ORANGEFS_IO_WRITE ? 122 "file_write" : 123 "file_read", 124 get_interruptible_flag(inode)); 125 126 /* 127 * If service_operation() returns -EAGAIN #and# the operation was 128 * purged from orangefs_request_list or htable_ops_in_progress, then 129 * we know that the client was restarted, causing the shared memory 130 * area to be wiped clean. To restart a write operation in this 131 * case, we must re-copy the data from the user's iovec to a NEW 132 * shared memory location. 
To restart a read operation, we must get 133 * a new shared memory location. 134 */ 135 if (ret == -EAGAIN && op_state_purged(new_op)) { 136 orangefs_bufmap_put(buffer_index); 137 buffer_index = -1; 138 if (type == ORANGEFS_IO_WRITE) 139 iov_iter_revert(iter, total_size); 140 gossip_debug(GOSSIP_FILE_DEBUG, 141 "%s:going to repopulate_shared_memory.\n", 142 __func__); 143 goto populate_shared_memory; 144 } 145 146 if (ret < 0) { 147 if (ret == -EINTR) { 148 /* 149 * We can't return EINTR if any data was written, 150 * it's not POSIX. It is minimally acceptable 151 * to give a partial write, the way NFS does. 152 * 153 * It would be optimal to return all or nothing, 154 * but if a userspace write is bigger than 155 * an IO buffer, and the interrupt occurs 156 * between buffer writes, that would not be 157 * possible. 158 */ 159 switch (new_op->op_state - OP_VFS_STATE_GIVEN_UP) { 160 /* 161 * If the op was waiting when the interrupt 162 * occurred, then the client-core did not 163 * trigger the write. 164 */ 165 case OP_VFS_STATE_WAITING: 166 if (*offset == 0) 167 ret = -EINTR; 168 else 169 ret = 0; 170 break; 171 /* 172 * If the op was in progress when the interrupt 173 * occurred, then the client-core was able to 174 * trigger the write. 175 */ 176 case OP_VFS_STATE_INPROGR: 177 if (type == ORANGEFS_IO_READ) 178 ret = -EINTR; 179 else 180 ret = total_size; 181 break; 182 default: 183 gossip_err("%s: unexpected op state :%d:.\n", 184 __func__, 185 new_op->op_state); 186 ret = 0; 187 break; 188 } 189 gossip_debug(GOSSIP_FILE_DEBUG, 190 "%s: got EINTR, state:%d: %p\n", 191 __func__, 192 new_op->op_state, 193 new_op); 194 } else { 195 gossip_err("%s: error in %s handle %pU, returning %zd\n", 196 __func__, 197 type == ORANGEFS_IO_READ ? 
198 "read from" : "write to", 199 handle, ret); 200 } 201 if (orangefs_cancel_op_in_progress(new_op)) 202 return ret; 203 204 goto out; 205 } 206 207 /* 208 * Stage 3: Post copy buffers from client-core's address space 209 */ 210 if (type == ORANGEFS_IO_READ && new_op->downcall.resp.io.amt_complete) { 211 /* 212 * NOTE: the iovector can either contain addresses which 213 * can futher be kernel-space or user-space addresses. 214 * or it can pointers to struct page's 215 */ 216 217 /* 218 * When reading, readahead_size will only be zero when 219 * we're doing O_DIRECT, otherwise we got here from 220 * orangefs_readpage. 221 * 222 * If we got here from orangefs_readpage we want to 223 * copy either a page or the whole file into the io 224 * vector, whichever is smaller. 225 */ 226 if (readahead_size) 227 copy_amount = 228 min(new_op->downcall.resp.io.amt_complete, 229 (__s64)PAGE_SIZE); 230 else 231 copy_amount = new_op->downcall.resp.io.amt_complete; 232 233 ret = orangefs_bufmap_copy_to_iovec(iter, buffer_index, 234 copy_amount); 235 if (ret < 0) { 236 gossip_err("%s: Failed to copy-out buffers. Please make sure that the pvfs2-client is running (%ld)\n", 237 __func__, (long)ret); 238 goto out; 239 } 240 } 241 gossip_debug(GOSSIP_FILE_DEBUG, 242 "%s(%pU): Amount %s, returned by the sys-io call:%d\n", 243 __func__, 244 handle, 245 type == ORANGEFS_IO_READ ? 
"read" : "written", 246 (int)new_op->downcall.resp.io.amt_complete); 247 248 ret = new_op->downcall.resp.io.amt_complete; 249 250 out: 251 if (buffer_index >= 0) { 252 if ((readahead_size) && (type == ORANGEFS_IO_READ)) { 253 /* readpage */ 254 *index_return = buffer_index; 255 gossip_debug(GOSSIP_FILE_DEBUG, 256 "%s: hold on to buffer_index :%d:\n", 257 __func__, buffer_index); 258 } else { 259 /* O_DIRECT */ 260 orangefs_bufmap_put(buffer_index); 261 gossip_debug(GOSSIP_FILE_DEBUG, 262 "%s(%pU): PUT buffer_index %d\n", 263 __func__, handle, buffer_index); 264 } 265 buffer_index = -1; 266 } 267 op_release(new_op); 268 return ret; 269 } 270 271 int orangefs_revalidate_mapping(struct inode *inode) 272 { 273 struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); 274 struct address_space *mapping = inode->i_mapping; 275 unsigned long *bitlock = &orangefs_inode->bitlock; 276 int ret; 277 278 while (1) { 279 ret = wait_on_bit(bitlock, 1, TASK_KILLABLE); 280 if (ret) 281 return ret; 282 spin_lock(&inode->i_lock); 283 if (test_bit(1, bitlock)) { 284 spin_unlock(&inode->i_lock); 285 continue; 286 } 287 if (!time_before(jiffies, orangefs_inode->mapping_time)) 288 break; 289 spin_unlock(&inode->i_lock); 290 return 0; 291 } 292 293 set_bit(1, bitlock); 294 smp_wmb(); 295 spin_unlock(&inode->i_lock); 296 297 unmap_mapping_range(mapping, 0, 0, 0); 298 ret = filemap_write_and_wait(mapping); 299 if (!ret) 300 ret = invalidate_inode_pages2(mapping); 301 302 orangefs_inode->mapping_time = jiffies + 303 orangefs_cache_timeout_msecs*HZ/1000; 304 305 clear_bit(1, bitlock); 306 smp_mb__after_atomic(); 307 wake_up_bit(bitlock, 1); 308 309 return ret; 310 } 311 312 static ssize_t orangefs_file_read_iter(struct kiocb *iocb, 313 struct iov_iter *iter) 314 { 315 int ret; 316 struct orangefs_read_options *ro; 317 318 orangefs_stats.reads++; 319 320 /* 321 * Remember how they set "count" in read(2) or pread(2) or whatever - 322 * users can use count as a knob to control orangefs io size 
and later 323 * we can try to help them fill as many pages as possible in readpage. 324 */ 325 if (!iocb->ki_filp->private_data) { 326 iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL); 327 if (!iocb->ki_filp->private_data) 328 return(ENOMEM); 329 ro = iocb->ki_filp->private_data; 330 ro->blksiz = iter->count; 331 } 332 333 down_read(&file_inode(iocb->ki_filp)->i_rwsem); 334 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 335 if (ret) 336 goto out; 337 338 ret = generic_file_read_iter(iocb, iter); 339 out: 340 up_read(&file_inode(iocb->ki_filp)->i_rwsem); 341 return ret; 342 } 343 344 static ssize_t orangefs_file_write_iter(struct kiocb *iocb, 345 struct iov_iter *iter) 346 { 347 int ret; 348 orangefs_stats.writes++; 349 350 if (iocb->ki_pos > i_size_read(file_inode(iocb->ki_filp))) { 351 ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp)); 352 if (ret) 353 return ret; 354 } 355 356 ret = generic_file_write_iter(iocb, iter); 357 return ret; 358 } 359 360 static int orangefs_getflags(struct inode *inode, unsigned long *uval) 361 { 362 __u64 val = 0; 363 int ret; 364 365 ret = orangefs_inode_getxattr(inode, 366 "user.pvfs2.meta_hint", 367 &val, sizeof(val)); 368 if (ret < 0 && ret != -ENODATA) 369 return ret; 370 else if (ret == -ENODATA) 371 val = 0; 372 *uval = val; 373 return 0; 374 } 375 376 /* 377 * Perform a miscellaneous operation on a file. 
 */
static long orangefs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct inode *inode = file_inode(file);
	int ret = -ENOTTY;	/* default: not our ioctl */
	__u64 val = 0;
	unsigned long uval;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_ioctl: called with cmd %d\n",
		     cmd);

	/*
	 * we understand some general ioctls on files, such as the immutable
	 * and append flags
	 */
	if (cmd == FS_IOC_GETFLAGS) {
		ret = orangefs_getflags(inode, &uval);
		if (ret)
			return ret;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "orangefs_ioctl: FS_IOC_GETFLAGS: %llu\n",
			     (unsigned long long)uval);
		return put_user(uval, (int __user *)arg);
	} else if (cmd == FS_IOC_SETFLAGS) {
		unsigned long old_uval;

		ret = 0;
		if (get_user(uval, (int __user *)arg))
			return -EFAULT;
		/*
		 * ORANGEFS_MIRROR_FL is set internally when the mirroring mode
		 * is turned on for a file. The user is not allowed to turn
		 * on this bit, but the bit is present if the user first gets
		 * the flags and then updates the flags with some new
		 * settings. So, we ignore it in the following edit. bligon.
		 */
		/* reject any flag other than IMMUTABLE/APPEND/NOATIME */
		if ((uval & ~ORANGEFS_MIRROR_FL) &
		    (~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL))) {
			gossip_err("orangefs_ioctl: the FS_IOC_SETFLAGS only supports setting one of FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NOATIME_FL\n");
			return -EINVAL;
		}
		ret = orangefs_getflags(inode, &old_uval);
		if (ret)
			return ret;
		/* let the VFS enforce permission/immutability rules */
		ret = vfs_ioc_setflags_prepare(inode, old_uval, uval);
		if (ret)
			return ret;
		val = uval;
		gossip_debug(GOSSIP_FILE_DEBUG,
			     "orangefs_ioctl: FS_IOC_SETFLAGS: %llu\n",
			     (unsigned long long)val);
		/* persist the new flags in the meta_hint xattr */
		ret = orangefs_inode_setxattr(inode,
					      "user.pvfs2.meta_hint",
					      &val, sizeof(val), 0);
	}

	return ret;
}

/*
 * Page-fault handler: refresh the cached file size from the server
 * before letting the generic fault code fill the page.
 */
static vm_fault_t orangefs_fault(struct vm_fault *vmf)
{
	struct file *file = vmf->vma->vm_file;
	int ret;

	ret = orangefs_inode_getattr(file->f_mapping->host,
				     ORANGEFS_GETATTR_SIZE);
	/* -ESTALE would confuse the mm layer; report plain I/O error */
	if (ret == -ESTALE)
		ret = -EIO;
	if (ret) {
		gossip_err("%s: orangefs_inode_getattr failed, "
		    "ret:%d:.\n", __func__, ret);
		return VM_FAULT_SIGBUS;
	}
	return filemap_fault(vmf);
}

static const struct vm_operations_struct orangefs_file_vm_ops = {
	.fault = orangefs_fault,
	.map_pages = filemap_map_pages,
	.page_mkwrite = orangefs_page_mkwrite,
};

/*
 * Memory map a region of a file.
 */
static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	int ret;

	ret = orangefs_revalidate_mapping(file_inode(file));
	if (ret)
		return ret;

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_file_mmap: called on %s\n",
		     (file ?
		      (char *)file->f_path.dentry->d_name.name :
		      (char *)"Unknown"));

	/* set the sequential readahead hint */
	vma->vm_flags |= VM_SEQ_READ;
	vma->vm_flags &= ~VM_RAND_READ;

	file_accessed(file);
	vma->vm_ops = &orangefs_file_vm_ops;
	return 0;
}

#define mapping_nrpages(idata) ((idata)->nrpages)

/*
 * Called to notify the module that there are no more references to
 * this file (i.e. no processes have it open).
 *
 * \note Not called when each file is closed.
 */
static int orangefs_file_release(struct inode *inode, struct file *file)
{
	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_file_release: called on %pD\n",
		     file);

	/*
	 * remove all associated inode pages from the page cache and
	 * readahead cache (if any); this forces an expensive refresh of
	 * data for the next caller of mmap (or 'get_block' accesses)
	 */
	if (file_inode(file) &&
	    file_inode(file)->i_mapping &&
	    mapping_nrpages(&file_inode(file)->i_data)) {
		if (orangefs_features & ORANGEFS_FEATURE_READAHEAD) {
			gossip_debug(GOSSIP_INODE_DEBUG,
				     "calling flush_racache on %pU\n",
				     get_khandle_from_ino(inode));
			flush_racache(inode);
			gossip_debug(GOSSIP_INODE_DEBUG,
				     "flush_racache finished\n");
		}

	}
	return 0;
}

/*
 * Push all data for a specific file onto permanent storage.
 */
static int orangefs_fsync(struct file *file,
			  loff_t start,
			  loff_t end,
			  int datasync)
{
	int ret;
	struct orangefs_inode_s *orangefs_inode =
	    ORANGEFS_I(file_inode(file));
	struct orangefs_kernel_op_s *new_op = NULL;

	/* flush dirty pages in the range before asking the server to sync */
	ret = filemap_write_and_wait_range(file_inode(file)->i_mapping,
	    start, end);
	if (ret < 0)
		return ret;

	new_op = op_alloc(ORANGEFS_VFS_OP_FSYNC);
	if (!new_op)
		return -ENOMEM;
	new_op->upcall.req.fsync.refn = orangefs_inode->refn;

	ret = service_operation(new_op,
	    "orangefs_fsync",
	    get_interruptible_flag(file_inode(file)));

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_fsync got return value of %d\n",
		     ret);

	op_release(new_op);
	return ret;
}

/*
 * Change the file pointer position for an instance of an open file.
 *
 * \note If .llseek is overriden, we must acquire lock as described in
 *       Documentation/filesystems/Locking.
 *
 * Future upgrade could support SEEK_DATA and SEEK_HOLE but would
 * require much changes to the FS
 */
static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin)
{
	int ret = -EINVAL;
	struct inode *inode = file_inode(file);

	if (origin == SEEK_END) {
		/*
		 * revalidate the inode's file size.
		 * NOTE: We are only interested in file size here,
		 * so we set mask accordingly.
		 */
		ret = orangefs_inode_getattr(file->f_mapping->host,
		    ORANGEFS_GETATTR_SIZE);
		if (ret == -ESTALE)
			ret = -EIO;
		if (ret) {
			gossip_debug(GOSSIP_FILE_DEBUG,
				     "%s:%s:%d calling make bad inode\n",
				     __FILE__,
				     __func__,
				     __LINE__);
			return ret;
		}
	}

	gossip_debug(GOSSIP_FILE_DEBUG,
		     "orangefs_file_llseek: offset is %ld | origin is %d"
		     " | inode size is %lu\n",
		     (long)offset,
		     origin,
		     (unsigned long)i_size_read(inode));

	return generic_file_llseek(file, offset, origin);
}

/*
 * Support local locks (locks that only this kernel knows about)
 * if Orangefs was mounted -o local_lock.
 */
static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
{
	int rc = -EINVAL;

	if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) {
		if (cmd == F_GETLK) {
			rc = 0;
			posix_test_lock(filp, fl);
		} else {
			rc = posix_lock_file(filp, fl, NULL);
		}
	}

	return rc;
}

/*
 * Open: clear per-open read-options state (allocated lazily in
 * read_iter, freed in flush) and defer to the generic open.
 */
static int orangefs_file_open(struct inode * inode, struct file *file)
{
	file->private_data = NULL;
	return generic_file_open(inode, file);
}

static int orangefs_flush(struct file *file, fl_owner_t id)
{
	/*
	 * This is vfs_fsync_range(file, 0, LLONG_MAX, 0) without the
	 * service_operation in orangefs_fsync.
	 *
	 * Do not send fsync to OrangeFS server on a close.  Do send fsync
	 * on an explicit fsync call.  This duplicates historical OrangeFS
	 * behavior.
	 */
	struct inode *inode = file->f_mapping->host;
	int r;

	/* drop the per-open read options allocated in read_iter */
	kfree(file->private_data);
	file->private_data = NULL;

	if (inode->i_state & I_DIRTY_TIME) {
		spin_lock(&inode->i_lock);
		inode->i_state &= ~I_DIRTY_TIME;
		spin_unlock(&inode->i_lock);
		mark_inode_dirty_sync(inode);
	}

	r = filemap_write_and_wait_range(file->f_mapping, 0, LLONG_MAX);
	/* defensive: treat any positive return as success */
	if (r > 0)
		return 0;
	else
		return r;
}

/** ORANGEFS implementation of VFS file operations */
const struct file_operations orangefs_file_operations = {
	.llseek		= orangefs_file_llseek,
	.read_iter	= orangefs_file_read_iter,
	.write_iter	= orangefs_file_write_iter,
	.lock		= orangefs_lock,
	.unlocked_ioctl	= orangefs_ioctl,
	.mmap		= orangefs_file_mmap,
	.open		= orangefs_file_open,
	.flush		= orangefs_flush,
	.release	= orangefs_file_release,
	.fsync		= orangefs_fsync,
};