1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2024-2026 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trace.h" 15 #include "xfs_ag.h" 16 #include "xfs_btree.h" 17 #include "xfs_da_format.h" 18 #include "xfs_da_btree.h" 19 #include "xfs_quota_defs.h" 20 #include "xfs_rtgroup.h" 21 #include "xfs_health.h" 22 #include "xfs_healthmon.h" 23 #include "xfs_fsops.h" 24 #include "xfs_notify_failure.h" 25 #include "xfs_file.h" 26 #include "xfs_ioctl.h" 27 28 #include <linux/anon_inodes.h> 29 #include <linux/eventpoll.h> 30 #include <linux/poll.h> 31 #include <linux/fserror.h> 32 33 /* 34 * Live Health Monitoring 35 * ====================== 36 * 37 * Autonomous self-healing of XFS filesystems requires a means for the kernel 38 * to send filesystem health events to a monitoring daemon in userspace. To 39 * accomplish this, we establish a thread_with_file kthread object to handle 40 * translating internal events about filesystem health into a format that can 41 * be parsed easily by userspace. When those internal events occur, the core 42 * filesystem code calls this health monitor to convey the events to userspace. 43 * Userspace reads events from the file descriptor returned by the ioctl. 44 * 45 * The healthmon abstraction has a weak reference to the host filesystem mount 46 * so that the queueing and processing of the events do not pin the mount and 47 * cannot slow down the main filesystem. The healthmon object can exist past 48 * the end of the filesystem mount. 49 */ 50 51 /* sign of a detached health monitor */ 52 #define DETACHED_MOUNT_COOKIE ((uintptr_t)0) 53 54 /* Constrain the number of event objects that can build up in memory. */ 55 #define XFS_HEALTHMON_MAX_EVENTS (SZ_32K / \ 56 sizeof(struct xfs_healthmon_event)) 57 58 /* Constrain the size of the output buffer for read_iter. */ 59 #define XFS_HEALTHMON_MAX_OUTBUF SZ_64K 60 61 /* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */ 62 static DEFINE_SPINLOCK(xfs_healthmon_lock); 63 64 /* Grab a reference to the healthmon object for a given mount, if any. */ 65 static struct xfs_healthmon * 66 xfs_healthmon_get( 67 struct xfs_mount *mp) 68 { 69 struct xfs_healthmon *hm; 70 71 rcu_read_lock(); 72 hm = rcu_dereference(mp->m_healthmon); 73 if (hm && !refcount_inc_not_zero(&hm->ref)) 74 hm = NULL; 75 rcu_read_unlock(); 76 77 return hm; 78 } 79 80 /* 81 * Release the reference to a healthmon object. If there are no more holders, 82 * free the health monitor after an RCU grace period to eliminate possibility 83 * of races with xfs_healthmon_get. 84 */ 85 static void 86 xfs_healthmon_put( 87 struct xfs_healthmon *hm) 88 { 89 if (refcount_dec_and_test(&hm->ref)) { 90 struct xfs_healthmon_event *event; 91 struct xfs_healthmon_event *next = hm->first_event; 92 93 while ((event = next) != NULL) { 94 trace_xfs_healthmon_drop(hm, event); 95 next = event->next; 96 kfree(event); 97 } 98 99 kfree(hm->unmount_event); 100 kfree(hm->buffer); 101 mutex_destroy(&hm->lock); 102 kfree_rcu_mightsleep(hm); 103 } 104 } 105 106 /* Attach a health monitor to an xfs_mount. Only one allowed at a time. */ 107 STATIC int 108 xfs_healthmon_attach( 109 struct xfs_mount *mp, 110 struct xfs_healthmon *hm) 111 { 112 spin_lock(&xfs_healthmon_lock); 113 if (rcu_access_pointer(mp->m_healthmon) != NULL) { 114 spin_unlock(&xfs_healthmon_lock); 115 return -EEXIST; 116 } 117 118 refcount_inc(&hm->ref); 119 rcu_assign_pointer(mp->m_healthmon, hm); 120 hm->mount_cookie = (uintptr_t)mp->m_super; 121 spin_unlock(&xfs_healthmon_lock); 122 123 return 0; 124 } 125 126 /* Detach a xfs mount from a specific healthmon instance. */ 127 STATIC void 128 xfs_healthmon_detach( 129 struct xfs_healthmon *hm) 130 { 131 struct xfs_mount *mp; 132 133 spin_lock(&xfs_healthmon_lock); 134 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) { 135 spin_unlock(&xfs_healthmon_lock); 136 return; 137 } 138 139 mp = XFS_M((struct super_block *)hm->mount_cookie); 140 rcu_assign_pointer(mp->m_healthmon, NULL); 141 hm->mount_cookie = DETACHED_MOUNT_COOKIE; 142 spin_unlock(&xfs_healthmon_lock); 143 144 /* 145 * Wake up any readers that might remain. This can happen if unmount 146 * races with the healthmon fd owner entering ->read_iter, having 147 * already emptied the event queue. 148 * 149 * In the ->release case there shouldn't be any readers because the 150 * only users of the waiter are read and poll. 151 */ 152 wake_up_all(&hm->wait); 153 154 trace_xfs_healthmon_detach(hm); 155 xfs_healthmon_put(hm); 156 } 157 158 static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm) 159 { 160 hm->events++; 161 hm->total_events++; 162 } 163 164 static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm) 165 { 166 hm->lost_prev_event++; 167 hm->total_lost++; 168 } 169 170 /* 171 * If possible, merge a new event into an existing event. Returns whether or 172 * not it merged anything. 173 */ 174 static bool 175 xfs_healthmon_merge_events( 176 struct xfs_healthmon_event *existing, 177 const struct xfs_healthmon_event *new) 178 { 179 if (!existing) 180 return false; 181 182 /* type and domain must match to merge events */ 183 if (existing->type != new->type || 184 existing->domain != new->domain) 185 return false; 186 187 switch (existing->type) { 188 case XFS_HEALTHMON_RUNNING: 189 case XFS_HEALTHMON_UNMOUNT: 190 /* should only ever be one of these events anyway */ 191 return false; 192 193 case XFS_HEALTHMON_LOST: 194 existing->lostcount += new->lostcount; 195 return true; 196 197 case XFS_HEALTHMON_SICK: 198 case XFS_HEALTHMON_CORRUPT: 199 case XFS_HEALTHMON_HEALTHY: 200 switch (existing->domain) { 201 case XFS_HEALTHMON_FS: 202 existing->fsmask |= new->fsmask; 203 return true; 204 case XFS_HEALTHMON_AG: 205 case XFS_HEALTHMON_RTGROUP: 206 if (existing->group == new->group){ 207 existing->grpmask |= new->grpmask; 208 return true; 209 } 210 return false; 211 case XFS_HEALTHMON_INODE: 212 if (existing->ino == new->ino && 213 existing->gen == new->gen) { 214 existing->imask |= new->imask; 215 return true; 216 } 217 return false; 218 default: 219 ASSERT(0); 220 return false; 221 } 222 return false; 223 224 case XFS_HEALTHMON_SHUTDOWN: 225 /* yes, we can race to shutdown */ 226 existing->flags |= new->flags; 227 return true; 228 229 case XFS_HEALTHMON_MEDIA_ERROR: 230 /* physically adjacent errors can merge */ 231 if (existing->daddr + existing->bbcount == new->daddr) { 232 existing->bbcount += new->bbcount; 233 return true; 234 } 235 if (new->daddr + new->bbcount == existing->daddr) { 236 existing->daddr = new->daddr; 237 existing->bbcount += new->bbcount; 238 return true; 239 } 240 return false; 241 242 case XFS_HEALTHMON_BUFREAD: 243 case XFS_HEALTHMON_BUFWRITE: 244 case XFS_HEALTHMON_DIOREAD: 245 case XFS_HEALTHMON_DIOWRITE: 246 case XFS_HEALTHMON_DATALOST: 247 /* logically adjacent file ranges can merge */ 248 if (existing->fino != new->fino || existing->fgen != new->fgen) 249 return false; 250 251 if (existing->fpos + existing->flen == new->fpos) { 252 existing->flen += new->flen; 253 return true; 254 } 255 256 if (new->fpos + new->flen == existing->fpos) { 257 existing->fpos = new->fpos; 258 existing->flen += new->flen; 259 return true; 260 } 261 return false; 262 } 263 264 return false; 265 } 266 267 /* Insert an event onto the start of the queue. */ 268 static inline void 269 __xfs_healthmon_insert( 270 struct xfs_healthmon *hm, 271 struct xfs_healthmon_event *event) 272 { 273 struct timespec64 now; 274 275 ktime_get_coarse_real_ts64(&now); 276 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; 277 278 event->next = hm->first_event; 279 if (!hm->first_event) 280 hm->first_event = event; 281 if (!hm->last_event) 282 hm->last_event = event; 283 xfs_healthmon_bump_events(hm); 284 wake_up(&hm->wait); 285 286 trace_xfs_healthmon_insert(hm, event); 287 } 288 289 /* Push an event onto the end of the queue. */ 290 static inline void 291 __xfs_healthmon_push( 292 struct xfs_healthmon *hm, 293 struct xfs_healthmon_event *event) 294 { 295 struct timespec64 now; 296 297 ktime_get_coarse_real_ts64(&now); 298 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; 299 300 if (!hm->first_event) 301 hm->first_event = event; 302 if (hm->last_event) 303 hm->last_event->next = event; 304 hm->last_event = event; 305 event->next = NULL; 306 xfs_healthmon_bump_events(hm); 307 wake_up(&hm->wait); 308 309 trace_xfs_healthmon_push(hm, event); 310 } 311 312 /* Deal with any previously lost events */ 313 static int 314 xfs_healthmon_clear_lost_prev( 315 struct xfs_healthmon *hm) 316 { 317 struct xfs_healthmon_event lost_event = { 318 .type = XFS_HEALTHMON_LOST, 319 .domain = XFS_HEALTHMON_MOUNT, 320 .lostcount = hm->lost_prev_event, 321 }; 322 struct xfs_healthmon_event *event = NULL; 323 324 if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) { 325 trace_xfs_healthmon_merge(hm, hm->last_event); 326 wake_up(&hm->wait); 327 goto cleared; 328 } 329 330 if (hm->events < XFS_HEALTHMON_MAX_EVENTS) 331 event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event), 332 GFP_NOFS); 333 if (!event) 334 return -ENOMEM; 335 336 __xfs_healthmon_push(hm, event); 337 cleared: 338 hm->lost_prev_event = 0; 339 return 0; 340 } 341 342 /* 343 * Push an event onto the end of the list after dealing with lost events and 344 * possibly full queues. 345 */ 346 STATIC int 347 xfs_healthmon_push( 348 struct xfs_healthmon *hm, 349 const struct xfs_healthmon_event *template) 350 { 351 struct xfs_healthmon_event *event = NULL; 352 int error = 0; 353 354 /* 355 * Locklessly check if the health monitor has already detached from the 356 * mount. If so, ignore the event. If we race with deactivation, 357 * we'll queue the event but never send it. 358 */ 359 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) 360 return -ESHUTDOWN; 361 362 mutex_lock(&hm->lock); 363 364 /* Report previously lost events before we do anything else */ 365 if (hm->lost_prev_event) { 366 error = xfs_healthmon_clear_lost_prev(hm); 367 if (error) 368 goto out_unlock; 369 } 370 371 /* Try to merge with the newest event */ 372 if (xfs_healthmon_merge_events(hm->last_event, template)) { 373 trace_xfs_healthmon_merge(hm, hm->last_event); 374 wake_up(&hm->wait); 375 goto out_unlock; 376 } 377 378 /* Only create a heap event object if we're not already at capacity. */ 379 if (hm->events < XFS_HEALTHMON_MAX_EVENTS) 380 event = kmemdup(template, sizeof(struct xfs_healthmon_event), 381 GFP_NOFS); 382 if (!event) { 383 /* No memory means we lose the event */ 384 trace_xfs_healthmon_lost_event(hm); 385 xfs_healthmon_bump_lost(hm); 386 error = -ENOMEM; 387 goto out_unlock; 388 } 389 390 __xfs_healthmon_push(hm, event); 391 392 out_unlock: 393 mutex_unlock(&hm->lock); 394 return error; 395 } 396 397 /* 398 * Report that the filesystem is being unmounted, then detach the xfs mount 399 * from this healthmon instance. 400 */ 401 void 402 xfs_healthmon_unmount( 403 struct xfs_mount *mp) 404 { 405 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 406 407 if (!hm) 408 return; 409 410 trace_xfs_healthmon_report_unmount(hm); 411 412 /* 413 * Insert the unmount notification at the start of the event queue so 414 * that userspace knows the filesystem went away as soon as possible. 415 * There's nothing actionable for userspace after an unmount. Once 416 * we've inserted the unmount event, hm no longer owns that event. 417 */ 418 __xfs_healthmon_insert(hm, hm->unmount_event); 419 hm->unmount_event = NULL; 420 421 xfs_healthmon_detach(hm); 422 xfs_healthmon_put(hm); 423 } 424 425 /* Compute the reporting mask for non-unmount metadata health events. */ 426 static inline unsigned int 427 metadata_event_mask( 428 struct xfs_healthmon *hm, 429 enum xfs_healthmon_type type, 430 unsigned int old_mask, 431 unsigned int new_mask) 432 { 433 /* If we want all events, return all events. */ 434 if (hm->verbose) 435 return new_mask; 436 437 switch (type) { 438 case XFS_HEALTHMON_SICK: 439 /* Always report runtime corruptions */ 440 return new_mask; 441 case XFS_HEALTHMON_CORRUPT: 442 /* Only report new fsck errors */ 443 return new_mask & ~old_mask; 444 case XFS_HEALTHMON_HEALTHY: 445 /* Only report healthy metadata that got fixed */ 446 return new_mask & old_mask; 447 default: 448 ASSERT(0); 449 break; 450 } 451 452 return 0; 453 } 454 455 /* Report XFS_FS_SICK_* events to healthmon */ 456 void 457 xfs_healthmon_report_fs( 458 struct xfs_mount *mp, 459 enum xfs_healthmon_type type, 460 unsigned int old_mask, 461 unsigned int new_mask) 462 { 463 struct xfs_healthmon_event event = { 464 .type = type, 465 .domain = XFS_HEALTHMON_FS, 466 }; 467 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 468 469 if (!hm) 470 return; 471 472 event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) & 473 ~XFS_SICK_FS_SECONDARY; 474 trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event); 475 476 if (event.fsmask) 477 xfs_healthmon_push(hm, &event); 478 479 xfs_healthmon_put(hm); 480 } 481 482 /* Report XFS_SICK_(AG|RG)* flags to healthmon */ 483 void 484 xfs_healthmon_report_group( 485 struct xfs_group *xg, 486 enum xfs_healthmon_type type, 487 unsigned int old_mask, 488 unsigned int new_mask) 489 { 490 struct xfs_healthmon_event event = { 491 .type = type, 492 .group = xg->xg_gno, 493 }; 494 struct xfs_healthmon *hm = xfs_healthmon_get(xg->xg_mount); 495 496 if (!hm) 497 return; 498 499 switch (xg->xg_type) { 500 case XG_TYPE_RTG: 501 event.domain = XFS_HEALTHMON_RTGROUP; 502 event.grpmask = metadata_event_mask(hm, type, old_mask, 503 new_mask) & 504 ~XFS_SICK_RG_SECONDARY; 505 break; 506 case XG_TYPE_AG: 507 event.domain = XFS_HEALTHMON_AG; 508 event.grpmask = metadata_event_mask(hm, type, old_mask, 509 new_mask) & 510 ~XFS_SICK_AG_SECONDARY; 511 break; 512 default: 513 ASSERT(0); 514 break; 515 } 516 517 trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event); 518 519 if (event.grpmask) 520 xfs_healthmon_push(hm, &event); 521 522 xfs_healthmon_put(hm); 523 } 524 525 /* Report XFS_SICK_INO_* flags to healthmon */ 526 void 527 xfs_healthmon_report_inode( 528 struct xfs_inode *ip, 529 enum xfs_healthmon_type type, 530 unsigned int old_mask, 531 unsigned int new_mask) 532 { 533 struct xfs_healthmon_event event = { 534 .type = type, 535 .domain = XFS_HEALTHMON_INODE, 536 .ino = ip->i_ino, 537 .gen = VFS_I(ip)->i_generation, 538 }; 539 struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); 540 541 if (!hm) 542 return; 543 544 event.imask = metadata_event_mask(hm, type, old_mask, new_mask) & 545 ~XFS_SICK_INO_SECONDARY; 546 trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event); 547 548 if (event.imask) 549 xfs_healthmon_push(hm, &event); 550 551 xfs_healthmon_put(hm); 552 } 553 554 /* Add a shutdown event to the reporting queue. */ 555 void 556 xfs_healthmon_report_shutdown( 557 struct xfs_mount *mp, 558 uint32_t flags) 559 { 560 struct xfs_healthmon_event event = { 561 .type = XFS_HEALTHMON_SHUTDOWN, 562 .domain = XFS_HEALTHMON_MOUNT, 563 .flags = flags, 564 }; 565 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 566 567 if (!hm) 568 return; 569 570 trace_xfs_healthmon_report_shutdown(hm, flags); 571 572 xfs_healthmon_push(hm, &event); 573 xfs_healthmon_put(hm); 574 } 575 576 static inline enum xfs_healthmon_domain 577 media_error_domain( 578 enum xfs_device fdev) 579 { 580 switch (fdev) { 581 case XFS_DEV_DATA: 582 return XFS_HEALTHMON_DATADEV; 583 case XFS_DEV_LOG: 584 return XFS_HEALTHMON_LOGDEV; 585 case XFS_DEV_RT: 586 return XFS_HEALTHMON_RTDEV; 587 } 588 589 ASSERT(0); 590 return 0; 591 } 592 593 /* Add a media error event to the reporting queue. */ 594 void 595 xfs_healthmon_report_media( 596 struct xfs_mount *mp, 597 enum xfs_device fdev, 598 xfs_daddr_t daddr, 599 uint64_t bbcount) 600 { 601 struct xfs_healthmon_event event = { 602 .type = XFS_HEALTHMON_MEDIA_ERROR, 603 .domain = media_error_domain(fdev), 604 .daddr = daddr, 605 .bbcount = bbcount, 606 }; 607 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 608 609 if (!hm) 610 return; 611 612 trace_xfs_healthmon_report_media(hm, fdev, &event); 613 614 xfs_healthmon_push(hm, &event); 615 xfs_healthmon_put(hm); 616 } 617 618 static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action) 619 { 620 switch (action) { 621 case FSERR_BUFFERED_READ: 622 return XFS_HEALTHMON_BUFREAD; 623 case FSERR_BUFFERED_WRITE: 624 return XFS_HEALTHMON_BUFWRITE; 625 case FSERR_DIRECTIO_READ: 626 return XFS_HEALTHMON_DIOREAD; 627 case FSERR_DIRECTIO_WRITE: 628 return XFS_HEALTHMON_DIOWRITE; 629 case FSERR_DATA_LOST: 630 return XFS_HEALTHMON_DATALOST; 631 case FSERR_METADATA: 632 /* filtered out by xfs_fs_report_error */ 633 break; 634 } 635 636 ASSERT(0); 637 return -1; 638 } 639 640 /* Add a file io error event to the reporting queue. */ 641 void 642 xfs_healthmon_report_file_ioerror( 643 struct xfs_inode *ip, 644 const struct fserror_event *p) 645 { 646 struct xfs_healthmon_event event = { 647 .type = file_ioerr_type(p->type), 648 .domain = XFS_HEALTHMON_FILERANGE, 649 .fino = ip->i_ino, 650 .fgen = VFS_I(ip)->i_generation, 651 .fpos = p->pos, 652 .flen = p->len, 653 /* send positive error number to userspace */ 654 .error = -p->error, 655 }; 656 struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); 657 658 if (!hm) 659 return; 660 661 trace_xfs_healthmon_report_file_ioerror(hm, p); 662 663 xfs_healthmon_push(hm, &event); 664 xfs_healthmon_put(hm); 665 } 666 667 static inline void 668 xfs_healthmon_reset_outbuf( 669 struct xfs_healthmon *hm) 670 { 671 hm->buftail = 0; 672 hm->bufhead = 0; 673 } 674 675 struct flags_map { 676 unsigned int in_mask; 677 unsigned int out_mask; 678 }; 679 680 static const struct flags_map shutdown_map[] = { 681 { SHUTDOWN_META_IO_ERROR, XFS_HEALTH_SHUTDOWN_META_IO_ERROR }, 682 { SHUTDOWN_LOG_IO_ERROR, XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR }, 683 { SHUTDOWN_FORCE_UMOUNT, XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT }, 684 { SHUTDOWN_CORRUPT_INCORE, XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE }, 685 { SHUTDOWN_CORRUPT_ONDISK, XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK }, 686 { SHUTDOWN_DEVICE_REMOVED, XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED }, 687 }; 688 689 static inline unsigned int 690 __map_flags( 691 const struct flags_map *map, 692 size_t array_len, 693 unsigned int flags) 694 { 695 const struct flags_map *m; 696 unsigned int ret = 0; 697 698 for (m = map; m < map + array_len; m++) { 699 if (flags & m->in_mask) 700 ret |= m->out_mask; 701 } 702 703 return ret; 704 } 705 706 #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags)) 707 708 static inline unsigned int shutdown_mask(unsigned int in) 709 { 710 return map_flags(shutdown_map, in); 711 } 712 713 static const unsigned int domain_map[] = { 714 [XFS_HEALTHMON_MOUNT] = XFS_HEALTH_MONITOR_DOMAIN_MOUNT, 715 [XFS_HEALTHMON_FS] = XFS_HEALTH_MONITOR_DOMAIN_FS, 716 [XFS_HEALTHMON_AG] = XFS_HEALTH_MONITOR_DOMAIN_AG, 717 [XFS_HEALTHMON_INODE] = XFS_HEALTH_MONITOR_DOMAIN_INODE, 718 [XFS_HEALTHMON_RTGROUP] = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP, 719 [XFS_HEALTHMON_DATADEV] = XFS_HEALTH_MONITOR_DOMAIN_DATADEV, 720 [XFS_HEALTHMON_RTDEV] = XFS_HEALTH_MONITOR_DOMAIN_RTDEV, 721 [XFS_HEALTHMON_LOGDEV] = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV, 722 [XFS_HEALTHMON_FILERANGE] = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE, 723 }; 724 725 static const unsigned int type_map[] = { 726 [XFS_HEALTHMON_RUNNING] = XFS_HEALTH_MONITOR_TYPE_RUNNING, 727 [XFS_HEALTHMON_LOST] = XFS_HEALTH_MONITOR_TYPE_LOST, 728 [XFS_HEALTHMON_SICK] = XFS_HEALTH_MONITOR_TYPE_SICK, 729 [XFS_HEALTHMON_CORRUPT] = XFS_HEALTH_MONITOR_TYPE_CORRUPT, 730 [XFS_HEALTHMON_HEALTHY] = XFS_HEALTH_MONITOR_TYPE_HEALTHY, 731 [XFS_HEALTHMON_UNMOUNT] = XFS_HEALTH_MONITOR_TYPE_UNMOUNT, 732 [XFS_HEALTHMON_SHUTDOWN] = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN, 733 [XFS_HEALTHMON_MEDIA_ERROR] = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR, 734 [XFS_HEALTHMON_BUFREAD] = XFS_HEALTH_MONITOR_TYPE_BUFREAD, 735 [XFS_HEALTHMON_BUFWRITE] = XFS_HEALTH_MONITOR_TYPE_BUFWRITE, 736 [XFS_HEALTHMON_DIOREAD] = XFS_HEALTH_MONITOR_TYPE_DIOREAD, 737 [XFS_HEALTHMON_DIOWRITE] = XFS_HEALTH_MONITOR_TYPE_DIOWRITE, 738 [XFS_HEALTHMON_DATALOST] = XFS_HEALTH_MONITOR_TYPE_DATALOST, 739 }; 740 741 /* Render event as a V0 structure */ 742 STATIC int 743 xfs_healthmon_format_v0( 744 struct xfs_healthmon *hm, 745 const struct xfs_healthmon_event *event) 746 { 747 struct xfs_health_monitor_event hme = { 748 .time_ns = event->time_ns, 749 }; 750 751 trace_xfs_healthmon_format(hm, event); 752 753 if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) || 754 event->type < 0 || event->type >= ARRAY_SIZE(type_map)) 755 return -EFSCORRUPTED; 756 757 hme.domain = domain_map[event->domain]; 758 hme.type = type_map[event->type]; 759 760 /* fill in the event-specific details */ 761 switch (event->domain) { 762 case XFS_HEALTHMON_MOUNT: 763 switch (event->type) { 764 case XFS_HEALTHMON_LOST: 765 hme.e.lost.count = event->lostcount; 766 break; 767 case XFS_HEALTHMON_SHUTDOWN: 768 hme.e.shutdown.reasons = shutdown_mask(event->flags); 769 break; 770 default: 771 break; 772 } 773 break; 774 case XFS_HEALTHMON_FS: 775 hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask); 776 break; 777 case XFS_HEALTHMON_RTGROUP: 778 hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask); 779 hme.e.group.gno = event->group; 780 break; 781 case XFS_HEALTHMON_AG: 782 hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask); 783 hme.e.group.gno = event->group; 784 break; 785 case XFS_HEALTHMON_INODE: 786 hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask); 787 hme.e.inode.ino = event->ino; 788 hme.e.inode.gen = event->gen; 789 break; 790 case XFS_HEALTHMON_DATADEV: 791 case XFS_HEALTHMON_LOGDEV: 792 case XFS_HEALTHMON_RTDEV: 793 hme.e.media.daddr = event->daddr; 794 hme.e.media.bbcount = event->bbcount; 795 break; 796 case XFS_HEALTHMON_FILERANGE: 797 hme.e.filerange.ino = event->fino; 798 hme.e.filerange.gen = event->fgen; 799 hme.e.filerange.pos = event->fpos; 800 hme.e.filerange.len = event->flen; 801 hme.e.filerange.error = abs(event->error); 802 break; 803 default: 804 break; 805 } 806 807 ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize); 808 809 /* copy formatted object to the outbuf */ 810 if (hm->bufhead + sizeof(hme) <= hm->bufsize) { 811 memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme)); 812 hm->bufhead += sizeof(hme); 813 } 814 815 return 0; 816 } 817 818 /* How many bytes are waiting in the outbuf to be copied? */ 819 static inline size_t 820 xfs_healthmon_outbuf_bytes( 821 struct xfs_healthmon *hm) 822 { 823 if (hm->bufhead > hm->buftail) 824 return hm->bufhead - hm->buftail; 825 return 0; 826 } 827 828 /* 829 * Do we have something for userspace to read? This can mean unmount events, 830 * events pending in the queue, or pending bytes in the outbuf. 831 */ 832 static inline bool 833 xfs_healthmon_has_eventdata( 834 struct xfs_healthmon *hm) 835 { 836 /* 837 * If the health monitor is already detached from the xfs_mount, we 838 * want reads to return 0 bytes even if there are no events, because 839 * userspace interprets that as EOF. If we race with deactivation, 840 * read_iter will take the necessary locks to discover that there are 841 * no events to send. 842 */ 843 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) 844 return true; 845 846 /* 847 * Either there are events waiting to be formatted into the buffer, or 848 * there's unread bytes in the buffer. 849 */ 850 return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0; 851 } 852 853 /* Try to copy the rest of the outbuf to the iov iter. */ 854 STATIC ssize_t 855 xfs_healthmon_copybuf( 856 struct xfs_healthmon *hm, 857 struct iov_iter *to) 858 { 859 size_t to_copy; 860 size_t w = 0; 861 862 trace_xfs_healthmon_copybuf(hm, to); 863 864 to_copy = xfs_healthmon_outbuf_bytes(hm); 865 if (to_copy) { 866 w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to); 867 if (!w) 868 return -EFAULT; 869 870 hm->buftail += w; 871 } 872 873 /* 874 * Nothing left to copy? Reset the output buffer cursors to the start 875 * since there's no live data in the buffer. 876 */ 877 if (xfs_healthmon_outbuf_bytes(hm) == 0) 878 xfs_healthmon_reset_outbuf(hm); 879 return w; 880 } 881 882 /* 883 * Return a health monitoring event for formatting into the output buffer if 884 * there's enough space in the outbuf and an event waiting for us. Caller 885 * must hold i_rwsem on the healthmon file. 886 */ 887 static inline struct xfs_healthmon_event * 888 xfs_healthmon_format_pop( 889 struct xfs_healthmon *hm) 890 { 891 struct xfs_healthmon_event *event; 892 893 if (hm->bufhead + sizeof(*event) > hm->bufsize) 894 return NULL; 895 896 mutex_lock(&hm->lock); 897 event = hm->first_event; 898 if (event) { 899 if (hm->last_event == event) 900 hm->last_event = NULL; 901 hm->first_event = event->next; 902 hm->events--; 903 904 trace_xfs_healthmon_pop(hm, event); 905 } 906 mutex_unlock(&hm->lock); 907 return event; 908 } 909 910 /* Allocate formatting buffer */ 911 STATIC int 912 xfs_healthmon_alloc_outbuf( 913 struct xfs_healthmon *hm, 914 size_t user_bufsize) 915 { 916 void *outbuf; 917 size_t bufsize = 918 min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize)); 919 920 outbuf = kzalloc(bufsize, GFP_KERNEL); 921 if (!outbuf) { 922 if (bufsize == PAGE_SIZE) 923 return -ENOMEM; 924 925 bufsize = PAGE_SIZE; 926 outbuf = kzalloc(bufsize, GFP_KERNEL); 927 if (!outbuf) 928 return -ENOMEM; 929 } 930 931 hm->buffer = outbuf; 932 hm->bufsize = bufsize; 933 hm->bufhead = 0; 934 hm->buftail = 0; 935 936 return 0; 937 } 938 939 /* 940 * Convey queued event data to userspace. First copy any remaining bytes in 941 * the outbuf, then format the oldest event into the outbuf and copy that too. 942 */ 943 STATIC ssize_t 944 xfs_healthmon_read_iter( 945 struct kiocb *iocb, 946 struct iov_iter *to) 947 { 948 struct file *file = iocb->ki_filp; 949 struct inode *inode = file_inode(file); 950 struct xfs_healthmon *hm = file->private_data; 951 struct xfs_healthmon_event *event; 952 size_t copied = 0; 953 ssize_t ret = 0; 954 955 if (file->f_flags & O_NONBLOCK) { 956 if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode)) 957 return -EAGAIN; 958 } else { 959 ret = wait_event_interruptible(hm->wait, 960 xfs_healthmon_has_eventdata(hm)); 961 if (ret) 962 return ret; 963 964 inode_lock(inode); 965 } 966 967 if (hm->bufsize == 0) { 968 ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to)); 969 if (ret) 970 goto out_unlock; 971 } 972 973 trace_xfs_healthmon_read_start(hm); 974 975 /* 976 * If there's anything left in the output buffer, copy that before 977 * formatting more events. 978 */ 979 ret = xfs_healthmon_copybuf(hm, to); 980 if (ret < 0) 981 goto out_unlock; 982 copied += ret; 983 984 while (iov_iter_count(to) > 0) { 985 /* Format the next events into the outbuf until it's full. */ 986 while ((event = xfs_healthmon_format_pop(hm)) != NULL) { 987 ret = xfs_healthmon_format_v0(hm, event); 988 kfree(event); 989 if (ret) 990 goto out_unlock; 991 } 992 993 /* Copy anything formatted into outbuf to userspace */ 994 ret = xfs_healthmon_copybuf(hm, to); 995 if (ret <= 0) 996 break; 997 998 copied += ret; 999 } 1000 1001 out_unlock: 1002 trace_xfs_healthmon_read_finish(hm); 1003 inode_unlock(inode); 1004 return copied ?: ret; 1005 } 1006 1007 /* Poll for available events. */ 1008 STATIC __poll_t 1009 xfs_healthmon_poll( 1010 struct file *file, 1011 struct poll_table_struct *wait) 1012 { 1013 struct xfs_healthmon *hm = file->private_data; 1014 __poll_t mask = 0; 1015 1016 poll_wait(file, &hm->wait, wait); 1017 1018 if (xfs_healthmon_has_eventdata(hm)) 1019 mask |= EPOLLIN; 1020 return mask; 1021 } 1022 1023 /* Free the health monitoring information. */ 1024 STATIC int 1025 xfs_healthmon_release( 1026 struct inode *inode, 1027 struct file *file) 1028 { 1029 struct xfs_healthmon *hm = file->private_data; 1030 1031 trace_xfs_healthmon_release(hm); 1032 1033 /* 1034 * We might be closing the healthmon file before the filesystem 1035 * unmounts, because userspace processes can terminate at any time and 1036 * for any reason. Null out xfs_mount::m_healthmon so that another 1037 * process can create another health monitor file. 1038 */ 1039 xfs_healthmon_detach(hm); 1040 xfs_healthmon_put(hm); 1041 return 0; 1042 } 1043 1044 /* Validate ioctl parameters. */ 1045 static inline bool 1046 xfs_healthmon_validate( 1047 const struct xfs_health_monitor *hmo) 1048 { 1049 if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL) 1050 return false; 1051 if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0) 1052 return false; 1053 if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad))) 1054 return false; 1055 return true; 1056 } 1057 1058 /* Emit some data about the health monitoring fd. */ 1059 static void 1060 xfs_healthmon_show_fdinfo( 1061 struct seq_file *m, 1062 struct file *file) 1063 { 1064 struct xfs_healthmon *hm = file->private_data; 1065 1066 mutex_lock(&hm->lock); 1067 seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n", 1068 hm->mount_cookie == DETACHED_MOUNT_COOKIE ? 1069 "dead" : "alive", 1070 MAJOR(hm->dev), MINOR(hm->dev), 1071 hm->total_events, 1072 hm->total_lost); 1073 mutex_unlock(&hm->lock); 1074 } 1075 1076 /* Reconfigure the health monitor. */ 1077 STATIC long 1078 xfs_healthmon_reconfigure( 1079 struct file *file, 1080 unsigned int cmd, 1081 void __user *arg) 1082 { 1083 struct xfs_health_monitor hmo; 1084 struct xfs_healthmon *hm = file->private_data; 1085 1086 if (copy_from_user(&hmo, arg, sizeof(hmo))) 1087 return -EFAULT; 1088 1089 if (!xfs_healthmon_validate(&hmo)) 1090 return -EINVAL; 1091 1092 mutex_lock(&hm->lock); 1093 hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE); 1094 mutex_unlock(&hm->lock); 1095 1096 return 0; 1097 } 1098 1099 /* Does the fd point to the same filesystem as the one we're monitoring? */ 1100 STATIC long 1101 xfs_healthmon_file_on_monitored_fs( 1102 struct file *file, 1103 unsigned int cmd, 1104 void __user *arg) 1105 { 1106 struct xfs_health_file_on_monitored_fs hms; 1107 struct xfs_healthmon *hm = file->private_data; 1108 struct inode *hms_inode; 1109 1110 if (copy_from_user(&hms, arg, sizeof(hms))) 1111 return -EFAULT; 1112 1113 if (hms.flags) 1114 return -EINVAL; 1115 1116 CLASS(fd, hms_fd)(hms.fd); 1117 if (fd_empty(hms_fd)) 1118 return -EBADF; 1119 1120 hms_inode = file_inode(fd_file(hms_fd)); 1121 mutex_lock(&hm->lock); 1122 if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) { 1123 mutex_unlock(&hm->lock); 1124 return -ESTALE; 1125 } 1126 1127 mutex_unlock(&hm->lock); 1128 return 0; 1129 } 1130 1131 /* Handle ioctls for the health monitoring thread. */ 1132 STATIC long 1133 xfs_healthmon_ioctl( 1134 struct file *file, 1135 unsigned int cmd, 1136 unsigned long p) 1137 { 1138 void __user *arg = (void __user *)p; 1139 1140 switch (cmd) { 1141 case XFS_IOC_HEALTH_MONITOR: 1142 return xfs_healthmon_reconfigure(file, cmd, arg); 1143 case XFS_IOC_HEALTH_FD_ON_MONITORED_FS: 1144 return xfs_healthmon_file_on_monitored_fs(file, cmd, arg); 1145 default: 1146 break; 1147 } 1148 1149 return -ENOTTY; 1150 } 1151 1152 static const struct file_operations xfs_healthmon_fops = { 1153 .owner = THIS_MODULE, 1154 .show_fdinfo = xfs_healthmon_show_fdinfo, 1155 .read_iter = xfs_healthmon_read_iter, 1156 .poll = xfs_healthmon_poll, 1157 .release = xfs_healthmon_release, 1158 .unlocked_ioctl = xfs_healthmon_ioctl, 1159 }; 1160 1161 /* 1162 * Create a health monitoring file. Returns an index to the fd table or a 1163 * negative errno. 1164 */ 1165 long 1166 xfs_ioc_health_monitor( 1167 struct file *file, 1168 struct xfs_health_monitor __user *arg) 1169 { 1170 struct xfs_health_monitor hmo; 1171 struct xfs_healthmon_event *running_event; 1172 struct xfs_healthmon *hm; 1173 struct xfs_inode *ip = XFS_I(file_inode(file)); 1174 struct xfs_mount *mp = ip->i_mount; 1175 int ret; 1176 1177 /* 1178 * The only intended user of the health monitoring system should be the 1179 * xfs_healer daemon running on behalf of the whole filesystem in the 1180 * initial user namespace. IOWs, we don't allow unprivileged userspace 1181 * (they can use fsnotify) nor do we allow containers. 1182 */ 1183 if (!capable(CAP_SYS_ADMIN)) 1184 return -EPERM; 1185 if (ip->i_ino != mp->m_sb.sb_rootino) 1186 return -EPERM; 1187 if (current_user_ns() != &init_user_ns) 1188 return -EPERM; 1189 1190 if (copy_from_user(&hmo, arg, sizeof(hmo))) 1191 return -EFAULT; 1192 1193 if (!xfs_healthmon_validate(&hmo)) 1194 return -EINVAL; 1195 1196 hm = kzalloc_obj(*hm); 1197 if (!hm) 1198 return -ENOMEM; 1199 hm->dev = mp->m_super->s_dev; 1200 refcount_set(&hm->ref, 1); 1201 1202 mutex_init(&hm->lock); 1203 init_waitqueue_head(&hm->wait); 1204 1205 if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE) 1206 hm->verbose = true; 1207 1208 /* Queue up the first event that lets the client know we're running. */ 1209 running_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS); 1210 if (!running_event) { 1211 ret = -ENOMEM; 1212 goto out_hm; 1213 } 1214 running_event->type = XFS_HEALTHMON_RUNNING; 1215 running_event->domain = XFS_HEALTHMON_MOUNT; 1216 __xfs_healthmon_insert(hm, running_event); 1217 1218 /* 1219 * Preallocate the unmount event so that we can't fail to notify the 1220 * filesystem later. This is key for triggering fast exit of the 1221 * xfs_healer daemon. 1222 */ 1223 hm->unmount_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS); 1224 if (!hm->unmount_event) { 1225 ret = -ENOMEM; 1226 goto out_hm; 1227 } 1228 hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT; 1229 hm->unmount_event->domain = XFS_HEALTHMON_MOUNT; 1230 1231 /* 1232 * Try to attach this health monitor to the xfs_mount. The monitor is 1233 * considered live and will receive events if this succeeds. 1234 */ 1235 ret = xfs_healthmon_attach(mp, hm); 1236 if (ret) 1237 goto out_hm; 1238 1239 /* 1240 * Create the anonymous file and install a fd for it. If it succeeds, 1241 * the file owns hm and can go away at any time, so we must not access 1242 * it again. This must go last because we can't undo a fd table 1243 * installation. 1244 */ 1245 ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm, 1246 O_CLOEXEC | O_RDONLY); 1247 if (ret < 0) 1248 goto out_mp; 1249 1250 trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format); 1251 1252 return ret; 1253 1254 out_mp: 1255 xfs_healthmon_detach(hm); 1256 out_hm: 1257 ASSERT(refcount_read(&hm->ref) == 1); 1258 xfs_healthmon_put(hm); 1259 return ret; 1260 } 1261