1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (c) 2024-2026 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trace.h" 15 #include "xfs_ag.h" 16 #include "xfs_btree.h" 17 #include "xfs_da_format.h" 18 #include "xfs_da_btree.h" 19 #include "xfs_quota_defs.h" 20 #include "xfs_rtgroup.h" 21 #include "xfs_health.h" 22 #include "xfs_healthmon.h" 23 #include "xfs_fsops.h" 24 #include "xfs_notify_failure.h" 25 #include "xfs_file.h" 26 #include "xfs_ioctl.h" 27 28 #include <linux/anon_inodes.h> 29 #include <linux/eventpoll.h> 30 #include <linux/poll.h> 31 #include <linux/fserror.h> 32 33 /* 34 * Live Health Monitoring 35 * ====================== 36 * 37 * Autonomous self-healing of XFS filesystems requires a means for the kernel 38 * to send filesystem health events to a monitoring daemon in userspace. To 39 * accomplish this, we establish a thread_with_file kthread object to handle 40 * translating internal events about filesystem health into a format that can 41 * be parsed easily by userspace. When those internal events occur, the core 42 * filesystem code calls this health monitor to convey the events to userspace. 43 * Userspace reads events from the file descriptor returned by the ioctl. 44 * 45 * The healthmon abstraction has a weak reference to the host filesystem mount 46 * so that the queueing and processing of the events do not pin the mount and 47 * cannot slow down the main filesystem. The healthmon object can exist past 48 * the end of the filesystem mount. 49 */ 50 51 /* sign of a detached health monitor */ 52 #define DETACHED_MOUNT_COOKIE ((uintptr_t)0) 53 54 /* Constrain the number of event objects that can build up in memory. */ 55 #define XFS_HEALTHMON_MAX_EVENTS (SZ_32K / \ 56 sizeof(struct xfs_healthmon_event)) 57 58 /* Constrain the size of the output buffer for read_iter. */ 59 #define XFS_HEALTHMON_MAX_OUTBUF SZ_64K 60 61 /* spinlock for atomically updating xfs_mount <-> xfs_healthmon pointers */ 62 static DEFINE_SPINLOCK(xfs_healthmon_lock); 63 64 /* Grab a reference to the healthmon object for a given mount, if any. */ 65 static struct xfs_healthmon * 66 xfs_healthmon_get( 67 struct xfs_mount *mp) 68 { 69 struct xfs_healthmon *hm; 70 71 rcu_read_lock(); 72 hm = mp->m_healthmon; 73 if (hm && !refcount_inc_not_zero(&hm->ref)) 74 hm = NULL; 75 rcu_read_unlock(); 76 77 return hm; 78 } 79 80 /* 81 * Release the reference to a healthmon object. If there are no more holders, 82 * free the health monitor after an RCU grace period to eliminate possibility 83 * of races with xfs_healthmon_get. 84 */ 85 static void 86 xfs_healthmon_put( 87 struct xfs_healthmon *hm) 88 { 89 if (refcount_dec_and_test(&hm->ref)) { 90 struct xfs_healthmon_event *event; 91 struct xfs_healthmon_event *next = hm->first_event; 92 93 while ((event = next) != NULL) { 94 trace_xfs_healthmon_drop(hm, event); 95 next = event->next; 96 kfree(event); 97 } 98 99 kfree(hm->unmount_event); 100 kfree(hm->buffer); 101 mutex_destroy(&hm->lock); 102 kfree_rcu_mightsleep(hm); 103 } 104 } 105 106 /* Attach a health monitor to an xfs_mount. Only one allowed at a time. */ 107 STATIC int 108 xfs_healthmon_attach( 109 struct xfs_mount *mp, 110 struct xfs_healthmon *hm) 111 { 112 spin_lock(&xfs_healthmon_lock); 113 if (mp->m_healthmon != NULL) { 114 spin_unlock(&xfs_healthmon_lock); 115 return -EEXIST; 116 } 117 118 refcount_inc(&hm->ref); 119 mp->m_healthmon = hm; 120 hm->mount_cookie = (uintptr_t)mp->m_super; 121 spin_unlock(&xfs_healthmon_lock); 122 123 return 0; 124 } 125 126 /* Detach a xfs mount from a specific healthmon instance. */ 127 STATIC void 128 xfs_healthmon_detach( 129 struct xfs_healthmon *hm) 130 { 131 spin_lock(&xfs_healthmon_lock); 132 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) { 133 spin_unlock(&xfs_healthmon_lock); 134 return; 135 } 136 137 XFS_M((struct super_block *)hm->mount_cookie)->m_healthmon = NULL; 138 hm->mount_cookie = DETACHED_MOUNT_COOKIE; 139 spin_unlock(&xfs_healthmon_lock); 140 141 trace_xfs_healthmon_detach(hm); 142 xfs_healthmon_put(hm); 143 } 144 145 static inline void xfs_healthmon_bump_events(struct xfs_healthmon *hm) 146 { 147 hm->events++; 148 hm->total_events++; 149 } 150 151 static inline void xfs_healthmon_bump_lost(struct xfs_healthmon *hm) 152 { 153 hm->lost_prev_event++; 154 hm->total_lost++; 155 } 156 157 /* 158 * If possible, merge a new event into an existing event. Returns whether or 159 * not it merged anything. 160 */ 161 static bool 162 xfs_healthmon_merge_events( 163 struct xfs_healthmon_event *existing, 164 const struct xfs_healthmon_event *new) 165 { 166 if (!existing) 167 return false; 168 169 /* type and domain must match to merge events */ 170 if (existing->type != new->type || 171 existing->domain != new->domain) 172 return false; 173 174 switch (existing->type) { 175 case XFS_HEALTHMON_RUNNING: 176 case XFS_HEALTHMON_UNMOUNT: 177 /* should only ever be one of these events anyway */ 178 return false; 179 180 case XFS_HEALTHMON_LOST: 181 existing->lostcount += new->lostcount; 182 return true; 183 184 case XFS_HEALTHMON_SICK: 185 case XFS_HEALTHMON_CORRUPT: 186 case XFS_HEALTHMON_HEALTHY: 187 switch (existing->domain) { 188 case XFS_HEALTHMON_FS: 189 existing->fsmask |= new->fsmask; 190 return true; 191 case XFS_HEALTHMON_AG: 192 case XFS_HEALTHMON_RTGROUP: 193 if (existing->group == new->group){ 194 existing->grpmask |= new->grpmask; 195 return true; 196 } 197 return false; 198 case XFS_HEALTHMON_INODE: 199 if (existing->ino == new->ino && 200 existing->gen == new->gen) { 201 existing->imask |= new->imask; 202 return true; 203 } 204 return false; 205 default: 206 ASSERT(0); 207 return false; 208 } 209 return false; 210 211 case XFS_HEALTHMON_SHUTDOWN: 212 /* yes, we can race to shutdown */ 213 existing->flags |= new->flags; 214 return true; 215 216 case XFS_HEALTHMON_MEDIA_ERROR: 217 /* physically adjacent errors can merge */ 218 if (existing->daddr + existing->bbcount == new->daddr) { 219 existing->bbcount += new->bbcount; 220 return true; 221 } 222 if (new->daddr + new->bbcount == existing->daddr) { 223 existing->daddr = new->daddr; 224 existing->bbcount += new->bbcount; 225 return true; 226 } 227 return false; 228 229 case XFS_HEALTHMON_BUFREAD: 230 case XFS_HEALTHMON_BUFWRITE: 231 case XFS_HEALTHMON_DIOREAD: 232 case XFS_HEALTHMON_DIOWRITE: 233 case XFS_HEALTHMON_DATALOST: 234 /* logically adjacent file ranges can merge */ 235 if (existing->fino != new->fino || existing->fgen != new->fgen) 236 return false; 237 238 if (existing->fpos + existing->flen == new->fpos) { 239 existing->flen += new->flen; 240 return true; 241 } 242 243 if (new->fpos + new->flen == existing->fpos) { 244 existing->fpos = new->fpos; 245 existing->flen += new->flen; 246 return true; 247 } 248 return false; 249 } 250 251 return false; 252 } 253 254 /* Insert an event onto the start of the queue. */ 255 static inline void 256 __xfs_healthmon_insert( 257 struct xfs_healthmon *hm, 258 struct xfs_healthmon_event *event) 259 { 260 struct timespec64 now; 261 262 ktime_get_coarse_real_ts64(&now); 263 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; 264 265 event->next = hm->first_event; 266 if (!hm->first_event) 267 hm->first_event = event; 268 if (!hm->last_event) 269 hm->last_event = event; 270 xfs_healthmon_bump_events(hm); 271 wake_up(&hm->wait); 272 273 trace_xfs_healthmon_insert(hm, event); 274 } 275 276 /* Push an event onto the end of the queue. */ 277 static inline void 278 __xfs_healthmon_push( 279 struct xfs_healthmon *hm, 280 struct xfs_healthmon_event *event) 281 { 282 struct timespec64 now; 283 284 ktime_get_coarse_real_ts64(&now); 285 event->time_ns = (now.tv_sec * NSEC_PER_SEC) + now.tv_nsec; 286 287 if (!hm->first_event) 288 hm->first_event = event; 289 if (hm->last_event) 290 hm->last_event->next = event; 291 hm->last_event = event; 292 event->next = NULL; 293 xfs_healthmon_bump_events(hm); 294 wake_up(&hm->wait); 295 296 trace_xfs_healthmon_push(hm, event); 297 } 298 299 /* Deal with any previously lost events */ 300 static int 301 xfs_healthmon_clear_lost_prev( 302 struct xfs_healthmon *hm) 303 { 304 struct xfs_healthmon_event lost_event = { 305 .type = XFS_HEALTHMON_LOST, 306 .domain = XFS_HEALTHMON_MOUNT, 307 .lostcount = hm->lost_prev_event, 308 }; 309 struct xfs_healthmon_event *event = NULL; 310 311 if (xfs_healthmon_merge_events(hm->last_event, &lost_event)) { 312 trace_xfs_healthmon_merge(hm, hm->last_event); 313 wake_up(&hm->wait); 314 goto cleared; 315 } 316 317 if (hm->events < XFS_HEALTHMON_MAX_EVENTS) 318 event = kmemdup(&lost_event, sizeof(struct xfs_healthmon_event), 319 GFP_NOFS); 320 if (!event) 321 return -ENOMEM; 322 323 __xfs_healthmon_push(hm, event); 324 cleared: 325 hm->lost_prev_event = 0; 326 return 0; 327 } 328 329 /* 330 * Push an event onto the end of the list after dealing with lost events and 331 * possibly full queues. 332 */ 333 STATIC int 334 xfs_healthmon_push( 335 struct xfs_healthmon *hm, 336 const struct xfs_healthmon_event *template) 337 { 338 struct xfs_healthmon_event *event = NULL; 339 int error = 0; 340 341 /* 342 * Locklessly check if the health monitor has already detached from the 343 * mount. If so, ignore the event. If we race with deactivation, 344 * we'll queue the event but never send it. 345 */ 346 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) 347 return -ESHUTDOWN; 348 349 mutex_lock(&hm->lock); 350 351 /* Report previously lost events before we do anything else */ 352 if (hm->lost_prev_event) { 353 error = xfs_healthmon_clear_lost_prev(hm); 354 if (error) 355 goto out_unlock; 356 } 357 358 /* Try to merge with the newest event */ 359 if (xfs_healthmon_merge_events(hm->last_event, template)) { 360 trace_xfs_healthmon_merge(hm, hm->last_event); 361 wake_up(&hm->wait); 362 goto out_unlock; 363 } 364 365 /* Only create a heap event object if we're not already at capacity. */ 366 if (hm->events < XFS_HEALTHMON_MAX_EVENTS) 367 event = kmemdup(template, sizeof(struct xfs_healthmon_event), 368 GFP_NOFS); 369 if (!event) { 370 /* No memory means we lose the event */ 371 trace_xfs_healthmon_lost_event(hm); 372 xfs_healthmon_bump_lost(hm); 373 error = -ENOMEM; 374 goto out_unlock; 375 } 376 377 __xfs_healthmon_push(hm, event); 378 379 out_unlock: 380 mutex_unlock(&hm->lock); 381 return error; 382 } 383 384 /* 385 * Report that the filesystem is being unmounted, then detach the xfs mount 386 * from this healthmon instance. 387 */ 388 void 389 xfs_healthmon_unmount( 390 struct xfs_mount *mp) 391 { 392 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 393 394 if (!hm) 395 return; 396 397 trace_xfs_healthmon_report_unmount(hm); 398 399 /* 400 * Insert the unmount notification at the start of the event queue so 401 * that userspace knows the filesystem went away as soon as possible. 402 * There's nothing actionable for userspace after an unmount. Once 403 * we've inserted the unmount event, hm no longer owns that event. 404 */ 405 __xfs_healthmon_insert(hm, hm->unmount_event); 406 hm->unmount_event = NULL; 407 408 xfs_healthmon_detach(hm); 409 xfs_healthmon_put(hm); 410 } 411 412 /* Compute the reporting mask for non-unmount metadata health events. */ 413 static inline unsigned int 414 metadata_event_mask( 415 struct xfs_healthmon *hm, 416 enum xfs_healthmon_type type, 417 unsigned int old_mask, 418 unsigned int new_mask) 419 { 420 /* If we want all events, return all events. */ 421 if (hm->verbose) 422 return new_mask; 423 424 switch (type) { 425 case XFS_HEALTHMON_SICK: 426 /* Always report runtime corruptions */ 427 return new_mask; 428 case XFS_HEALTHMON_CORRUPT: 429 /* Only report new fsck errors */ 430 return new_mask & ~old_mask; 431 case XFS_HEALTHMON_HEALTHY: 432 /* Only report healthy metadata that got fixed */ 433 return new_mask & old_mask; 434 default: 435 ASSERT(0); 436 break; 437 } 438 439 return 0; 440 } 441 442 /* Report XFS_FS_SICK_* events to healthmon */ 443 void 444 xfs_healthmon_report_fs( 445 struct xfs_mount *mp, 446 enum xfs_healthmon_type type, 447 unsigned int old_mask, 448 unsigned int new_mask) 449 { 450 struct xfs_healthmon_event event = { 451 .type = type, 452 .domain = XFS_HEALTHMON_FS, 453 }; 454 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 455 456 if (!hm) 457 return; 458 459 event.fsmask = metadata_event_mask(hm, type, old_mask, new_mask) & 460 ~XFS_SICK_FS_SECONDARY; 461 trace_xfs_healthmon_report_fs(hm, old_mask, new_mask, &event); 462 463 if (event.fsmask) 464 xfs_healthmon_push(hm, &event); 465 466 xfs_healthmon_put(hm); 467 } 468 469 /* Report XFS_SICK_(AG|RG)* flags to healthmon */ 470 void 471 xfs_healthmon_report_group( 472 struct xfs_group *xg, 473 enum xfs_healthmon_type type, 474 unsigned int old_mask, 475 unsigned int new_mask) 476 { 477 struct xfs_healthmon_event event = { 478 .type = type, 479 .group = xg->xg_gno, 480 }; 481 struct xfs_healthmon *hm = xfs_healthmon_get(xg->xg_mount); 482 483 if (!hm) 484 return; 485 486 switch (xg->xg_type) { 487 case XG_TYPE_RTG: 488 event.domain = XFS_HEALTHMON_RTGROUP; 489 event.grpmask = metadata_event_mask(hm, type, old_mask, 490 new_mask) & 491 ~XFS_SICK_RG_SECONDARY; 492 break; 493 case XG_TYPE_AG: 494 event.domain = XFS_HEALTHMON_AG; 495 event.grpmask = metadata_event_mask(hm, type, old_mask, 496 new_mask) & 497 ~XFS_SICK_AG_SECONDARY; 498 break; 499 default: 500 ASSERT(0); 501 break; 502 } 503 504 trace_xfs_healthmon_report_group(hm, old_mask, new_mask, &event); 505 506 if (event.grpmask) 507 xfs_healthmon_push(hm, &event); 508 509 xfs_healthmon_put(hm); 510 } 511 512 /* Report XFS_SICK_INO_* flags to healthmon */ 513 void 514 xfs_healthmon_report_inode( 515 struct xfs_inode *ip, 516 enum xfs_healthmon_type type, 517 unsigned int old_mask, 518 unsigned int new_mask) 519 { 520 struct xfs_healthmon_event event = { 521 .type = type, 522 .domain = XFS_HEALTHMON_INODE, 523 .ino = ip->i_ino, 524 .gen = VFS_I(ip)->i_generation, 525 }; 526 struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); 527 528 if (!hm) 529 return; 530 531 event.imask = metadata_event_mask(hm, type, old_mask, new_mask) & 532 ~XFS_SICK_INO_SECONDARY; 533 trace_xfs_healthmon_report_inode(hm, old_mask, event.imask, &event); 534 535 if (event.imask) 536 xfs_healthmon_push(hm, &event); 537 538 xfs_healthmon_put(hm); 539 } 540 541 /* Add a shutdown event to the reporting queue. */ 542 void 543 xfs_healthmon_report_shutdown( 544 struct xfs_mount *mp, 545 uint32_t flags) 546 { 547 struct xfs_healthmon_event event = { 548 .type = XFS_HEALTHMON_SHUTDOWN, 549 .domain = XFS_HEALTHMON_MOUNT, 550 .flags = flags, 551 }; 552 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 553 554 if (!hm) 555 return; 556 557 trace_xfs_healthmon_report_shutdown(hm, flags); 558 559 xfs_healthmon_push(hm, &event); 560 xfs_healthmon_put(hm); 561 } 562 563 static inline enum xfs_healthmon_domain 564 media_error_domain( 565 enum xfs_device fdev) 566 { 567 switch (fdev) { 568 case XFS_DEV_DATA: 569 return XFS_HEALTHMON_DATADEV; 570 case XFS_DEV_LOG: 571 return XFS_HEALTHMON_LOGDEV; 572 case XFS_DEV_RT: 573 return XFS_HEALTHMON_RTDEV; 574 } 575 576 ASSERT(0); 577 return 0; 578 } 579 580 /* Add a media error event to the reporting queue. */ 581 void 582 xfs_healthmon_report_media( 583 struct xfs_mount *mp, 584 enum xfs_device fdev, 585 xfs_daddr_t daddr, 586 uint64_t bbcount) 587 { 588 struct xfs_healthmon_event event = { 589 .type = XFS_HEALTHMON_MEDIA_ERROR, 590 .domain = media_error_domain(fdev), 591 .daddr = daddr, 592 .bbcount = bbcount, 593 }; 594 struct xfs_healthmon *hm = xfs_healthmon_get(mp); 595 596 if (!hm) 597 return; 598 599 trace_xfs_healthmon_report_media(hm, fdev, &event); 600 601 xfs_healthmon_push(hm, &event); 602 xfs_healthmon_put(hm); 603 } 604 605 static inline enum xfs_healthmon_type file_ioerr_type(enum fserror_type action) 606 { 607 switch (action) { 608 case FSERR_BUFFERED_READ: 609 return XFS_HEALTHMON_BUFREAD; 610 case FSERR_BUFFERED_WRITE: 611 return XFS_HEALTHMON_BUFWRITE; 612 case FSERR_DIRECTIO_READ: 613 return XFS_HEALTHMON_DIOREAD; 614 case FSERR_DIRECTIO_WRITE: 615 return XFS_HEALTHMON_DIOWRITE; 616 case FSERR_DATA_LOST: 617 return XFS_HEALTHMON_DATALOST; 618 case FSERR_METADATA: 619 /* filtered out by xfs_fs_report_error */ 620 break; 621 } 622 623 ASSERT(0); 624 return -1; 625 } 626 627 /* Add a file io error event to the reporting queue. */ 628 void 629 xfs_healthmon_report_file_ioerror( 630 struct xfs_inode *ip, 631 const struct fserror_event *p) 632 { 633 struct xfs_healthmon_event event = { 634 .type = file_ioerr_type(p->type), 635 .domain = XFS_HEALTHMON_FILERANGE, 636 .fino = ip->i_ino, 637 .fgen = VFS_I(ip)->i_generation, 638 .fpos = p->pos, 639 .flen = p->len, 640 /* send positive error number to userspace */ 641 .error = -p->error, 642 }; 643 struct xfs_healthmon *hm = xfs_healthmon_get(ip->i_mount); 644 645 if (!hm) 646 return; 647 648 trace_xfs_healthmon_report_file_ioerror(hm, p); 649 650 xfs_healthmon_push(hm, &event); 651 xfs_healthmon_put(hm); 652 } 653 654 static inline void 655 xfs_healthmon_reset_outbuf( 656 struct xfs_healthmon *hm) 657 { 658 hm->buftail = 0; 659 hm->bufhead = 0; 660 } 661 662 struct flags_map { 663 unsigned int in_mask; 664 unsigned int out_mask; 665 }; 666 667 static const struct flags_map shutdown_map[] = { 668 { SHUTDOWN_META_IO_ERROR, XFS_HEALTH_SHUTDOWN_META_IO_ERROR }, 669 { SHUTDOWN_LOG_IO_ERROR, XFS_HEALTH_SHUTDOWN_LOG_IO_ERROR }, 670 { SHUTDOWN_FORCE_UMOUNT, XFS_HEALTH_SHUTDOWN_FORCE_UMOUNT }, 671 { SHUTDOWN_CORRUPT_INCORE, XFS_HEALTH_SHUTDOWN_CORRUPT_INCORE }, 672 { SHUTDOWN_CORRUPT_ONDISK, XFS_HEALTH_SHUTDOWN_CORRUPT_ONDISK }, 673 { SHUTDOWN_DEVICE_REMOVED, XFS_HEALTH_SHUTDOWN_DEVICE_REMOVED }, 674 }; 675 676 static inline unsigned int 677 __map_flags( 678 const struct flags_map *map, 679 size_t array_len, 680 unsigned int flags) 681 { 682 const struct flags_map *m; 683 unsigned int ret = 0; 684 685 for (m = map; m < map + array_len; m++) { 686 if (flags & m->in_mask) 687 ret |= m->out_mask; 688 } 689 690 return ret; 691 } 692 693 #define map_flags(map, flags) __map_flags((map), ARRAY_SIZE(map), (flags)) 694 695 static inline unsigned int shutdown_mask(unsigned int in) 696 { 697 return map_flags(shutdown_map, in); 698 } 699 700 static const unsigned int domain_map[] = { 701 [XFS_HEALTHMON_MOUNT] = XFS_HEALTH_MONITOR_DOMAIN_MOUNT, 702 [XFS_HEALTHMON_FS] = XFS_HEALTH_MONITOR_DOMAIN_FS, 703 [XFS_HEALTHMON_AG] = XFS_HEALTH_MONITOR_DOMAIN_AG, 704 [XFS_HEALTHMON_INODE] = XFS_HEALTH_MONITOR_DOMAIN_INODE, 705 [XFS_HEALTHMON_RTGROUP] = XFS_HEALTH_MONITOR_DOMAIN_RTGROUP, 706 [XFS_HEALTHMON_DATADEV] = XFS_HEALTH_MONITOR_DOMAIN_DATADEV, 707 [XFS_HEALTHMON_RTDEV] = XFS_HEALTH_MONITOR_DOMAIN_RTDEV, 708 [XFS_HEALTHMON_LOGDEV] = XFS_HEALTH_MONITOR_DOMAIN_LOGDEV, 709 [XFS_HEALTHMON_FILERANGE] = XFS_HEALTH_MONITOR_DOMAIN_FILERANGE, 710 }; 711 712 static const unsigned int type_map[] = { 713 [XFS_HEALTHMON_RUNNING] = XFS_HEALTH_MONITOR_TYPE_RUNNING, 714 [XFS_HEALTHMON_LOST] = XFS_HEALTH_MONITOR_TYPE_LOST, 715 [XFS_HEALTHMON_SICK] = XFS_HEALTH_MONITOR_TYPE_SICK, 716 [XFS_HEALTHMON_CORRUPT] = XFS_HEALTH_MONITOR_TYPE_CORRUPT, 717 [XFS_HEALTHMON_HEALTHY] = XFS_HEALTH_MONITOR_TYPE_HEALTHY, 718 [XFS_HEALTHMON_UNMOUNT] = XFS_HEALTH_MONITOR_TYPE_UNMOUNT, 719 [XFS_HEALTHMON_SHUTDOWN] = XFS_HEALTH_MONITOR_TYPE_SHUTDOWN, 720 [XFS_HEALTHMON_MEDIA_ERROR] = XFS_HEALTH_MONITOR_TYPE_MEDIA_ERROR, 721 [XFS_HEALTHMON_BUFREAD] = XFS_HEALTH_MONITOR_TYPE_BUFREAD, 722 [XFS_HEALTHMON_BUFWRITE] = XFS_HEALTH_MONITOR_TYPE_BUFWRITE, 723 [XFS_HEALTHMON_DIOREAD] = XFS_HEALTH_MONITOR_TYPE_DIOREAD, 724 [XFS_HEALTHMON_DIOWRITE] = XFS_HEALTH_MONITOR_TYPE_DIOWRITE, 725 [XFS_HEALTHMON_DATALOST] = XFS_HEALTH_MONITOR_TYPE_DATALOST, 726 }; 727 728 /* Render event as a V0 structure */ 729 STATIC int 730 xfs_healthmon_format_v0( 731 struct xfs_healthmon *hm, 732 const struct xfs_healthmon_event *event) 733 { 734 struct xfs_health_monitor_event hme = { 735 .time_ns = event->time_ns, 736 }; 737 738 trace_xfs_healthmon_format(hm, event); 739 740 if (event->domain < 0 || event->domain >= ARRAY_SIZE(domain_map) || 741 event->type < 0 || event->type >= ARRAY_SIZE(type_map)) 742 return -EFSCORRUPTED; 743 744 hme.domain = domain_map[event->domain]; 745 hme.type = type_map[event->type]; 746 747 /* fill in the event-specific details */ 748 switch (event->domain) { 749 case XFS_HEALTHMON_MOUNT: 750 switch (event->type) { 751 case XFS_HEALTHMON_LOST: 752 hme.e.lost.count = event->lostcount; 753 break; 754 case XFS_HEALTHMON_SHUTDOWN: 755 hme.e.shutdown.reasons = shutdown_mask(event->flags); 756 break; 757 default: 758 break; 759 } 760 break; 761 case XFS_HEALTHMON_FS: 762 hme.e.fs.mask = xfs_healthmon_fs_mask(event->fsmask); 763 break; 764 case XFS_HEALTHMON_RTGROUP: 765 hme.e.group.mask = xfs_healthmon_rtgroup_mask(event->grpmask); 766 hme.e.group.gno = event->group; 767 break; 768 case XFS_HEALTHMON_AG: 769 hme.e.group.mask = xfs_healthmon_perag_mask(event->grpmask); 770 hme.e.group.gno = event->group; 771 break; 772 case XFS_HEALTHMON_INODE: 773 hme.e.inode.mask = xfs_healthmon_inode_mask(event->imask); 774 hme.e.inode.ino = event->ino; 775 hme.e.inode.gen = event->gen; 776 break; 777 case XFS_HEALTHMON_DATADEV: 778 case XFS_HEALTHMON_LOGDEV: 779 case XFS_HEALTHMON_RTDEV: 780 hme.e.media.daddr = event->daddr; 781 hme.e.media.bbcount = event->bbcount; 782 break; 783 case XFS_HEALTHMON_FILERANGE: 784 hme.e.filerange.ino = event->fino; 785 hme.e.filerange.gen = event->fgen; 786 hme.e.filerange.pos = event->fpos; 787 hme.e.filerange.len = event->flen; 788 hme.e.filerange.error = abs(event->error); 789 break; 790 default: 791 break; 792 } 793 794 ASSERT(hm->bufhead + sizeof(hme) <= hm->bufsize); 795 796 /* copy formatted object to the outbuf */ 797 if (hm->bufhead + sizeof(hme) <= hm->bufsize) { 798 memcpy(hm->buffer + hm->bufhead, &hme, sizeof(hme)); 799 hm->bufhead += sizeof(hme); 800 } 801 802 return 0; 803 } 804 805 /* How many bytes are waiting in the outbuf to be copied? */ 806 static inline size_t 807 xfs_healthmon_outbuf_bytes( 808 struct xfs_healthmon *hm) 809 { 810 if (hm->bufhead > hm->buftail) 811 return hm->bufhead - hm->buftail; 812 return 0; 813 } 814 815 /* 816 * Do we have something for userspace to read? This can mean unmount events, 817 * events pending in the queue, or pending bytes in the outbuf. 818 */ 819 static inline bool 820 xfs_healthmon_has_eventdata( 821 struct xfs_healthmon *hm) 822 { 823 /* 824 * If the health monitor is already detached from the xfs_mount, we 825 * want reads to return 0 bytes even if there are no events, because 826 * userspace interprets that as EOF. If we race with deactivation, 827 * read_iter will take the necessary locks to discover that there are 828 * no events to send. 829 */ 830 if (hm->mount_cookie == DETACHED_MOUNT_COOKIE) 831 return true; 832 833 /* 834 * Either there are events waiting to be formatted into the buffer, or 835 * there's unread bytes in the buffer. 836 */ 837 return hm->events > 0 || xfs_healthmon_outbuf_bytes(hm) > 0; 838 } 839 840 /* Try to copy the rest of the outbuf to the iov iter. */ 841 STATIC ssize_t 842 xfs_healthmon_copybuf( 843 struct xfs_healthmon *hm, 844 struct iov_iter *to) 845 { 846 size_t to_copy; 847 size_t w = 0; 848 849 trace_xfs_healthmon_copybuf(hm, to); 850 851 to_copy = xfs_healthmon_outbuf_bytes(hm); 852 if (to_copy) { 853 w = copy_to_iter(hm->buffer + hm->buftail, to_copy, to); 854 if (!w) 855 return -EFAULT; 856 857 hm->buftail += w; 858 } 859 860 /* 861 * Nothing left to copy? Reset the output buffer cursors to the start 862 * since there's no live data in the buffer. 863 */ 864 if (xfs_healthmon_outbuf_bytes(hm) == 0) 865 xfs_healthmon_reset_outbuf(hm); 866 return w; 867 } 868 869 /* 870 * Return a health monitoring event for formatting into the output buffer if 871 * there's enough space in the outbuf and an event waiting for us. Caller 872 * must hold i_rwsem on the healthmon file. 873 */ 874 static inline struct xfs_healthmon_event * 875 xfs_healthmon_format_pop( 876 struct xfs_healthmon *hm) 877 { 878 struct xfs_healthmon_event *event; 879 880 if (hm->bufhead + sizeof(*event) > hm->bufsize) 881 return NULL; 882 883 mutex_lock(&hm->lock); 884 event = hm->first_event; 885 if (event) { 886 if (hm->last_event == event) 887 hm->last_event = NULL; 888 hm->first_event = event->next; 889 hm->events--; 890 891 trace_xfs_healthmon_pop(hm, event); 892 } 893 mutex_unlock(&hm->lock); 894 return event; 895 } 896 897 /* Allocate formatting buffer */ 898 STATIC int 899 xfs_healthmon_alloc_outbuf( 900 struct xfs_healthmon *hm, 901 size_t user_bufsize) 902 { 903 void *outbuf; 904 size_t bufsize = 905 min(XFS_HEALTHMON_MAX_OUTBUF, max(PAGE_SIZE, user_bufsize)); 906 907 outbuf = kzalloc(bufsize, GFP_KERNEL); 908 if (!outbuf) { 909 if (bufsize == PAGE_SIZE) 910 return -ENOMEM; 911 912 bufsize = PAGE_SIZE; 913 outbuf = kzalloc(bufsize, GFP_KERNEL); 914 if (!outbuf) 915 return -ENOMEM; 916 } 917 918 hm->buffer = outbuf; 919 hm->bufsize = bufsize; 920 hm->bufhead = 0; 921 hm->buftail = 0; 922 923 return 0; 924 } 925 926 /* 927 * Convey queued event data to userspace. First copy any remaining bytes in 928 * the outbuf, then format the oldest event into the outbuf and copy that too. 929 */ 930 STATIC ssize_t 931 xfs_healthmon_read_iter( 932 struct kiocb *iocb, 933 struct iov_iter *to) 934 { 935 struct file *file = iocb->ki_filp; 936 struct inode *inode = file_inode(file); 937 struct xfs_healthmon *hm = file->private_data; 938 struct xfs_healthmon_event *event; 939 size_t copied = 0; 940 ssize_t ret = 0; 941 942 if (file->f_flags & O_NONBLOCK) { 943 if (!xfs_healthmon_has_eventdata(hm) || !inode_trylock(inode)) 944 return -EAGAIN; 945 } else { 946 ret = wait_event_interruptible(hm->wait, 947 xfs_healthmon_has_eventdata(hm)); 948 if (ret) 949 return ret; 950 951 inode_lock(inode); 952 } 953 954 if (hm->bufsize == 0) { 955 ret = xfs_healthmon_alloc_outbuf(hm, iov_iter_count(to)); 956 if (ret) 957 goto out_unlock; 958 } 959 960 trace_xfs_healthmon_read_start(hm); 961 962 /* 963 * If there's anything left in the output buffer, copy that before 964 * formatting more events. 965 */ 966 ret = xfs_healthmon_copybuf(hm, to); 967 if (ret < 0) 968 goto out_unlock; 969 copied += ret; 970 971 while (iov_iter_count(to) > 0) { 972 /* Format the next events into the outbuf until it's full. */ 973 while ((event = xfs_healthmon_format_pop(hm)) != NULL) { 974 ret = xfs_healthmon_format_v0(hm, event); 975 kfree(event); 976 if (ret) 977 goto out_unlock; 978 } 979 980 /* Copy anything formatted into outbuf to userspace */ 981 ret = xfs_healthmon_copybuf(hm, to); 982 if (ret <= 0) 983 break; 984 985 copied += ret; 986 } 987 988 out_unlock: 989 trace_xfs_healthmon_read_finish(hm); 990 inode_unlock(inode); 991 return copied ?: ret; 992 } 993 994 /* Poll for available events. */ 995 STATIC __poll_t 996 xfs_healthmon_poll( 997 struct file *file, 998 struct poll_table_struct *wait) 999 { 1000 struct xfs_healthmon *hm = file->private_data; 1001 __poll_t mask = 0; 1002 1003 poll_wait(file, &hm->wait, wait); 1004 1005 if (xfs_healthmon_has_eventdata(hm)) 1006 mask |= EPOLLIN; 1007 return mask; 1008 } 1009 1010 /* Free the health monitoring information. */ 1011 STATIC int 1012 xfs_healthmon_release( 1013 struct inode *inode, 1014 struct file *file) 1015 { 1016 struct xfs_healthmon *hm = file->private_data; 1017 1018 trace_xfs_healthmon_release(hm); 1019 1020 /* 1021 * We might be closing the healthmon file before the filesystem 1022 * unmounts, because userspace processes can terminate at any time and 1023 * for any reason. Null out xfs_mount::m_healthmon so that another 1024 * process can create another health monitor file. 1025 */ 1026 xfs_healthmon_detach(hm); 1027 1028 /* 1029 * Wake up any readers that might be left. There shouldn't be any 1030 * because the only users of the waiter are read and poll. 1031 */ 1032 wake_up_all(&hm->wait); 1033 1034 xfs_healthmon_put(hm); 1035 return 0; 1036 } 1037 1038 /* Validate ioctl parameters. */ 1039 static inline bool 1040 xfs_healthmon_validate( 1041 const struct xfs_health_monitor *hmo) 1042 { 1043 if (hmo->flags & ~XFS_HEALTH_MONITOR_ALL) 1044 return false; 1045 if (hmo->format != XFS_HEALTH_MONITOR_FMT_V0) 1046 return false; 1047 if (memchr_inv(&hmo->pad, 0, sizeof(hmo->pad))) 1048 return false; 1049 return true; 1050 } 1051 1052 /* Emit some data about the health monitoring fd. */ 1053 static void 1054 xfs_healthmon_show_fdinfo( 1055 struct seq_file *m, 1056 struct file *file) 1057 { 1058 struct xfs_healthmon *hm = file->private_data; 1059 1060 mutex_lock(&hm->lock); 1061 seq_printf(m, "state:\t%s\ndev:\t%d:%d\nformat:\tv0\nevents:\t%llu\nlost:\t%llu\n", 1062 hm->mount_cookie == DETACHED_MOUNT_COOKIE ? 1063 "dead" : "alive", 1064 MAJOR(hm->dev), MINOR(hm->dev), 1065 hm->total_events, 1066 hm->total_lost); 1067 mutex_unlock(&hm->lock); 1068 } 1069 1070 /* Reconfigure the health monitor. */ 1071 STATIC long 1072 xfs_healthmon_reconfigure( 1073 struct file *file, 1074 unsigned int cmd, 1075 void __user *arg) 1076 { 1077 struct xfs_health_monitor hmo; 1078 struct xfs_healthmon *hm = file->private_data; 1079 1080 if (copy_from_user(&hmo, arg, sizeof(hmo))) 1081 return -EFAULT; 1082 1083 if (!xfs_healthmon_validate(&hmo)) 1084 return -EINVAL; 1085 1086 mutex_lock(&hm->lock); 1087 hm->verbose = !!(hmo.flags & XFS_HEALTH_MONITOR_VERBOSE); 1088 mutex_unlock(&hm->lock); 1089 1090 return 0; 1091 } 1092 1093 /* Does the fd point to the same filesystem as the one we're monitoring? */ 1094 STATIC long 1095 xfs_healthmon_file_on_monitored_fs( 1096 struct file *file, 1097 unsigned int cmd, 1098 void __user *arg) 1099 { 1100 struct xfs_health_file_on_monitored_fs hms; 1101 struct xfs_healthmon *hm = file->private_data; 1102 struct inode *hms_inode; 1103 1104 if (copy_from_user(&hms, arg, sizeof(hms))) 1105 return -EFAULT; 1106 1107 if (hms.flags) 1108 return -EINVAL; 1109 1110 CLASS(fd, hms_fd)(hms.fd); 1111 if (fd_empty(hms_fd)) 1112 return -EBADF; 1113 1114 hms_inode = file_inode(fd_file(hms_fd)); 1115 mutex_lock(&hm->lock); 1116 if (hm->mount_cookie != (uintptr_t)hms_inode->i_sb) { 1117 mutex_unlock(&hm->lock); 1118 return -ESTALE; 1119 } 1120 1121 mutex_unlock(&hm->lock); 1122 return 0; 1123 } 1124 1125 /* Handle ioctls for the health monitoring thread. */ 1126 STATIC long 1127 xfs_healthmon_ioctl( 1128 struct file *file, 1129 unsigned int cmd, 1130 unsigned long p) 1131 { 1132 void __user *arg = (void __user *)p; 1133 1134 switch (cmd) { 1135 case XFS_IOC_HEALTH_MONITOR: 1136 return xfs_healthmon_reconfigure(file, cmd, arg); 1137 case XFS_IOC_HEALTH_FD_ON_MONITORED_FS: 1138 return xfs_healthmon_file_on_monitored_fs(file, cmd, arg); 1139 default: 1140 break; 1141 } 1142 1143 return -ENOTTY; 1144 } 1145 1146 static const struct file_operations xfs_healthmon_fops = { 1147 .owner = THIS_MODULE, 1148 .show_fdinfo = xfs_healthmon_show_fdinfo, 1149 .read_iter = xfs_healthmon_read_iter, 1150 .poll = xfs_healthmon_poll, 1151 .release = xfs_healthmon_release, 1152 .unlocked_ioctl = xfs_healthmon_ioctl, 1153 }; 1154 1155 /* 1156 * Create a health monitoring file. Returns an index to the fd table or a 1157 * negative errno. 1158 */ 1159 long 1160 xfs_ioc_health_monitor( 1161 struct file *file, 1162 struct xfs_health_monitor __user *arg) 1163 { 1164 struct xfs_health_monitor hmo; 1165 struct xfs_healthmon_event *running_event; 1166 struct xfs_healthmon *hm; 1167 struct xfs_inode *ip = XFS_I(file_inode(file)); 1168 struct xfs_mount *mp = ip->i_mount; 1169 int ret; 1170 1171 /* 1172 * The only intended user of the health monitoring system should be the 1173 * xfs_healer daemon running on behalf of the whole filesystem in the 1174 * initial user namespace. IOWs, we don't allow unprivileged userspace 1175 * (they can use fsnotify) nor do we allow containers. 1176 */ 1177 if (!capable(CAP_SYS_ADMIN)) 1178 return -EPERM; 1179 if (ip->i_ino != mp->m_sb.sb_rootino) 1180 return -EPERM; 1181 if (current_user_ns() != &init_user_ns) 1182 return -EPERM; 1183 1184 if (copy_from_user(&hmo, arg, sizeof(hmo))) 1185 return -EFAULT; 1186 1187 if (!xfs_healthmon_validate(&hmo)) 1188 return -EINVAL; 1189 1190 hm = kzalloc_obj(*hm); 1191 if (!hm) 1192 return -ENOMEM; 1193 hm->dev = mp->m_super->s_dev; 1194 refcount_set(&hm->ref, 1); 1195 1196 mutex_init(&hm->lock); 1197 init_waitqueue_head(&hm->wait); 1198 1199 if (hmo.flags & XFS_HEALTH_MONITOR_VERBOSE) 1200 hm->verbose = true; 1201 1202 /* Queue up the first event that lets the client know we're running. */ 1203 running_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS); 1204 if (!running_event) { 1205 ret = -ENOMEM; 1206 goto out_hm; 1207 } 1208 running_event->type = XFS_HEALTHMON_RUNNING; 1209 running_event->domain = XFS_HEALTHMON_MOUNT; 1210 __xfs_healthmon_insert(hm, running_event); 1211 1212 /* 1213 * Preallocate the unmount event so that we can't fail to notify the 1214 * filesystem later. This is key for triggering fast exit of the 1215 * xfs_healer daemon. 1216 */ 1217 hm->unmount_event = kzalloc_obj(struct xfs_healthmon_event, GFP_NOFS); 1218 if (!hm->unmount_event) { 1219 ret = -ENOMEM; 1220 goto out_hm; 1221 } 1222 hm->unmount_event->type = XFS_HEALTHMON_UNMOUNT; 1223 hm->unmount_event->domain = XFS_HEALTHMON_MOUNT; 1224 1225 /* 1226 * Try to attach this health monitor to the xfs_mount. The monitor is 1227 * considered live and will receive events if this succeeds. 1228 */ 1229 ret = xfs_healthmon_attach(mp, hm); 1230 if (ret) 1231 goto out_hm; 1232 1233 /* 1234 * Create the anonymous file and install a fd for it. If it succeeds, 1235 * the file owns hm and can go away at any time, so we must not access 1236 * it again. This must go last because we can't undo a fd table 1237 * installation. 1238 */ 1239 ret = anon_inode_getfd("xfs_healthmon", &xfs_healthmon_fops, hm, 1240 O_CLOEXEC | O_RDONLY); 1241 if (ret < 0) 1242 goto out_mp; 1243 1244 trace_xfs_healthmon_create(mp->m_super->s_dev, hmo.flags, hmo.format); 1245 1246 return ret; 1247 1248 out_mp: 1249 xfs_healthmon_detach(hm); 1250 out_hm: 1251 ASSERT(refcount_read(&hm->ref) == 1); 1252 xfs_healthmon_put(hm); 1253 return ret; 1254 } 1255