1*21945e6cSDarrick J. Wong // SPDX-License-Identifier: GPL-2.0-or-later 2*21945e6cSDarrick J. Wong /* 3*21945e6cSDarrick J. Wong * Copyright (c) 2025 Oracle. All Rights Reserved. 4*21945e6cSDarrick J. Wong * Author: Darrick J. Wong <djwong@kernel.org> 5*21945e6cSDarrick J. Wong */ 6*21945e6cSDarrick J. Wong #include <linux/fs.h> 7*21945e6cSDarrick J. Wong #include <linux/fsnotify.h> 8*21945e6cSDarrick J. Wong #include <linux/mempool.h> 9*21945e6cSDarrick J. Wong #include <linux/fserror.h> 10*21945e6cSDarrick J. Wong 11*21945e6cSDarrick J. Wong #define FSERROR_DEFAULT_EVENT_POOL_SIZE (32) 12*21945e6cSDarrick J. Wong 13*21945e6cSDarrick J. Wong static struct mempool fserror_events_pool; 14*21945e6cSDarrick J. Wong 15*21945e6cSDarrick J. Wong void fserror_mount(struct super_block *sb) 16*21945e6cSDarrick J. Wong { 17*21945e6cSDarrick J. Wong /* 18*21945e6cSDarrick J. Wong * The pending error counter is biased by 1 so that we don't wake_var 19*21945e6cSDarrick J. Wong * until we're actually trying to unmount. 20*21945e6cSDarrick J. Wong */ 21*21945e6cSDarrick J. Wong refcount_set(&sb->s_pending_errors, 1); 22*21945e6cSDarrick J. Wong } 23*21945e6cSDarrick J. Wong 24*21945e6cSDarrick J. Wong void fserror_unmount(struct super_block *sb) 25*21945e6cSDarrick J. Wong { 26*21945e6cSDarrick J. Wong /* 27*21945e6cSDarrick J. Wong * If we don't drop the pending error count to zero, then wait for it 28*21945e6cSDarrick J. Wong * to drop below 1, which means that the pending errors cleared and 29*21945e6cSDarrick J. Wong * hopefully we didn't saturate with 1 billion+ concurrent events. 30*21945e6cSDarrick J. Wong */ 31*21945e6cSDarrick J. Wong if (!refcount_dec_and_test(&sb->s_pending_errors)) 32*21945e6cSDarrick J. Wong wait_var_event(&sb->s_pending_errors, 33*21945e6cSDarrick J. Wong refcount_read(&sb->s_pending_errors) < 1); 34*21945e6cSDarrick J. Wong } 35*21945e6cSDarrick J. Wong 36*21945e6cSDarrick J. Wong static inline void fserror_pending_dec(struct super_block *sb) 37*21945e6cSDarrick J. Wong { 38*21945e6cSDarrick J. Wong if (refcount_dec_and_test(&sb->s_pending_errors)) 39*21945e6cSDarrick J. Wong wake_up_var(&sb->s_pending_errors); 40*21945e6cSDarrick J. Wong } 41*21945e6cSDarrick J. Wong 42*21945e6cSDarrick J. Wong static inline void fserror_free_event(struct fserror_event *event) 43*21945e6cSDarrick J. Wong { 44*21945e6cSDarrick J. Wong fserror_pending_dec(event->sb); 45*21945e6cSDarrick J. Wong mempool_free(event, &fserror_events_pool); 46*21945e6cSDarrick J. Wong } 47*21945e6cSDarrick J. Wong 48*21945e6cSDarrick J. Wong static void fserror_worker(struct work_struct *work) 49*21945e6cSDarrick J. Wong { 50*21945e6cSDarrick J. Wong struct fserror_event *event = 51*21945e6cSDarrick J. Wong container_of(work, struct fserror_event, work); 52*21945e6cSDarrick J. Wong struct super_block *sb = event->sb; 53*21945e6cSDarrick J. Wong 54*21945e6cSDarrick J. Wong if (sb->s_flags & SB_ACTIVE) { 55*21945e6cSDarrick J. Wong struct fs_error_report report = { 56*21945e6cSDarrick J. Wong /* send positive error number to userspace */ 57*21945e6cSDarrick J. Wong .error = -event->error, 58*21945e6cSDarrick J. Wong .inode = event->inode, 59*21945e6cSDarrick J. Wong .sb = event->sb, 60*21945e6cSDarrick J. Wong }; 61*21945e6cSDarrick J. Wong 62*21945e6cSDarrick J. Wong if (sb->s_op->report_error) 63*21945e6cSDarrick J. Wong sb->s_op->report_error(event); 64*21945e6cSDarrick J. Wong 65*21945e6cSDarrick J. Wong fsnotify(FS_ERROR, &report, FSNOTIFY_EVENT_ERROR, NULL, NULL, 66*21945e6cSDarrick J. Wong NULL, 0); 67*21945e6cSDarrick J. Wong } 68*21945e6cSDarrick J. Wong 69*21945e6cSDarrick J. Wong iput(event->inode); 70*21945e6cSDarrick J. Wong fserror_free_event(event); 71*21945e6cSDarrick J. Wong } 72*21945e6cSDarrick J. Wong 73*21945e6cSDarrick J. Wong static inline struct fserror_event *fserror_alloc_event(struct super_block *sb, 74*21945e6cSDarrick J. Wong gfp_t gfp_flags) 75*21945e6cSDarrick J. Wong { 76*21945e6cSDarrick J. Wong struct fserror_event *event = NULL; 77*21945e6cSDarrick J. Wong 78*21945e6cSDarrick J. Wong /* 79*21945e6cSDarrick J. Wong * If pending_errors already reached zero or is no longer active, 80*21945e6cSDarrick J. Wong * the superblock is being deactivated so there's no point in 81*21945e6cSDarrick J. Wong * continuing. 82*21945e6cSDarrick J. Wong * 83*21945e6cSDarrick J. Wong * The order of the check of s_pending_errors and SB_ACTIVE are 84*21945e6cSDarrick J. Wong * mandated by order of accesses in generic_shutdown_super and 85*21945e6cSDarrick J. Wong * fserror_unmount. Barriers are implicitly provided by the refcount 86*21945e6cSDarrick J. Wong * manipulations in this function and fserror_unmount. 87*21945e6cSDarrick J. Wong */ 88*21945e6cSDarrick J. Wong if (!refcount_inc_not_zero(&sb->s_pending_errors)) 89*21945e6cSDarrick J. Wong return NULL; 90*21945e6cSDarrick J. Wong if (!(sb->s_flags & SB_ACTIVE)) 91*21945e6cSDarrick J. Wong goto out_pending; 92*21945e6cSDarrick J. Wong 93*21945e6cSDarrick J. Wong event = mempool_alloc(&fserror_events_pool, gfp_flags); 94*21945e6cSDarrick J. Wong if (!event) 95*21945e6cSDarrick J. Wong goto out_pending; 96*21945e6cSDarrick J. Wong 97*21945e6cSDarrick J. Wong /* mempool_alloc doesn't support GFP_ZERO */ 98*21945e6cSDarrick J. Wong memset(event, 0, sizeof(*event)); 99*21945e6cSDarrick J. Wong event->sb = sb; 100*21945e6cSDarrick J. Wong INIT_WORK(&event->work, fserror_worker); 101*21945e6cSDarrick J. Wong 102*21945e6cSDarrick J. Wong return event; 103*21945e6cSDarrick J. Wong 104*21945e6cSDarrick J. Wong out_pending: 105*21945e6cSDarrick J. Wong fserror_pending_dec(sb); 106*21945e6cSDarrick J. Wong return NULL; 107*21945e6cSDarrick J. Wong } 108*21945e6cSDarrick J. Wong 109*21945e6cSDarrick J. Wong /** 110*21945e6cSDarrick J. Wong * fserror_report - report a filesystem error of some kind 111*21945e6cSDarrick J. Wong * 112*21945e6cSDarrick J. Wong * @sb: superblock of the filesystem 113*21945e6cSDarrick J. Wong * @inode: inode within that filesystem, if applicable 114*21945e6cSDarrick J. Wong * @type: type of error encountered 115*21945e6cSDarrick J. Wong * @pos: start of inode range affected, if applicable 116*21945e6cSDarrick J. Wong * @len: length of inode range affected, if applicable 117*21945e6cSDarrick J. Wong * @error: error number encountered, must be negative 118*21945e6cSDarrick J. Wong * @gfp: memory allocation flags for conveying the event to a worker, 119*21945e6cSDarrick J. Wong * since this function can be called from atomic contexts 120*21945e6cSDarrick J. Wong * 121*21945e6cSDarrick J. Wong * Report details of a filesystem error to the super_operations::report_error 122*21945e6cSDarrick J. Wong * callback if present; and to fsnotify for distribution to userspace. @sb, 123*21945e6cSDarrick J. Wong * @gfp, @type, and @error must all be specified. For file I/O errors, the 124*21945e6cSDarrick J. Wong * @inode, @pos, and @len fields must also be specified. For file metadata 125*21945e6cSDarrick J. Wong * errors, @inode must be specified. If @inode is not NULL, then @inode->i_sb 126*21945e6cSDarrick J. Wong * must point to @sb. 127*21945e6cSDarrick J. Wong * 128*21945e6cSDarrick J. Wong * Reporting work is deferred to a workqueue to ensure that ->report_error is 129*21945e6cSDarrick J. Wong * called from process context without any locks held. An active reference to 130*21945e6cSDarrick J. Wong * the inode is maintained until event handling is complete, and unmount will 131*21945e6cSDarrick J. Wong * wait for queued events to drain. 132*21945e6cSDarrick J. Wong */ 133*21945e6cSDarrick J. Wong void fserror_report(struct super_block *sb, struct inode *inode, 134*21945e6cSDarrick J. Wong enum fserror_type type, loff_t pos, u64 len, int error, 135*21945e6cSDarrick J. Wong gfp_t gfp) 136*21945e6cSDarrick J. Wong { 137*21945e6cSDarrick J. Wong struct fserror_event *event; 138*21945e6cSDarrick J. Wong 139*21945e6cSDarrick J. Wong /* sb and inode must be from the same filesystem */ 140*21945e6cSDarrick J. Wong WARN_ON_ONCE(inode && inode->i_sb != sb); 141*21945e6cSDarrick J. Wong 142*21945e6cSDarrick J. Wong /* error number must be negative */ 143*21945e6cSDarrick J. Wong WARN_ON_ONCE(error >= 0); 144*21945e6cSDarrick J. Wong 145*21945e6cSDarrick J. Wong event = fserror_alloc_event(sb, gfp); 146*21945e6cSDarrick J. Wong if (!event) 147*21945e6cSDarrick J. Wong goto lost; 148*21945e6cSDarrick J. Wong 149*21945e6cSDarrick J. Wong event->type = type; 150*21945e6cSDarrick J. Wong event->pos = pos; 151*21945e6cSDarrick J. Wong event->len = len; 152*21945e6cSDarrick J. Wong event->error = error; 153*21945e6cSDarrick J. Wong 154*21945e6cSDarrick J. Wong /* 155*21945e6cSDarrick J. Wong * Can't iput from non-sleeping context, so grabbing another reference 156*21945e6cSDarrick J. Wong * to the inode must be the last thing before submitting the event. 157*21945e6cSDarrick J. Wong */ 158*21945e6cSDarrick J. Wong if (inode) { 159*21945e6cSDarrick J. Wong event->inode = igrab(inode); 160*21945e6cSDarrick J. Wong if (!event->inode) 161*21945e6cSDarrick J. Wong goto lost_event; 162*21945e6cSDarrick J. Wong } 163*21945e6cSDarrick J. Wong 164*21945e6cSDarrick J. Wong /* 165*21945e6cSDarrick J. Wong * Use schedule_work here even if we're already in process context so 166*21945e6cSDarrick J. Wong * that fsnotify and super_operations::report_error implementations are 167*21945e6cSDarrick J. Wong * guaranteed to run in process context without any locks held. Since 168*21945e6cSDarrick J. Wong * errors are supposed to be rare, the overhead shouldn't kill us any 169*21945e6cSDarrick J. Wong * more than the failing device will. 170*21945e6cSDarrick J. Wong */ 171*21945e6cSDarrick J. Wong schedule_work(&event->work); 172*21945e6cSDarrick J. Wong return; 173*21945e6cSDarrick J. Wong 174*21945e6cSDarrick J. Wong lost_event: 175*21945e6cSDarrick J. Wong fserror_free_event(event); 176*21945e6cSDarrick J. Wong lost: 177*21945e6cSDarrick J. Wong if (inode) 178*21945e6cSDarrick J. Wong pr_err_ratelimited( 179*21945e6cSDarrick J. Wong "%s: lost file I/O error report for ino %lu type %u pos 0x%llx len 0x%llx error %d", 180*21945e6cSDarrick J. Wong sb->s_id, inode->i_ino, type, pos, len, error); 181*21945e6cSDarrick J. Wong else 182*21945e6cSDarrick J. Wong pr_err_ratelimited( 183*21945e6cSDarrick J. Wong "%s: lost filesystem error report for type %u error %d", 184*21945e6cSDarrick J. Wong sb->s_id, type, error); 185*21945e6cSDarrick J. Wong } 186*21945e6cSDarrick J. Wong EXPORT_SYMBOL_GPL(fserror_report); 187*21945e6cSDarrick J. Wong 188*21945e6cSDarrick J. Wong static int __init fserror_init(void) 189*21945e6cSDarrick J. Wong { 190*21945e6cSDarrick J. Wong return mempool_init_kmalloc_pool(&fserror_events_pool, 191*21945e6cSDarrick J. Wong FSERROR_DEFAULT_EVENT_POOL_SIZE, 192*21945e6cSDarrick J. Wong sizeof(struct fserror_event)); 193*21945e6cSDarrick J. Wong } 194*21945e6cSDarrick J. Wong fs_initcall(fserror_init); 195