xref: /linux/fs/notify/fanotify/fanotify_user.c (revision 7f4f3b14e8079ecde096bd734af10e30d40c27b7)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <linux/fanotify.h>
3 #include <linux/fcntl.h>
4 #include <linux/file.h>
5 #include <linux/fs.h>
6 #include <linux/anon_inodes.h>
7 #include <linux/fsnotify_backend.h>
8 #include <linux/init.h>
9 #include <linux/mount.h>
10 #include <linux/namei.h>
11 #include <linux/poll.h>
12 #include <linux/security.h>
13 #include <linux/syscalls.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/uaccess.h>
17 #include <linux/compat.h>
18 #include <linux/sched/signal.h>
19 #include <linux/memcontrol.h>
20 #include <linux/statfs.h>
21 #include <linux/exportfs.h>
22 
23 #include <asm/ioctls.h>
24 
25 #include "../fsnotify.h"
26 #include "../fdinfo.h"
27 #include "fanotify.h"
28 
/*
 * Built-in default limits.
 * NOTE(review): the code that consumes these defaults is not in this part of
 * the file; FEE presumably stands for "fanotify error event" - confirm.
 */
#define FANOTIFY_DEFAULT_MAX_EVENTS	16384
#define FANOTIFY_OLD_DEFAULT_MAX_MARKS	8192
#define FANOTIFY_DEFAULT_MAX_GROUPS	128
#define FANOTIFY_DEFAULT_FEE_POOL_SIZE	32

/*
 * The legacy fanotify marks limit (8192) was per group.  A tunable limit of
 * marks per user was introduced later, similar to inotify.  Effectively, the
 * legacy limit of fanotify marks per user is
 * <max marks per group> * <max groups per user>.
 * This default limit (1M) also happens to match the increased limit of inotify
 * max_user_watches since v5.10.
 */
#define FANOTIFY_DEFAULT_MAX_USER_MARKS	\
	(FANOTIFY_OLD_DEFAULT_MAX_MARKS * FANOTIFY_DEFAULT_MAX_GROUPS)

/*
 * Most of the memory cost of adding an inode mark is pinning the marked inode.
 * The size of the filesystem inode struct is not uniform across filesystems,
 * so double the size of a VFS inode is used as a conservative approximation.
 */
#define INODE_MARK_COST	(2 * sizeof(struct inode))

/*
 * Configurable via /proc/sys/fs/fanotify/ (the "max_queued_events" entry of
 * fanotify_table when CONFIG_SYSCTL is enabled).
 */
static int fanotify_max_queued_events __read_mostly;
53 
54 #ifdef CONFIG_SYSCTL
55 
56 #include <linux/sysctl.h>
57 
58 static long ft_zero = 0;
59 static long ft_int_max = INT_MAX;
60 
61 static struct ctl_table fanotify_table[] = {
62 	{
63 		.procname	= "max_user_groups",
64 		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS],
65 		.maxlen		= sizeof(long),
66 		.mode		= 0644,
67 		.proc_handler	= proc_doulongvec_minmax,
68 		.extra1		= &ft_zero,
69 		.extra2		= &ft_int_max,
70 	},
71 	{
72 		.procname	= "max_user_marks",
73 		.data	= &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS],
74 		.maxlen		= sizeof(long),
75 		.mode		= 0644,
76 		.proc_handler	= proc_doulongvec_minmax,
77 		.extra1		= &ft_zero,
78 		.extra2		= &ft_int_max,
79 	},
80 	{
81 		.procname	= "max_queued_events",
82 		.data		= &fanotify_max_queued_events,
83 		.maxlen		= sizeof(int),
84 		.mode		= 0644,
85 		.proc_handler	= proc_dointvec_minmax,
86 		.extra1		= SYSCTL_ZERO
87 	},
88 };
89 
90 static void __init fanotify_sysctls_init(void)
91 {
92 	register_sysctl("fs/fanotify", fanotify_table);
93 }
94 #else
95 #define fanotify_sysctls_init() do { } while (0)
96 #endif /* CONFIG_SYSCTL */
97 
/*
 * Every flag that fanotify_init() accepts in its event_f_flags parameter.
 *
 * struct file keeps internal and external open flags together in f_flags.
 * Only the external ones may be passed in event_f_flags; internal flags
 * such as FMODE_NONOTIFY, FMODE_EXEC and FMODE_NOCMTIME are deliberately
 * left out of this mask.
 */
#define FANOTIFY_INIT_ALL_EVENT_F_BITS \
	(O_ACCMODE | O_APPEND | O_NONBLOCK | __O_SYNC | O_DSYNC | \
	 O_CLOEXEC | O_LARGEFILE | O_NOATIME)
110 
/* fsnotify callbacks for fanotify groups; the definition is not in this part of the file. */
extern const struct fsnotify_ops fanotify_fsnotify_ops;

/*
 * Slab caches for marks and for the fid, path and permission event types.
 * NOTE(review): nothing in the part of the file shown here assigns them;
 * presumably they are created once at init time (hence __ro_after_init) -
 * confirm against the setup code.
 */
struct kmem_cache *fanotify_mark_cache __ro_after_init;
struct kmem_cache *fanotify_fid_event_cachep __ro_after_init;
struct kmem_cache *fanotify_path_event_cachep __ro_after_init;
struct kmem_cache *fanotify_perm_event_cachep __ro_after_init;
117 
/* Info records copied to userspace are padded to this many bytes. */
#define FANOTIFY_EVENT_ALIGN 4

/*
 * Fixed-size parts of the info records.  Every expansion is fully
 * parenthesised so that the macros behave like a single operand wherever
 * they are used.
 */
/* fid record: info header followed by a struct file_handle header */
#define FANOTIFY_FID_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle))
/* pidfd record */
#define FANOTIFY_PIDFD_INFO_HDR_LEN \
	(sizeof(struct fanotify_event_info_pidfd))
/* error record */
#define FANOTIFY_ERROR_INFO_LEN \
	(sizeof(struct fanotify_event_info_error))
125 
126 static int fanotify_fid_info_len(int fh_len, int name_len)
127 {
128 	int info_len = fh_len;
129 
130 	if (name_len)
131 		info_len += name_len + 1;
132 
133 	return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len,
134 		       FANOTIFY_EVENT_ALIGN);
135 }
136 
137 /* FAN_RENAME may have one or two dir+name info records */
138 static int fanotify_dir_name_info_len(struct fanotify_event *event)
139 {
140 	struct fanotify_info *info = fanotify_event_info(event);
141 	int dir_fh_len = fanotify_event_dir_fh_len(event);
142 	int dir2_fh_len = fanotify_event_dir2_fh_len(event);
143 	int info_len = 0;
144 
145 	if (dir_fh_len)
146 		info_len += fanotify_fid_info_len(dir_fh_len,
147 						  info->name_len);
148 	if (dir2_fh_len)
149 		info_len += fanotify_fid_info_len(dir2_fh_len,
150 						  info->name2_len);
151 
152 	return info_len;
153 }
154 
155 static size_t fanotify_event_len(unsigned int info_mode,
156 				 struct fanotify_event *event)
157 {
158 	size_t event_len = FAN_EVENT_METADATA_LEN;
159 	int fh_len;
160 	int dot_len = 0;
161 
162 	if (!info_mode)
163 		return event_len;
164 
165 	if (fanotify_is_error_event(event->mask))
166 		event_len += FANOTIFY_ERROR_INFO_LEN;
167 
168 	if (fanotify_event_has_any_dir_fh(event)) {
169 		event_len += fanotify_dir_name_info_len(event);
170 	} else if ((info_mode & FAN_REPORT_NAME) &&
171 		   (event->mask & FAN_ONDIR)) {
172 		/*
173 		 * With group flag FAN_REPORT_NAME, if name was not recorded in
174 		 * event on a directory, we will report the name ".".
175 		 */
176 		dot_len = 1;
177 	}
178 
179 	if (info_mode & FAN_REPORT_PIDFD)
180 		event_len += FANOTIFY_PIDFD_INFO_HDR_LEN;
181 
182 	if (fanotify_event_has_object_fh(event)) {
183 		fh_len = fanotify_event_object_fh_len(event);
184 		event_len += fanotify_fid_info_len(fh_len, dot_len);
185 	}
186 
187 	return event_len;
188 }
189 
190 /*
191  * Remove an hashed event from merge hash table.
192  */
193 static void fanotify_unhash_event(struct fsnotify_group *group,
194 				  struct fanotify_event *event)
195 {
196 	assert_spin_locked(&group->notification_lock);
197 
198 	pr_debug("%s: group=%p event=%p bucket=%u\n", __func__,
199 		 group, event, fanotify_event_hash_bucket(group, event));
200 
201 	if (WARN_ON_ONCE(hlist_unhashed(&event->merge_list)))
202 		return;
203 
204 	hlist_del_init(&event->merge_list);
205 }
206 
207 /*
208  * Get an fanotify notification event if one exists and is small
209  * enough to fit in "count". Return an error pointer if the count
210  * is not large enough. When permission event is dequeued, its state is
211  * updated accordingly.
212  */
213 static struct fanotify_event *get_one_event(struct fsnotify_group *group,
214 					    size_t count)
215 {
216 	size_t event_size;
217 	struct fanotify_event *event = NULL;
218 	struct fsnotify_event *fsn_event;
219 	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
220 
221 	pr_debug("%s: group=%p count=%zd\n", __func__, group, count);
222 
223 	spin_lock(&group->notification_lock);
224 	fsn_event = fsnotify_peek_first_event(group);
225 	if (!fsn_event)
226 		goto out;
227 
228 	event = FANOTIFY_E(fsn_event);
229 	event_size = fanotify_event_len(info_mode, event);
230 
231 	if (event_size > count) {
232 		event = ERR_PTR(-EINVAL);
233 		goto out;
234 	}
235 
236 	/*
237 	 * Held the notification_lock the whole time, so this is the
238 	 * same event we peeked above.
239 	 */
240 	fsnotify_remove_first_event(group);
241 	if (fanotify_is_perm_event(event->mask))
242 		FANOTIFY_PERM(event)->state = FAN_EVENT_REPORTED;
243 	if (fanotify_is_hashed_event(event->mask))
244 		fanotify_unhash_event(group, event);
245 out:
246 	spin_unlock(&group->notification_lock);
247 	return event;
248 }
249 
250 static int create_fd(struct fsnotify_group *group, const struct path *path,
251 		     struct file **file)
252 {
253 	int client_fd;
254 	struct file *new_file;
255 
256 	client_fd = get_unused_fd_flags(group->fanotify_data.f_flags);
257 	if (client_fd < 0)
258 		return client_fd;
259 
260 	/*
261 	 * we need a new file handle for the userspace program so it can read even if it was
262 	 * originally opened O_WRONLY.
263 	 */
264 	new_file = dentry_open(path,
265 			       group->fanotify_data.f_flags | __FMODE_NONOTIFY,
266 			       current_cred());
267 	if (IS_ERR(new_file)) {
268 		put_unused_fd(client_fd);
269 		client_fd = PTR_ERR(new_file);
270 	} else {
271 		*file = new_file;
272 	}
273 
274 	return client_fd;
275 }
276 
277 static int process_access_response_info(const char __user *info,
278 					size_t info_len,
279 				struct fanotify_response_info_audit_rule *friar)
280 {
281 	if (info_len != sizeof(*friar))
282 		return -EINVAL;
283 
284 	if (copy_from_user(friar, info, sizeof(*friar)))
285 		return -EFAULT;
286 
287 	if (friar->hdr.type != FAN_RESPONSE_INFO_AUDIT_RULE)
288 		return -EINVAL;
289 	if (friar->hdr.pad != 0)
290 		return -EINVAL;
291 	if (friar->hdr.len != sizeof(*friar))
292 		return -EINVAL;
293 
294 	return info_len;
295 }
296 
297 /*
298  * Finish processing of permission event by setting it to ANSWERED state and
299  * drop group->notification_lock.
300  */
301 static void finish_permission_event(struct fsnotify_group *group,
302 				    struct fanotify_perm_event *event, u32 response,
303 				    struct fanotify_response_info_audit_rule *friar)
304 				    __releases(&group->notification_lock)
305 {
306 	bool destroy = false;
307 
308 	assert_spin_locked(&group->notification_lock);
309 	event->response = response & ~FAN_INFO;
310 	if (response & FAN_INFO)
311 		memcpy(&event->audit_rule, friar, sizeof(*friar));
312 
313 	if (event->state == FAN_EVENT_CANCELED)
314 		destroy = true;
315 	else
316 		event->state = FAN_EVENT_ANSWERED;
317 	spin_unlock(&group->notification_lock);
318 	if (destroy)
319 		fsnotify_destroy_event(group, &event->fae.fse);
320 }
321 
322 static int process_access_response(struct fsnotify_group *group,
323 				   struct fanotify_response *response_struct,
324 				   const char __user *info,
325 				   size_t info_len)
326 {
327 	struct fanotify_perm_event *event;
328 	int fd = response_struct->fd;
329 	u32 response = response_struct->response;
330 	int ret = info_len;
331 	struct fanotify_response_info_audit_rule friar;
332 
333 	pr_debug("%s: group=%p fd=%d response=%u buf=%p size=%zu\n", __func__,
334 		 group, fd, response, info, info_len);
335 	/*
336 	 * make sure the response is valid, if invalid we do nothing and either
337 	 * userspace can send a valid response or we will clean it up after the
338 	 * timeout
339 	 */
340 	if (response & ~FANOTIFY_RESPONSE_VALID_MASK)
341 		return -EINVAL;
342 
343 	switch (response & FANOTIFY_RESPONSE_ACCESS) {
344 	case FAN_ALLOW:
345 	case FAN_DENY:
346 		break;
347 	default:
348 		return -EINVAL;
349 	}
350 
351 	if ((response & FAN_AUDIT) && !FAN_GROUP_FLAG(group, FAN_ENABLE_AUDIT))
352 		return -EINVAL;
353 
354 	if (response & FAN_INFO) {
355 		ret = process_access_response_info(info, info_len, &friar);
356 		if (ret < 0)
357 			return ret;
358 		if (fd == FAN_NOFD)
359 			return ret;
360 	} else {
361 		ret = 0;
362 	}
363 
364 	if (fd < 0)
365 		return -EINVAL;
366 
367 	spin_lock(&group->notification_lock);
368 	list_for_each_entry(event, &group->fanotify_data.access_list,
369 			    fae.fse.list) {
370 		if (event->fd != fd)
371 			continue;
372 
373 		list_del_init(&event->fae.fse.list);
374 		finish_permission_event(group, event, response, &friar);
375 		wake_up(&group->fanotify_data.access_waitq);
376 		return ret;
377 	}
378 	spin_unlock(&group->notification_lock);
379 
380 	return -ENOENT;
381 }
382 
383 static size_t copy_error_info_to_user(struct fanotify_event *event,
384 				      char __user *buf, int count)
385 {
386 	struct fanotify_event_info_error info = { };
387 	struct fanotify_error_event *fee = FANOTIFY_EE(event);
388 
389 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR;
390 	info.hdr.len = FANOTIFY_ERROR_INFO_LEN;
391 
392 	if (WARN_ON(count < info.hdr.len))
393 		return -EFAULT;
394 
395 	info.error = fee->error;
396 	info.error_count = fee->err_count;
397 
398 	if (copy_to_user(buf, &info, sizeof(info)))
399 		return -EFAULT;
400 
401 	return info.hdr.len;
402 }
403 
404 static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh,
405 				 int info_type, const char *name,
406 				 size_t name_len,
407 				 char __user *buf, size_t count)
408 {
409 	struct fanotify_event_info_fid info = { };
410 	struct file_handle handle = { };
411 	unsigned char bounce[FANOTIFY_INLINE_FH_LEN], *fh_buf;
412 	size_t fh_len = fh ? fh->len : 0;
413 	size_t info_len = fanotify_fid_info_len(fh_len, name_len);
414 	size_t len = info_len;
415 
416 	pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n",
417 		 __func__, fh_len, name_len, info_len, count);
418 
419 	if (WARN_ON_ONCE(len < sizeof(info) || len > count))
420 		return -EFAULT;
421 
422 	/*
423 	 * Copy event info fid header followed by variable sized file handle
424 	 * and optionally followed by variable sized filename.
425 	 */
426 	switch (info_type) {
427 	case FAN_EVENT_INFO_TYPE_FID:
428 	case FAN_EVENT_INFO_TYPE_DFID:
429 		if (WARN_ON_ONCE(name_len))
430 			return -EFAULT;
431 		break;
432 	case FAN_EVENT_INFO_TYPE_DFID_NAME:
433 	case FAN_EVENT_INFO_TYPE_OLD_DFID_NAME:
434 	case FAN_EVENT_INFO_TYPE_NEW_DFID_NAME:
435 		if (WARN_ON_ONCE(!name || !name_len))
436 			return -EFAULT;
437 		break;
438 	default:
439 		return -EFAULT;
440 	}
441 
442 	info.hdr.info_type = info_type;
443 	info.hdr.len = len;
444 	info.fsid = *fsid;
445 	if (copy_to_user(buf, &info, sizeof(info)))
446 		return -EFAULT;
447 
448 	buf += sizeof(info);
449 	len -= sizeof(info);
450 	if (WARN_ON_ONCE(len < sizeof(handle)))
451 		return -EFAULT;
452 
453 	handle.handle_type = fh->type;
454 	handle.handle_bytes = fh_len;
455 
456 	/* Mangle handle_type for bad file_handle */
457 	if (!fh_len)
458 		handle.handle_type = FILEID_INVALID;
459 
460 	if (copy_to_user(buf, &handle, sizeof(handle)))
461 		return -EFAULT;
462 
463 	buf += sizeof(handle);
464 	len -= sizeof(handle);
465 	if (WARN_ON_ONCE(len < fh_len))
466 		return -EFAULT;
467 
468 	/*
469 	 * For an inline fh and inline file name, copy through stack to exclude
470 	 * the copy from usercopy hardening protections.
471 	 */
472 	fh_buf = fanotify_fh_buf(fh);
473 	if (fh_len <= FANOTIFY_INLINE_FH_LEN) {
474 		memcpy(bounce, fh_buf, fh_len);
475 		fh_buf = bounce;
476 	}
477 	if (copy_to_user(buf, fh_buf, fh_len))
478 		return -EFAULT;
479 
480 	buf += fh_len;
481 	len -= fh_len;
482 
483 	if (name_len) {
484 		/* Copy the filename with terminating null */
485 		name_len++;
486 		if (WARN_ON_ONCE(len < name_len))
487 			return -EFAULT;
488 
489 		if (copy_to_user(buf, name, name_len))
490 			return -EFAULT;
491 
492 		buf += name_len;
493 		len -= name_len;
494 	}
495 
496 	/* Pad with 0's */
497 	WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN);
498 	if (len > 0 && clear_user(buf, len))
499 		return -EFAULT;
500 
501 	return info_len;
502 }
503 
504 static int copy_pidfd_info_to_user(int pidfd,
505 				   char __user *buf,
506 				   size_t count)
507 {
508 	struct fanotify_event_info_pidfd info = { };
509 	size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN;
510 
511 	if (WARN_ON_ONCE(info_len > count))
512 		return -EFAULT;
513 
514 	info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD;
515 	info.hdr.len = info_len;
516 	info.pidfd = pidfd;
517 
518 	if (copy_to_user(buf, &info, info_len))
519 		return -EFAULT;
520 
521 	return info_len;
522 }
523 
524 static int copy_info_records_to_user(struct fanotify_event *event,
525 				     struct fanotify_info *info,
526 				     unsigned int info_mode, int pidfd,
527 				     char __user *buf, size_t count)
528 {
529 	int ret, total_bytes = 0, info_type = 0;
530 	unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS;
531 	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
532 
533 	/*
534 	 * Event info records order is as follows:
535 	 * 1. dir fid + name
536 	 * 2. (optional) new dir fid + new name
537 	 * 3. (optional) child fid
538 	 */
539 	if (fanotify_event_has_dir_fh(event)) {
540 		info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME :
541 					     FAN_EVENT_INFO_TYPE_DFID;
542 
543 		/* FAN_RENAME uses special info types */
544 		if (event->mask & FAN_RENAME)
545 			info_type = FAN_EVENT_INFO_TYPE_OLD_DFID_NAME;
546 
547 		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
548 					    fanotify_info_dir_fh(info),
549 					    info_type,
550 					    fanotify_info_name(info),
551 					    info->name_len, buf, count);
552 		if (ret < 0)
553 			return ret;
554 
555 		buf += ret;
556 		count -= ret;
557 		total_bytes += ret;
558 	}
559 
560 	/* New dir fid+name may be reported in addition to old dir fid+name */
561 	if (fanotify_event_has_dir2_fh(event)) {
562 		info_type = FAN_EVENT_INFO_TYPE_NEW_DFID_NAME;
563 		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
564 					    fanotify_info_dir2_fh(info),
565 					    info_type,
566 					    fanotify_info_name2(info),
567 					    info->name2_len, buf, count);
568 		if (ret < 0)
569 			return ret;
570 
571 		buf += ret;
572 		count -= ret;
573 		total_bytes += ret;
574 	}
575 
576 	if (fanotify_event_has_object_fh(event)) {
577 		const char *dot = NULL;
578 		int dot_len = 0;
579 
580 		if (fid_mode == FAN_REPORT_FID || info_type) {
581 			/*
582 			 * With only group flag FAN_REPORT_FID only type FID is
583 			 * reported. Second info record type is always FID.
584 			 */
585 			info_type = FAN_EVENT_INFO_TYPE_FID;
586 		} else if ((fid_mode & FAN_REPORT_NAME) &&
587 			   (event->mask & FAN_ONDIR)) {
588 			/*
589 			 * With group flag FAN_REPORT_NAME, if name was not
590 			 * recorded in an event on a directory, report the name
591 			 * "." with info type DFID_NAME.
592 			 */
593 			info_type = FAN_EVENT_INFO_TYPE_DFID_NAME;
594 			dot = ".";
595 			dot_len = 1;
596 		} else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) ||
597 			   (event->mask & FAN_ONDIR)) {
598 			/*
599 			 * With group flag FAN_REPORT_DIR_FID, a single info
600 			 * record has type DFID for directory entry modification
601 			 * event and for event on a directory.
602 			 */
603 			info_type = FAN_EVENT_INFO_TYPE_DFID;
604 		} else {
605 			/*
606 			 * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID,
607 			 * a single info record has type FID for event on a
608 			 * non-directory, when there is no directory to report.
609 			 * For example, on FAN_DELETE_SELF event.
610 			 */
611 			info_type = FAN_EVENT_INFO_TYPE_FID;
612 		}
613 
614 		ret = copy_fid_info_to_user(fanotify_event_fsid(event),
615 					    fanotify_event_object_fh(event),
616 					    info_type, dot, dot_len,
617 					    buf, count);
618 		if (ret < 0)
619 			return ret;
620 
621 		buf += ret;
622 		count -= ret;
623 		total_bytes += ret;
624 	}
625 
626 	if (pidfd_mode) {
627 		ret = copy_pidfd_info_to_user(pidfd, buf, count);
628 		if (ret < 0)
629 			return ret;
630 
631 		buf += ret;
632 		count -= ret;
633 		total_bytes += ret;
634 	}
635 
636 	if (fanotify_is_error_event(event->mask)) {
637 		ret = copy_error_info_to_user(event, buf, count);
638 		if (ret < 0)
639 			return ret;
640 		buf += ret;
641 		count -= ret;
642 		total_bytes += ret;
643 	}
644 
645 	return total_bytes;
646 }
647 
/*
 * Copy one dequeued event to the listener's buffer: the fixed metadata
 * followed, for groups with an info mode, by the info records.
 *
 * An fd for the event's path and (with FAN_REPORT_PIDFD) a pidfd are only
 * reserved at first; they are installed in the listener's fd table after
 * everything was copied successfully and released again on failure.
 *
 * Returns the event length on success, a negative errno on failure, or 0
 * when the event is silently dropped (-EOPENSTALE without
 * FAN_REPORT_FD_ERROR).
 */
static ssize_t copy_event_to_user(struct fsnotify_group *group,
				  struct fanotify_event *event,
				  char __user *buf, size_t count)
{
	struct fanotify_event_metadata metadata;
	const struct path *path = fanotify_event_path(event);
	struct fanotify_info *info = fanotify_event_info(event);
	unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES);
	unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD;
	struct file *f = NULL, *pidfd_file = NULL;
	/* defaults reported when no fd / pidfd gets created below */
	int ret, pidfd = -ESRCH, fd = -EBADF;

	pr_debug("%s: group=%p event=%p\n", __func__, group, event);

	metadata.event_len = fanotify_event_len(info_mode, event);
	metadata.metadata_len = FAN_EVENT_METADATA_LEN;
	metadata.vers = FANOTIFY_METADATA_VERSION;
	metadata.reserved = 0;
	metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS;
	metadata.pid = pid_vnr(event->pid);
	/*
	 * For an unprivileged listener, event->pid can be used to identify the
	 * events generated by the listener process itself, without disclosing
	 * the pids of other processes.
	 */
	if (FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    task_tgid(current) != event->pid)
		metadata.pid = 0;

	/*
	 * For now, fid mode is required for an unprivileged listener and
	 * fid mode does not report fd in events.  Keep this check anyway
	 * for safety in case fid mode requirement is relaxed in the future
	 * to allow unprivileged listener to get events with no fd and no fid.
	 */
	if (!FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV) &&
	    path && path->mnt && path->dentry) {
		fd = create_fd(group, path, &f);
		/*
		 * Opening an fd from dentry can fail for several reasons.
		 * For example, when tasks are gone and we try to open their
		 * /proc files or we try to open a WRONLY file like in sysfs
		 * or when trying to open a file that was deleted on the
		 * remote network server.
		 *
		 * For a group with FAN_REPORT_FD_ERROR, we will send the
		 * event with the error instead of the open fd, otherwise
		 * Userspace may not get the error at all.
		 * In any case, userspace will not know which file failed to
		 * open, so add a debug print for further investigation.
		 */
		if (fd < 0) {
			pr_debug("fanotify: create_fd(%pd2) failed err=%d\n",
				 path->dentry, fd);
			if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR)) {
				/*
				 * Historically, we've handled EOPENSTALE in a
				 * special way and silently dropped such
				 * events. Now we have to keep it to maintain
				 * backward compatibility...
				 */
				if (fd == -EOPENSTALE)
					fd = 0;
				return fd;
			}
		}
	}
	/* Without FAN_REPORT_FD_ERROR any negative fd is reported as FAN_NOFD. */
	if (FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR))
		metadata.fd = fd;
	else
		metadata.fd = fd >= 0 ? fd : FAN_NOFD;

	if (pidfd_mode) {
		/*
		 * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual
		 * exclusion is ever lifted. At the time of incorporating pidfd
		 * support within fanotify, the pidfd API only supported the
		 * creation of pidfds for thread-group leaders.
		 */
		WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID));

		/*
		 * The PIDTYPE_TGID check for an event->pid is performed
		 * preemptively in an attempt to catch out cases where the event
		 * listener reads events after the event generating process has
		 * already terminated.  Depending on flag FAN_REPORT_FD_ERROR,
		 * report either -ESRCH or FAN_NOPIDFD to the event listener in
		 * those cases with all other pidfd creation errors reported as
		 * the error code itself or as FAN_EPIDFD.
		 */
		if (metadata.pid && pid_has_task(event->pid, PIDTYPE_TGID))
			pidfd = pidfd_prepare(event->pid, 0, &pidfd_file);

		if (!FAN_GROUP_FLAG(group, FAN_REPORT_FD_ERROR) && pidfd < 0)
			pidfd = pidfd == -ESRCH ? FAN_NOPIDFD : FAN_EPIDFD;
	}

	/* Every failure from here on is reported as -EFAULT unless overwritten. */
	ret = -EFAULT;
	/*
	 * Sanity check copy size in case get_one_event() and
	 * event_len sizes ever get out of sync.
	 */
	if (WARN_ON_ONCE(metadata.event_len > count))
		goto out_close_fd;

	if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN))
		goto out_close_fd;

	buf += FAN_EVENT_METADATA_LEN;
	count -= FAN_EVENT_METADATA_LEN;

	if (info_mode) {
		ret = copy_info_records_to_user(event, info, info_mode, pidfd,
						buf, count);
		if (ret < 0)
			goto out_close_fd;
	}

	/* Everything was copied: publish the reserved fds to the listener. */
	if (f)
		fd_install(fd, f);

	if (pidfd_file)
		fd_install(pidfd, pidfd_file);

	/* Remember the fd so that a later response can be matched to this event. */
	if (fanotify_is_perm_event(event->mask))
		FANOTIFY_PERM(event)->fd = fd;

	return metadata.event_len;

out_close_fd:
	/* Undo the fd reservations; the files were never installed. */
	if (f) {
		put_unused_fd(fd);
		fput(f);
	}

	if (pidfd_file) {
		put_unused_fd(pidfd);
		fput(pidfd_file);
	}

	return ret;
}
790 
/* fanotify userspace file descriptor functions */
792 static __poll_t fanotify_poll(struct file *file, poll_table *wait)
793 {
794 	struct fsnotify_group *group = file->private_data;
795 	__poll_t ret = 0;
796 
797 	poll_wait(file, &group->notification_waitq, wait);
798 	spin_lock(&group->notification_lock);
799 	if (!fsnotify_notify_queue_is_empty(group))
800 		ret = EPOLLIN | EPOLLRDNORM;
801 	spin_unlock(&group->notification_lock);
802 
803 	return ret;
804 }
805 
/*
 * read() handler: copy as many queued events as fit into the user buffer.
 *
 * Blocks until at least one event is available unless the file is
 * O_NONBLOCK.  Returns the number of bytes copied if any event was copied
 * (unless the last error was -EFAULT); otherwise -EAGAIN (non-blocking, no
 * event), -ERESTARTSYS (signal pending), -EINVAL (buffer too small for the
 * first event) or the error from copying the event.
 */
static ssize_t fanotify_read(struct file *file, char __user *buf,
			     size_t count, loff_t *pos)
{
	struct fsnotify_group *group;
	struct fanotify_event *event;
	char __user *start;
	int ret;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	start = buf;
	group = file->private_data;

	pr_debug("%s: group=%p\n", __func__, group);

	add_wait_queue(&group->notification_waitq, &wait);
	while (1) {
		/*
		 * User can supply arbitrarily large buffer. Avoid softlockups
		 * in case there are lots of available events.
		 */
		cond_resched();
		event = get_one_event(group, count);
		if (IS_ERR(event)) {
			ret = PTR_ERR(event);
			break;
		}

		if (!event) {
			/* Queue is empty: decide whether to return or to sleep. */
			ret = -EAGAIN;
			if (file->f_flags & O_NONBLOCK)
				break;

			ret = -ERESTARTSYS;
			if (signal_pending(current))
				break;

			/* Something was already copied: return it instead of sleeping. */
			if (start != buf)
				break;

			wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
			continue;
		}

		ret = copy_event_to_user(group, event, buf, count);

		/*
		 * Permission events get queued to wait for response.  Other
		 * events can be destroyed now.
		 */
		if (!fanotify_is_perm_event(event->mask)) {
			fsnotify_destroy_event(group, &event->fse);
		} else {
			if (ret <= 0 || FANOTIFY_PERM(event)->fd < 0) {
				/*
				 * The listener got no usable fd to answer with,
				 * so answer FAN_DENY right away.
				 * finish_permission_event() drops the lock.
				 */
				spin_lock(&group->notification_lock);
				finish_permission_event(group,
					FANOTIFY_PERM(event), FAN_DENY, NULL);
				wake_up(&group->fanotify_data.access_waitq);
			} else {
				/* Park the event until a response is written. */
				spin_lock(&group->notification_lock);
				list_add_tail(&event->fse.list,
					&group->fanotify_data.access_list);
				spin_unlock(&group->notification_lock);
			}
		}
		if (ret < 0)
			break;
		/* ret may be 0 for a silently dropped event; then nothing advances. */
		buf += ret;
		count -= ret;
	}
	remove_wait_queue(&group->notification_waitq, &wait);

	/* Report partial progress, except when the last error was -EFAULT. */
	if (start != buf && ret != -EFAULT)
		ret = buf - start;
	return ret;
}
881 
882 static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
883 {
884 	struct fanotify_response response;
885 	struct fsnotify_group *group;
886 	int ret;
887 	const char __user *info_buf = buf + sizeof(struct fanotify_response);
888 	size_t info_len;
889 
890 	if (!IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
891 		return -EINVAL;
892 
893 	group = file->private_data;
894 
895 	pr_debug("%s: group=%p count=%zu\n", __func__, group, count);
896 
897 	if (count < sizeof(response))
898 		return -EINVAL;
899 
900 	if (copy_from_user(&response, buf, sizeof(response)))
901 		return -EFAULT;
902 
903 	info_len = count - sizeof(response);
904 
905 	ret = process_access_response(group, &response, info_buf, info_len);
906 	if (ret < 0)
907 		count = ret;
908 	else
909 		count = sizeof(response) + ret;
910 
911 	return count;
912 }
913 
/*
 * release() handler: the last reference to the fanotify fd is gone.
 *
 * Stops queueing, answers every outstanding permission event with
 * FAN_ALLOW, destroys all other queued events and finally destroys the
 * group.  Always returns 0.
 */
static int fanotify_release(struct inode *ignored, struct file *file)
{
	struct fsnotify_group *group = file->private_data;
	struct fsnotify_event *fsn_event;

	/*
	 * Stop new events from arriving in the notification queue. since
	 * userspace cannot use fanotify fd anymore, no event can enter or
	 * leave access_list by now either.
	 */
	fsnotify_group_stop_queueing(group);

	/*
	 * Process all permission events on access_list and notification queue
	 * and simulate reply from userspace.
	 */
	spin_lock(&group->notification_lock);
	while (!list_empty(&group->fanotify_data.access_list)) {
		struct fanotify_perm_event *event;

		event = list_first_entry(&group->fanotify_data.access_list,
				struct fanotify_perm_event, fae.fse.list);
		list_del_init(&event->fae.fse.list);
		/* finish_permission_event() drops the lock; retake it to go on. */
		finish_permission_event(group, event, FAN_ALLOW, NULL);
		spin_lock(&group->notification_lock);
	}

	/*
	 * Destroy all non-permission events. For permission events just
	 * dequeue them and set the response. They will be freed once the
	 * response is consumed and fanotify_get_response() returns.
	 */
	while ((fsn_event = fsnotify_remove_first_event(group))) {
		struct fanotify_event *event = FANOTIFY_E(fsn_event);

		/* Both branches leave the lock dropped; it is retaken below. */
		if (!(event->mask & FANOTIFY_PERM_EVENTS)) {
			spin_unlock(&group->notification_lock);
			fsnotify_destroy_event(group, fsn_event);
		} else {
			finish_permission_event(group, FANOTIFY_PERM(event),
						FAN_ALLOW, NULL);
		}
		spin_lock(&group->notification_lock);
	}
	spin_unlock(&group->notification_lock);

	/* Response for all permission events is set, wake up waiters */
	wake_up(&group->fanotify_data.access_waitq);

	/* matches the fanotify_init->fsnotify_alloc_group */
	fsnotify_destroy_group(group);

	return 0;
}
968 
969 static long fanotify_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
970 {
971 	struct fsnotify_group *group;
972 	struct fsnotify_event *fsn_event;
973 	void __user *p;
974 	int ret = -ENOTTY;
975 	size_t send_len = 0;
976 
977 	group = file->private_data;
978 
979 	p = (void __user *) arg;
980 
981 	switch (cmd) {
982 	case FIONREAD:
983 		spin_lock(&group->notification_lock);
984 		list_for_each_entry(fsn_event, &group->notification_list, list)
985 			send_len += FAN_EVENT_METADATA_LEN;
986 		spin_unlock(&group->notification_lock);
987 		ret = put_user(send_len, (int __user *) p);
988 		break;
989 	}
990 
991 	return ret;
992 }
993 
994 static const struct file_operations fanotify_fops = {
995 	.show_fdinfo	= fanotify_show_fdinfo,
996 	.poll		= fanotify_poll,
997 	.read		= fanotify_read,
998 	.write		= fanotify_write,
999 	.fasync		= NULL,
1000 	.release	= fanotify_release,
1001 	.unlocked_ioctl	= fanotify_ioctl,
1002 	.compat_ioctl	= compat_ptr_ioctl,
1003 	.llseek		= noop_llseek,
1004 };
1005 
1006 static int fanotify_find_path(int dfd, const char __user *filename,
1007 			      struct path *path, unsigned int flags, __u64 mask,
1008 			      unsigned int obj_type)
1009 {
1010 	int ret;
1011 
1012 	pr_debug("%s: dfd=%d filename=%p flags=%x\n", __func__,
1013 		 dfd, filename, flags);
1014 
1015 	if (filename == NULL) {
1016 		CLASS(fd, f)(dfd);
1017 
1018 		if (fd_empty(f))
1019 			return -EBADF;
1020 
1021 		if ((flags & FAN_MARK_ONLYDIR) &&
1022 		    !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
1023 			return -ENOTDIR;
1024 
1025 		*path = fd_file(f)->f_path;
1026 		path_get(path);
1027 	} else {
1028 		unsigned int lookup_flags = 0;
1029 
1030 		if (!(flags & FAN_MARK_DONT_FOLLOW))
1031 			lookup_flags |= LOOKUP_FOLLOW;
1032 		if (flags & FAN_MARK_ONLYDIR)
1033 			lookup_flags |= LOOKUP_DIRECTORY;
1034 
1035 		ret = user_path_at(dfd, filename, lookup_flags, path);
1036 		if (ret)
1037 			goto out;
1038 	}
1039 
1040 	/* you can only watch an inode if you have read permissions on it */
1041 	ret = path_permission(path, MAY_READ);
1042 	if (ret) {
1043 		path_put(path);
1044 		goto out;
1045 	}
1046 
1047 	ret = security_path_notify(path, mask, obj_type);
1048 	if (ret)
1049 		path_put(path);
1050 
1051 out:
1052 	return ret;
1053 }
1054 
1055 static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
1056 					    __u32 mask, unsigned int flags,
1057 					    __u32 umask, int *destroy)
1058 {
1059 	__u32 oldmask, newmask;
1060 
1061 	/* umask bits cannot be removed by user */
1062 	mask &= ~umask;
1063 	spin_lock(&fsn_mark->lock);
1064 	oldmask = fsnotify_calc_mask(fsn_mark);
1065 	if (!(flags & FANOTIFY_MARK_IGNORE_BITS)) {
1066 		fsn_mark->mask &= ~mask;
1067 	} else {
1068 		fsn_mark->ignore_mask &= ~mask;
1069 	}
1070 	newmask = fsnotify_calc_mask(fsn_mark);
1071 	/*
1072 	 * We need to keep the mark around even if remaining mask cannot
1073 	 * result in any events (e.g. mask == FAN_ONDIR) to support incremenal
1074 	 * changes to the mask.
1075 	 * Destroy mark when only umask bits remain.
1076 	 */
1077 	*destroy = !((fsn_mark->mask | fsn_mark->ignore_mask) & ~umask);
1078 	spin_unlock(&fsn_mark->lock);
1079 
1080 	return oldmask & ~newmask;
1081 }
1082 
1083 static int fanotify_remove_mark(struct fsnotify_group *group,
1084 				void *obj, unsigned int obj_type, __u32 mask,
1085 				unsigned int flags, __u32 umask)
1086 {
1087 	struct fsnotify_mark *fsn_mark = NULL;
1088 	__u32 removed;
1089 	int destroy_mark;
1090 
1091 	fsnotify_group_lock(group);
1092 	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
1093 	if (!fsn_mark) {
1094 		fsnotify_group_unlock(group);
1095 		return -ENOENT;
1096 	}
1097 
1098 	removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags,
1099 						 umask, &destroy_mark);
1100 	if (removed & fsnotify_conn_mask(fsn_mark->connector))
1101 		fsnotify_recalc_mask(fsn_mark->connector);
1102 	if (destroy_mark)
1103 		fsnotify_detach_mark(fsn_mark);
1104 	fsnotify_group_unlock(group);
1105 	if (destroy_mark)
1106 		fsnotify_free_mark(fsn_mark);
1107 
1108 	/* matches the fsnotify_find_mark() */
1109 	fsnotify_put_mark(fsn_mark);
1110 	return 0;
1111 }
1112 
1113 static bool fanotify_mark_update_flags(struct fsnotify_mark *fsn_mark,
1114 				       unsigned int fan_flags)
1115 {
1116 	bool want_iref = !(fan_flags & FAN_MARK_EVICTABLE);
1117 	unsigned int ignore = fan_flags & FANOTIFY_MARK_IGNORE_BITS;
1118 	bool recalc = false;
1119 
1120 	/*
1121 	 * When using FAN_MARK_IGNORE for the first time, mark starts using
1122 	 * independent event flags in ignore mask.  After that, trying to
1123 	 * update the ignore mask with the old FAN_MARK_IGNORED_MASK API
1124 	 * will result in EEXIST error.
1125 	 */
1126 	if (ignore == FAN_MARK_IGNORE)
1127 		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS;
1128 
1129 	/*
1130 	 * Setting FAN_MARK_IGNORED_SURV_MODIFY for the first time may lead to
1131 	 * the removal of the FS_MODIFY bit in calculated mask if it was set
1132 	 * because of an ignore mask that is now going to survive FS_MODIFY.
1133 	 */
1134 	if (ignore && (fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1135 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) {
1136 		fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
1137 		if (!(fsn_mark->mask & FS_MODIFY))
1138 			recalc = true;
1139 	}
1140 
1141 	if (fsn_mark->connector->type != FSNOTIFY_OBJ_TYPE_INODE ||
1142 	    want_iref == !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
1143 		return recalc;
1144 
1145 	/*
1146 	 * NO_IREF may be removed from a mark, but not added.
1147 	 * When removed, fsnotify_recalc_mask() will take the inode ref.
1148 	 */
1149 	WARN_ON_ONCE(!want_iref);
1150 	fsn_mark->flags &= ~FSNOTIFY_MARK_FLAG_NO_IREF;
1151 
1152 	return true;
1153 }
1154 
1155 static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
1156 				      __u32 mask, unsigned int fan_flags)
1157 {
1158 	bool recalc;
1159 
1160 	spin_lock(&fsn_mark->lock);
1161 	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS))
1162 		fsn_mark->mask |= mask;
1163 	else
1164 		fsn_mark->ignore_mask |= mask;
1165 
1166 	recalc = fsnotify_calc_mask(fsn_mark) &
1167 		~fsnotify_conn_mask(fsn_mark->connector);
1168 
1169 	recalc |= fanotify_mark_update_flags(fsn_mark, fan_flags);
1170 	spin_unlock(&fsn_mark->lock);
1171 
1172 	return recalc;
1173 }
1174 
1175 struct fan_fsid {
1176 	struct super_block *sb;
1177 	__kernel_fsid_t id;
1178 	bool weak;
1179 };
1180 
1181 static int fanotify_set_mark_fsid(struct fsnotify_group *group,
1182 				  struct fsnotify_mark *mark,
1183 				  struct fan_fsid *fsid)
1184 {
1185 	struct fsnotify_mark_connector *conn;
1186 	struct fsnotify_mark *old;
1187 	struct super_block *old_sb = NULL;
1188 
1189 	FANOTIFY_MARK(mark)->fsid = fsid->id;
1190 	mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
1191 	if (fsid->weak)
1192 		mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;
1193 
1194 	/* First mark added will determine if group is single or multi fsid */
1195 	if (list_empty(&group->marks_list))
1196 		return 0;
1197 
1198 	/* Find sb of an existing mark */
1199 	list_for_each_entry(old, &group->marks_list, g_list) {
1200 		conn = READ_ONCE(old->connector);
1201 		if (!conn)
1202 			continue;
1203 		old_sb = fsnotify_connector_sb(conn);
1204 		if (old_sb)
1205 			break;
1206 	}
1207 
1208 	/* Only detached marks left? */
1209 	if (!old_sb)
1210 		return 0;
1211 
1212 	/* Do not allow mixing of marks with weak and strong fsid */
1213 	if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
1214 		return -EXDEV;
1215 
1216 	/* Allow mixing of marks with strong fsid from different fs */
1217 	if (!fsid->weak)
1218 		return 0;
1219 
1220 	/* Do not allow mixing marks with weak fsid from different fs */
1221 	if (old_sb != fsid->sb)
1222 		return -EXDEV;
1223 
1224 	/* Do not allow mixing marks from different btrfs sub-volumes */
1225 	if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
1226 				 &FANOTIFY_MARK(mark)->fsid))
1227 		return -EXDEV;
1228 
1229 	return 0;
1230 }
1231 
1232 static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
1233 						   void *obj,
1234 						   unsigned int obj_type,
1235 						   unsigned int fan_flags,
1236 						   struct fan_fsid *fsid)
1237 {
1238 	struct ucounts *ucounts = group->fanotify_data.ucounts;
1239 	struct fanotify_mark *fan_mark;
1240 	struct fsnotify_mark *mark;
1241 	int ret;
1242 
1243 	/*
1244 	 * Enforce per user marks limits per user in all containing user ns.
1245 	 * A group with FAN_UNLIMITED_MARKS does not contribute to mark count
1246 	 * in the limited groups account.
1247 	 */
1248 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
1249 	    !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
1250 		return ERR_PTR(-ENOSPC);
1251 
1252 	fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
1253 	if (!fan_mark) {
1254 		ret = -ENOMEM;
1255 		goto out_dec_ucounts;
1256 	}
1257 
1258 	mark = &fan_mark->fsn_mark;
1259 	fsnotify_init_mark(mark, group);
1260 	if (fan_flags & FAN_MARK_EVICTABLE)
1261 		mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
1262 
1263 	/* Cache fsid of filesystem containing the marked object */
1264 	if (fsid) {
1265 		ret = fanotify_set_mark_fsid(group, mark, fsid);
1266 		if (ret)
1267 			goto out_put_mark;
1268 	} else {
1269 		fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
1270 	}
1271 
1272 	ret = fsnotify_add_mark_locked(mark, obj, obj_type, 0);
1273 	if (ret)
1274 		goto out_put_mark;
1275 
1276 	return mark;
1277 
1278 out_put_mark:
1279 	fsnotify_put_mark(mark);
1280 out_dec_ucounts:
1281 	if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
1282 		dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
1283 	return ERR_PTR(ret);
1284 }
1285 
1286 static int fanotify_group_init_error_pool(struct fsnotify_group *group)
1287 {
1288 	if (mempool_initialized(&group->fanotify_data.error_events_pool))
1289 		return 0;
1290 
1291 	return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool,
1292 					 FANOTIFY_DEFAULT_FEE_POOL_SIZE,
1293 					 sizeof(struct fanotify_error_event));
1294 }
1295 
1296 static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
1297 					      unsigned int fan_flags)
1298 {
1299 	/*
1300 	 * Non evictable mark cannot be downgraded to evictable mark.
1301 	 */
1302 	if (fan_flags & FAN_MARK_EVICTABLE &&
1303 	    !(fsn_mark->flags & FSNOTIFY_MARK_FLAG_NO_IREF))
1304 		return -EEXIST;
1305 
1306 	/*
1307 	 * New ignore mask semantics cannot be downgraded to old semantics.
1308 	 */
1309 	if (fan_flags & FAN_MARK_IGNORED_MASK &&
1310 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_HAS_IGNORE_FLAGS)
1311 		return -EEXIST;
1312 
1313 	/*
1314 	 * An ignore mask that survives modify could never be downgraded to not
1315 	 * survive modify.  With new FAN_MARK_IGNORE semantics we make that rule
1316 	 * explicit and return an error when trying to update the ignore mask
1317 	 * without the original FAN_MARK_IGNORED_SURV_MODIFY value.
1318 	 */
1319 	if (fan_flags & FAN_MARK_IGNORE &&
1320 	    !(fan_flags & FAN_MARK_IGNORED_SURV_MODIFY) &&
1321 	    fsn_mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)
1322 		return -EEXIST;
1323 
1324 	return 0;
1325 }
1326 
1327 static int fanotify_add_mark(struct fsnotify_group *group,
1328 			     void *obj, unsigned int obj_type,
1329 			     __u32 mask, unsigned int fan_flags,
1330 			     struct fan_fsid *fsid)
1331 {
1332 	struct fsnotify_mark *fsn_mark;
1333 	bool recalc;
1334 	int ret = 0;
1335 
1336 	fsnotify_group_lock(group);
1337 	fsn_mark = fsnotify_find_mark(obj, obj_type, group);
1338 	if (!fsn_mark) {
1339 		fsn_mark = fanotify_add_new_mark(group, obj, obj_type,
1340 						 fan_flags, fsid);
1341 		if (IS_ERR(fsn_mark)) {
1342 			fsnotify_group_unlock(group);
1343 			return PTR_ERR(fsn_mark);
1344 		}
1345 	}
1346 
1347 	/*
1348 	 * Check if requested mark flags conflict with an existing mark flags.
1349 	 */
1350 	ret = fanotify_may_update_existing_mark(fsn_mark, fan_flags);
1351 	if (ret)
1352 		goto out;
1353 
1354 	/*
1355 	 * Error events are pre-allocated per group, only if strictly
1356 	 * needed (i.e. FAN_FS_ERROR was requested).
1357 	 */
1358 	if (!(fan_flags & FANOTIFY_MARK_IGNORE_BITS) &&
1359 	    (mask & FAN_FS_ERROR)) {
1360 		ret = fanotify_group_init_error_pool(group);
1361 		if (ret)
1362 			goto out;
1363 	}
1364 
1365 	recalc = fanotify_mark_add_to_mask(fsn_mark, mask, fan_flags);
1366 	if (recalc)
1367 		fsnotify_recalc_mask(fsn_mark->connector);
1368 
1369 out:
1370 	fsnotify_group_unlock(group);
1371 
1372 	fsnotify_put_mark(fsn_mark);
1373 	return ret;
1374 }
1375 
1376 static struct fsnotify_event *fanotify_alloc_overflow_event(void)
1377 {
1378 	struct fanotify_event *oevent;
1379 
1380 	oevent = kmalloc(sizeof(*oevent), GFP_KERNEL_ACCOUNT);
1381 	if (!oevent)
1382 		return NULL;
1383 
1384 	fanotify_init_event(oevent, 0, FS_Q_OVERFLOW);
1385 	oevent->type = FANOTIFY_EVENT_TYPE_OVERFLOW;
1386 
1387 	return &oevent->fse;
1388 }
1389 
1390 static struct hlist_head *fanotify_alloc_merge_hash(void)
1391 {
1392 	struct hlist_head *hash;
1393 
1394 	hash = kmalloc(sizeof(struct hlist_head) << FANOTIFY_HTABLE_BITS,
1395 		       GFP_KERNEL_ACCOUNT);
1396 	if (!hash)
1397 		return NULL;
1398 
1399 	__hash_init(hash, FANOTIFY_HTABLE_SIZE);
1400 
1401 	return hash;
1402 }
1403 
1404 /* fanotify syscalls */
1405 SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
1406 {
1407 	struct fsnotify_group *group;
1408 	int f_flags, fd;
1409 	unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
1410 	unsigned int class = flags & FANOTIFY_CLASS_BITS;
1411 	unsigned int internal_flags = 0;
1412 
1413 	pr_debug("%s: flags=%x event_f_flags=%x\n",
1414 		 __func__, flags, event_f_flags);
1415 
1416 	if (!capable(CAP_SYS_ADMIN)) {
1417 		/*
1418 		 * An unprivileged user can setup an fanotify group with
1419 		 * limited functionality - an unprivileged group is limited to
1420 		 * notification events with file handles and it cannot use
1421 		 * unlimited queue/marks.
1422 		 */
1423 		if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
1424 			return -EPERM;
1425 
1426 		/*
1427 		 * Setting the internal flag FANOTIFY_UNPRIV on the group
1428 		 * prevents setting mount/filesystem marks on this group and
1429 		 * prevents reporting pid and open fd in events.
1430 		 */
1431 		internal_flags |= FANOTIFY_UNPRIV;
1432 	}
1433 
1434 #ifdef CONFIG_AUDITSYSCALL
1435 	if (flags & ~(FANOTIFY_INIT_FLAGS | FAN_ENABLE_AUDIT))
1436 #else
1437 	if (flags & ~FANOTIFY_INIT_FLAGS)
1438 #endif
1439 		return -EINVAL;
1440 
1441 	/*
1442 	 * A pidfd can only be returned for a thread-group leader; thus
1443 	 * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually
1444 	 * exclusive.
1445 	 */
1446 	if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID))
1447 		return -EINVAL;
1448 
1449 	if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS)
1450 		return -EINVAL;
1451 
1452 	switch (event_f_flags & O_ACCMODE) {
1453 	case O_RDONLY:
1454 	case O_RDWR:
1455 	case O_WRONLY:
1456 		break;
1457 	default:
1458 		return -EINVAL;
1459 	}
1460 
1461 	if (fid_mode && class != FAN_CLASS_NOTIF)
1462 		return -EINVAL;
1463 
1464 	/*
1465 	 * Child name is reported with parent fid so requires dir fid.
1466 	 * We can report both child fid and dir fid with or without name.
1467 	 */
1468 	if ((fid_mode & FAN_REPORT_NAME) && !(fid_mode & FAN_REPORT_DIR_FID))
1469 		return -EINVAL;
1470 
1471 	/*
1472 	 * FAN_REPORT_TARGET_FID requires FAN_REPORT_NAME and FAN_REPORT_FID
1473 	 * and is used as an indication to report both dir and child fid on all
1474 	 * dirent events.
1475 	 */
1476 	if ((fid_mode & FAN_REPORT_TARGET_FID) &&
1477 	    (!(fid_mode & FAN_REPORT_NAME) || !(fid_mode & FAN_REPORT_FID)))
1478 		return -EINVAL;
1479 
1480 	f_flags = O_RDWR | __FMODE_NONOTIFY;
1481 	if (flags & FAN_CLOEXEC)
1482 		f_flags |= O_CLOEXEC;
1483 	if (flags & FAN_NONBLOCK)
1484 		f_flags |= O_NONBLOCK;
1485 
1486 	/* fsnotify_alloc_group takes a ref.  Dropped in fanotify_release */
1487 	group = fsnotify_alloc_group(&fanotify_fsnotify_ops,
1488 				     FSNOTIFY_GROUP_USER);
1489 	if (IS_ERR(group)) {
1490 		return PTR_ERR(group);
1491 	}
1492 
1493 	/* Enforce groups limits per user in all containing user ns */
1494 	group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
1495 						  current_euid(),
1496 						  UCOUNT_FANOTIFY_GROUPS);
1497 	if (!group->fanotify_data.ucounts) {
1498 		fd = -EMFILE;
1499 		goto out_destroy_group;
1500 	}
1501 
1502 	group->fanotify_data.flags = flags | internal_flags;
1503 	group->memcg = get_mem_cgroup_from_mm(current->mm);
1504 
1505 	group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
1506 	if (!group->fanotify_data.merge_hash) {
1507 		fd = -ENOMEM;
1508 		goto out_destroy_group;
1509 	}
1510 
1511 	group->overflow_event = fanotify_alloc_overflow_event();
1512 	if (unlikely(!group->overflow_event)) {
1513 		fd = -ENOMEM;
1514 		goto out_destroy_group;
1515 	}
1516 
1517 	if (force_o_largefile())
1518 		event_f_flags |= O_LARGEFILE;
1519 	group->fanotify_data.f_flags = event_f_flags;
1520 	init_waitqueue_head(&group->fanotify_data.access_waitq);
1521 	INIT_LIST_HEAD(&group->fanotify_data.access_list);
1522 	switch (class) {
1523 	case FAN_CLASS_NOTIF:
1524 		group->priority = FSNOTIFY_PRIO_NORMAL;
1525 		break;
1526 	case FAN_CLASS_CONTENT:
1527 		group->priority = FSNOTIFY_PRIO_CONTENT;
1528 		break;
1529 	case FAN_CLASS_PRE_CONTENT:
1530 		group->priority = FSNOTIFY_PRIO_PRE_CONTENT;
1531 		break;
1532 	default:
1533 		fd = -EINVAL;
1534 		goto out_destroy_group;
1535 	}
1536 
1537 	if (flags & FAN_UNLIMITED_QUEUE) {
1538 		fd = -EPERM;
1539 		if (!capable(CAP_SYS_ADMIN))
1540 			goto out_destroy_group;
1541 		group->max_events = UINT_MAX;
1542 	} else {
1543 		group->max_events = fanotify_max_queued_events;
1544 	}
1545 
1546 	if (flags & FAN_UNLIMITED_MARKS) {
1547 		fd = -EPERM;
1548 		if (!capable(CAP_SYS_ADMIN))
1549 			goto out_destroy_group;
1550 	}
1551 
1552 	if (flags & FAN_ENABLE_AUDIT) {
1553 		fd = -EPERM;
1554 		if (!capable(CAP_AUDIT_WRITE))
1555 			goto out_destroy_group;
1556 	}
1557 
1558 	fd = anon_inode_getfd("[fanotify]", &fanotify_fops, group, f_flags);
1559 	if (fd < 0)
1560 		goto out_destroy_group;
1561 
1562 	return fd;
1563 
1564 out_destroy_group:
1565 	fsnotify_destroy_group(group);
1566 	return fd;
1567 }
1568 
1569 static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
1570 			      struct fan_fsid *fsid)
1571 {
1572 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1573 	__kernel_fsid_t root_fsid;
1574 	int err;
1575 
1576 	/*
1577 	 * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
1578 	 */
1579 	err = vfs_get_fsid(dentry, &fsid->id);
1580 	if (err)
1581 		return err;
1582 
1583 	fsid->sb = dentry->d_sb;
1584 	if (!fsid->id.val[0] && !fsid->id.val[1]) {
1585 		err = -ENODEV;
1586 		goto weak;
1587 	}
1588 
1589 	/*
1590 	 * Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
1591 	 * which uses a different fsid than sb root.
1592 	 */
1593 	err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid);
1594 	if (err)
1595 		return err;
1596 
1597 	if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
1598 		err = -EXDEV;
1599 		goto weak;
1600 	}
1601 
1602 	fsid->weak = false;
1603 	return 0;
1604 
1605 weak:
1606 	/* Allow weak fsid when marking inodes */
1607 	fsid->weak = true;
1608 	return (mark_type == FAN_MARK_INODE) ? 0 : err;
1609 }
1610 
1611 /* Check if filesystem can encode a unique fid */
1612 static int fanotify_test_fid(struct dentry *dentry, unsigned int flags)
1613 {
1614 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1615 	const struct export_operations *nop = dentry->d_sb->s_export_op;
1616 
1617 	/*
1618 	 * We need to make sure that the filesystem supports encoding of
1619 	 * file handles so user can use name_to_handle_at() to compare fids
1620 	 * reported with events to the file handle of watched objects.
1621 	 */
1622 	if (!exportfs_can_encode_fid(nop))
1623 		return -EOPNOTSUPP;
1624 
1625 	/*
1626 	 * For sb/mount mark, we also need to make sure that the filesystem
1627 	 * supports decoding file handles, so user has a way to map back the
1628 	 * reported fids to filesystem objects.
1629 	 */
1630 	if (mark_type != FAN_MARK_INODE && !exportfs_can_decode_fh(nop))
1631 		return -EOPNOTSUPP;
1632 
1633 	return 0;
1634 }
1635 
1636 static int fanotify_events_supported(struct fsnotify_group *group,
1637 				     const struct path *path, __u64 mask,
1638 				     unsigned int flags)
1639 {
1640 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1641 	/* Strict validation of events in non-dir inode mask with v5.17+ APIs */
1642 	bool strict_dir_events = FAN_GROUP_FLAG(group, FAN_REPORT_TARGET_FID) ||
1643 				 (mask & FAN_RENAME) ||
1644 				 (flags & FAN_MARK_IGNORE);
1645 
1646 	/*
1647 	 * Some filesystems such as 'proc' acquire unusual locks when opening
1648 	 * files. For them fanotify permission events have high chances of
1649 	 * deadlocking the system - open done when reporting fanotify event
1650 	 * blocks on this "unusual" lock while another process holding the lock
1651 	 * waits for fanotify permission event to be answered. Just disallow
1652 	 * permission events for such filesystems.
1653 	 */
1654 	if (mask & FANOTIFY_PERM_EVENTS &&
1655 	    path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
1656 		return -EINVAL;
1657 
1658 	/*
1659 	 * mount and sb marks are not allowed on kernel internal pseudo fs,
1660 	 * like pipe_mnt, because that would subscribe to events on all the
1661 	 * anonynous pipes in the system.
1662 	 *
1663 	 * SB_NOUSER covers all of the internal pseudo fs whose objects are not
1664 	 * exposed to user's mount namespace, but there are other SB_KERNMOUNT
1665 	 * fs, like nsfs, debugfs, for which the value of allowing sb and mount
1666 	 * mark is questionable. For now we leave them alone.
1667 	 */
1668 	if (mark_type != FAN_MARK_INODE &&
1669 	    path->mnt->mnt_sb->s_flags & SB_NOUSER)
1670 		return -EINVAL;
1671 
1672 	/*
1673 	 * We shouldn't have allowed setting dirent events and the directory
1674 	 * flags FAN_ONDIR and FAN_EVENT_ON_CHILD in mask of non-dir inode,
1675 	 * but because we always allowed it, error only when using new APIs.
1676 	 */
1677 	if (strict_dir_events && mark_type == FAN_MARK_INODE &&
1678 	    !d_is_dir(path->dentry) && (mask & FANOTIFY_DIRONLY_EVENT_BITS))
1679 		return -ENOTDIR;
1680 
1681 	return 0;
1682 }
1683 
/*
 * do_fanotify_mark - common implementation behind the fanotify_mark()
 * syscall entry points.
 * @fanotify_fd: fd of the fanotify group; its f_op must be fanotify_fops.
 * @flags: FAN_MARK_* command, mark type and modifier bits.
 * @mask: event mask; only the lower 32 bits may be set.
 * @dfd: directory fd that fanotify_find_path() resolves @pathname against,
 *       or the marked object itself when @pathname is NULL.
 * @pathname: user pointer to the path of the object to mark, or NULL.
 *
 * Validates @flags and @mask against each other and against the group,
 * handles FAN_MARK_FLUSH without a path lookup, and otherwise resolves the
 * object and adds bits to or removes bits from the group's mark on it.
 *
 * Returns 0 on success or a negative errno.
 */
1684 static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
1685 			    int dfd, const char  __user *pathname)
1686 {
1687 	struct inode *inode = NULL;
1688 	struct vfsmount *mnt = NULL;
1689 	struct fsnotify_group *group;
1690 	struct path path;
1691 	struct fan_fsid __fsid, *fsid = NULL;
1692 	u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
1693 	unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
1694 	unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
1695 	unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS;
1696 	unsigned int obj_type, fid_mode;
1697 	void *obj;
1698 	u32 umask = 0;
1699 	int ret;
1700 
1701 	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
1702 		 __func__, fanotify_fd, flags, dfd, pathname, mask);
1703 
1704 	/* we only use the lower 32 bits as of right now. */
1705 	if (upper_32_bits(mask))
1706 		return -EINVAL;
1707 
1708 	if (flags & ~FANOTIFY_MARK_FLAGS)
1709 		return -EINVAL;
1710 
	/* Translate the FAN_MARK_* type bits into the fsnotify object type */
1711 	switch (mark_type) {
1712 	case FAN_MARK_INODE:
1713 		obj_type = FSNOTIFY_OBJ_TYPE_INODE;
1714 		break;
1715 	case FAN_MARK_MOUNT:
1716 		obj_type = FSNOTIFY_OBJ_TYPE_VFSMOUNT;
1717 		break;
1718 	case FAN_MARK_FILESYSTEM:
1719 		obj_type = FSNOTIFY_OBJ_TYPE_SB;
1720 		break;
1721 	default:
1722 		return -EINVAL;
1723 	}
1724 
	/*
	 * Exactly one command is accepted.  ADD and REMOVE need a non-empty
	 * mask; FLUSH accepts no flags other than the mark type.
	 */
1725 	switch (mark_cmd) {
1726 	case FAN_MARK_ADD:
1727 	case FAN_MARK_REMOVE:
1728 		if (!mask)
1729 			return -EINVAL;
1730 		break;
1731 	case FAN_MARK_FLUSH:
1732 		if (flags & ~(FANOTIFY_MARK_TYPE_BITS | FAN_MARK_FLUSH))
1733 			return -EINVAL;
1734 		break;
1735 	default:
1736 		return -EINVAL;
1737 	}
1738 
	/* Permission events are valid only when support for them is built in */
1739 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS))
1740 		valid_mask |= FANOTIFY_PERM_EVENTS;
1741 
1742 	if (mask & ~valid_mask)
1743 		return -EINVAL;
1744 
1745 
1746 	/* We don't allow FAN_MARK_IGNORE & FAN_MARK_IGNORED_MASK together */
1747 	if (ignore == (FAN_MARK_IGNORE | FAN_MARK_IGNORED_MASK))
1748 		return -EINVAL;
1749 
1750 	/*
1751 	 * Event flags (FAN_ONDIR, FAN_EVENT_ON_CHILD) have no effect with
1752 	 * FAN_MARK_IGNORED_MASK.
1753 	 */
1754 	if (ignore == FAN_MARK_IGNORED_MASK) {
1755 		mask &= ~FANOTIFY_EVENT_FLAGS;
1756 		umask = FANOTIFY_EVENT_FLAGS;
1757 	}
1758 
1759 	CLASS(fd, f)(fanotify_fd);
1760 	if (fd_empty(f))
1761 		return -EBADF;
1762 
1763 	/* verify that this is indeed an fanotify instance */
1764 	if (unlikely(fd_file(f)->f_op != &fanotify_fops))
1765 		return -EINVAL;
1766 	group = fd_file(f)->private_data;
1767 
1768 	/*
1769 	 * An unprivileged user is not allowed to setup mount nor filesystem
1770 	 * marks.  This also includes setting up such marks by a group that
1771 	 * was initialized by an unprivileged user.
1772 	 */
1773 	if ((!capable(CAP_SYS_ADMIN) ||
1774 	     FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
1775 	    mark_type != FAN_MARK_INODE)
1776 		return -EPERM;
1777 
1778 	/*
1779 	 * Permission events require minimum priority FAN_CLASS_CONTENT.
1780 	 */
1781 	if (mask & FANOTIFY_PERM_EVENTS &&
1782 	    group->priority < FSNOTIFY_PRIO_CONTENT)
1783 		return -EINVAL;
1784 
	/* FAN_FS_ERROR may only be requested on a filesystem mark */
1785 	if (mask & FAN_FS_ERROR &&
1786 	    mark_type != FAN_MARK_FILESYSTEM)
1787 		return -EINVAL;
1788 
1789 	/*
1790 	 * Evictable is only relevant for inode marks, because only inode object
1791 	 * can be evicted on memory pressure.
1792 	 */
1793 	if (flags & FAN_MARK_EVICTABLE &&
1794 	     mark_type != FAN_MARK_INODE)
1795 		return -EINVAL;
1796 
1797 	/*
1798 	 * Events that do not carry enough information to report
1799 	 * event->fd require a group that supports reporting fid.  Those
1800 	 * events are not supported on a mount mark, because they do not
1801 	 * carry enough information (i.e. path) to be filtered by mount
1802 	 * point.
1803 	 */
1804 	fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
1805 	if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
1806 	    (!fid_mode || mark_type == FAN_MARK_MOUNT))
1807 		return -EINVAL;
1808 
1809 	/*
1810 	 * FAN_RENAME uses special info type records to report the old and
1811 	 * new parent+name.  Reporting only old and new parent id is less
1812 	 * useful and was not implemented.
1813 	 */
1814 	if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
1815 		return -EINVAL;
1816 
	/*
	 * FAN_MARK_FLUSH is handled before any path lookup: it calls the
	 * fsnotify_clear_*_marks_by_group() helper matching the mark type
	 * and returns.
	 */
1817 	if (mark_cmd == FAN_MARK_FLUSH) {
1818 		if (mark_type == FAN_MARK_MOUNT)
1819 			fsnotify_clear_vfsmount_marks_by_group(group);
1820 		else if (mark_type == FAN_MARK_FILESYSTEM)
1821 			fsnotify_clear_sb_marks_by_group(group);
1822 		else
1823 			fsnotify_clear_inode_marks_by_group(group);
1824 		return 0;
1825 	}
1826 
	/* On success a reference on path is held until path_put_and_out */
1827 	ret = fanotify_find_path(dfd, pathname, &path, flags,
1828 			(mask & ALL_FSNOTIFY_EVENTS), obj_type);
1829 	if (ret)
1830 		return ret;
1831 
1832 	if (mark_cmd == FAN_MARK_ADD) {
1833 		ret = fanotify_events_supported(group, &path, mask, flags);
1834 		if (ret)
1835 			goto path_put_and_out;
1836 	}
1837 
	/* Groups reporting fid need a usable fsid and file handle support */
1838 	if (fid_mode) {
1839 		ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
1840 		if (ret)
1841 			goto path_put_and_out;
1842 
1843 		ret = fanotify_test_fid(path.dentry, flags);
1844 		if (ret)
1845 			goto path_put_and_out;
1846 
1847 		fsid = &__fsid;
1848 	}
1849 
1850 	/* inode held in place by reference to path; group by fget on fd */
1851 	if (mark_type == FAN_MARK_INODE) {
1852 		inode = path.dentry->d_inode;
1853 		obj = inode;
1854 	} else {
1855 		mnt = path.mnt;
1856 		if (mark_type == FAN_MARK_MOUNT)
1857 			obj = mnt;
1858 		else
1859 			obj = mnt->mnt_sb;
1860 	}
1861 
1862 	/*
1863 	 * If some other task has this inode open for write we should not add
1864 	 * an ignore mask, unless that ignore mask is supposed to survive
1865 	 * modification changes anyway.
1866 	 */
1867 	if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) &&
1868 	    !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) {
1869 		ret = mnt ? -EINVAL : -EISDIR;
1870 		/* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */
1871 		if (ignore == FAN_MARK_IGNORE &&
1872 		    (mnt || S_ISDIR(inode->i_mode)))
1873 			goto path_put_and_out;
1874 
		/* Report success, but do not add the ignore mask */
1875 		ret = 0;
1876 		if (inode && inode_is_open_for_write(inode))
1877 			goto path_put_and_out;
1878 	}
1879 
1880 	/* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */
1881 	if (mnt || !S_ISDIR(inode->i_mode)) {
1882 		mask &= ~FAN_EVENT_ON_CHILD;
1883 		umask = FAN_EVENT_ON_CHILD;
1884 		/*
1885 		 * If group needs to report parent fid, register for getting
1886 		 * events with parent/name info for non-directory.
1887 		 */
1888 		if ((fid_mode & FAN_REPORT_DIR_FID) &&
1889 		    (flags & FAN_MARK_ADD) && !ignore)
1890 			mask |= FAN_EVENT_ON_CHILD;
1891 	}
1892 
1893 	/* create/update an inode mark */
1894 	switch (mark_cmd) {
1895 	case FAN_MARK_ADD:
1896 		ret = fanotify_add_mark(group, obj, obj_type, mask, flags,
1897 					fsid);
1898 		break;
1899 	case FAN_MARK_REMOVE:
1900 		ret = fanotify_remove_mark(group, obj, obj_type, mask, flags,
1901 					   umask);
1902 		break;
1903 	default:
1904 		ret = -EINVAL;
1905 	}
1906 
1907 path_put_and_out:
1908 	path_put(&path);
1909 	return ret;
1910 }
1911 
1912 #ifndef CONFIG_ARCH_SPLIT_ARG64
1913 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
1914 			      __u64, mask, int, dfd,
1915 			      const char  __user *, pathname)
1916 {
1917 	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
1918 }
1919 #endif
1920 
#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
/*
 * fanotify_mark() entry point built for CONFIG_ARCH_SPLIT_ARG64 or
 * CONFIG_COMPAT: @mask is declared through SC_ARG64() and put back together
 * with SC_VAL64() before everything is handed to do_fanotify_mark().
 */
SYSCALL32_DEFINE6(fanotify_mark,
		  int, fanotify_fd,
		  unsigned int, flags,
		  SC_ARG64(mask),
		  int, dfd,
		  const char __user *, pathname)
{
	__u64 mask64 = SC_VAL64(__u64, mask);

	return do_fanotify_mark(fanotify_fd, flags, mask64, dfd, pathname);
}
#endif /* CONFIG_ARCH_SPLIT_ARG64 || CONFIG_COMPAT */
1931 
1932 /*
1933  * fanotify_user_setup - Our initialization function.  Note that we cannot return
1934  * error because we have compiled-in VFS hooks.  So an (unlikely) failure here
1935  * must result in panic().
1936  */
1937 static int __init fanotify_user_setup(void)
1938 {
1939 	struct sysinfo si;
1940 	int max_marks;
1941 
1942 	si_meminfo(&si);
1943 	/*
1944 	 * Allow up to 1% of addressable memory to be accounted for per user
1945 	 * marks limited to the range [8192, 1048576]. mount and sb marks are
1946 	 * a lot cheaper than inode marks, but there is no reason for a user
1947 	 * to have many of those, so calculate by the cost of inode marks.
1948 	 */
1949 	max_marks = (((si.totalram - si.totalhigh) / 100) << PAGE_SHIFT) /
1950 		    INODE_MARK_COST;
1951 	max_marks = clamp(max_marks, FANOTIFY_OLD_DEFAULT_MAX_MARKS,
1952 				     FANOTIFY_DEFAULT_MAX_USER_MARKS);
1953 
1954 	BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS);
1955 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13);
1956 	BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
1957 
1958 	fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
1959 					 SLAB_PANIC|SLAB_ACCOUNT);
1960 	fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
1961 					       SLAB_PANIC);
1962 	fanotify_path_event_cachep = KMEM_CACHE(fanotify_path_event,
1963 						SLAB_PANIC);
1964 	if (IS_ENABLED(CONFIG_FANOTIFY_ACCESS_PERMISSIONS)) {
1965 		fanotify_perm_event_cachep =
1966 			KMEM_CACHE(fanotify_perm_event, SLAB_PANIC);
1967 	}
1968 
1969 	fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS;
1970 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] =
1971 					FANOTIFY_DEFAULT_MAX_GROUPS;
1972 	init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS] = max_marks;
1973 	fanotify_sysctls_init();
1974 
1975 	return 0;
1976 }
1977 device_initcall(fanotify_user_setup);
1978