xref: /linux/fs/afs/validation.c (revision f728c17fc97aea7a33151d9ba64106291c62bb02)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* vnode and volume validity verification.
3  *
4  * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
5  * Written by David Howells (dhowells@redhat.com)
6  */
7 
8 #include <linux/kernel.h>
9 #include <linux/module.h>
10 #include <linux/sched.h>
11 #include "internal.h"
12 
13 /*
14  * Data validation is managed through a number of mechanisms from the server:
15  *
16  *  (1) On first contact with a server (such as if it has just been rebooted),
17  *      the server sends us a CB.InitCallBackState* request.
18  *
19  *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
20  *      calls, the server maintains a time-limited per-vnode promise that it
21  *      will send us a CB.CallBack request if a third party alters the vnodes
22  *      accessed.
23  *
24  *      Note that vnode-level callbacks may also be sent for other reasons,
25  *      such as filelock release.
26  *
27  *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
28  *      calls, each server maintains a time-limited per-volume promise that it
29  *      will send us a CB.CallBack request if the RO volume is updated to a
30  *      snapshot of the RW volume ("vos release").  This is an atomic event
31  *      that cuts over all instances of the RO volume across multiple servers
32  *      simultaneously.
33  *
34  *	Note that volume-level callbacks may also be sent for other reasons,
35  *	such as the volumeserver taking over control of the volume from the
36  *	fileserver.
37  *
38  *	Note also that each server maintains an independent time limit on an
39  *	independent callback.
40  *
41  *  (4) Certain RPC calls include a volume information record "VolSync" in
42  *      their reply.  This contains a creation date for the volume that should
43  *      remain unchanged for a RW volume (but will be changed if the volume is
44  *      restored from backup) or will be bumped to the time of snapshotting
45  *      when a RO volume is released.
46  *
47  * In order to track these events, the following are provided:
48  *
49  *	->cb_v_break.  A counter of events that might mean that the contents of
50  *	a volume have been altered since we last checked a vnode.
51  *
52  *	->cb_v_check.  A counter of the number of events that we've sent a
53  *	query to the server for.  Everything's up to date if this equals
54  *	cb_v_break.
55  *
56  *	->cb_scrub.  A counter of the number of regression events for which we
57  *	have to completely wipe the cache.
58  *
59  *	->cb_ro_snapshot.  A counter of the number of times that we've
60  *      recognised that a RO volume has been updated.
61  *
62  *	->cb_break.  A counter of events that might mean that the contents of a
63  *      vnode have been altered.
64  *
65  *	->cb_expires_at.  The time at which the callback promise expires or
66  *      AFS_NO_CB_PROMISE if we have no promise.
67  *
68  * The way we manage things is:
69  *
70  *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
71  *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
72  *      volume and volume's server record.
73  *
74  *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
75  *	callback break on all the volumes that have been using that server
76  *	(ie. increment ->cb_v_break and reset ->cb_expires_at).
77  *
78  *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
79  *	vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
80  *	dispatch a work item to unmap all PTEs to the vnode's pagecache to
81  *	force reentry to the filesystem for revalidation.
82  *
83  *  (4) When entering the filesystem, we call afs_validate() to check the
84  *	validity of a vnode.  This first checks to see if ->cb_v_check and
85  *	->cb_v_break match, and if they don't, we lock volume->cb_check_lock
86  *	exclusively and perform an FS.FetchStatus on the vnode.
87  *
88  *	After checking the volume, we check the vnode.  If there's a mismatch
89  *	between the volume counters and the vnode's mirrors of those counters,
90  *	we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
91  *
92  *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
93  *      parsed:
94  *
95  *	(A) If the Creation timestamp has changed on a RW volume or regressed
96  *	    on a RO volume, we try to increment ->cb_scrub; if it advances on a
97  *	    RO volume, we assume "vos release" happened and try to increment
98  *	    ->cb_ro_snapshot.
99  *
100  *      (B) If the Update timestamp has regressed, we try to increment
101  *	    ->cb_scrub.
102  *
103  *      Note that in both of these cases, we only do the increment if we can
104  *      cmpxchg the value of the timestamp from the value we noted before the
105  *      op.  This tries to prevent parallel ops from fighting one another.
106  *
107  *	volume->cb_v_check is then set to ->cb_v_break.
108  *
109  *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
110  *	parsed and used to set the promise in ->cb_expires_at for the vnode,
111  *	the volume and the volume's server record.
112  *
113  *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
114  *      the vnode.
115  */
116 
117 /*
118  * Check the validity of a vnode/inode and its parent volume.
119  */
120 bool afs_check_validity(const struct afs_vnode *vnode)
121 {
122 	const struct afs_volume *volume = vnode->volume;
123 	time64_t deadline = ktime_get_real_seconds() + 10;
124 
125 	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
126 	    atomic64_read(&vnode->cb_expires_at)  <= deadline ||
127 	    volume->cb_expires_at <= deadline ||
128 	    vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
129 	    vnode->cb_scrub	  != atomic_read(&volume->cb_scrub) ||
130 	    test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
131 		_debug("inval");
132 		return false;
133 	}
134 
135 	return true;
136 }
137 
138 /*
139  * See if the server we've just talked to is currently excluded.
140  */
141 static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
142 {
143 	const struct afs_server_entry *se;
144 	const struct afs_server_list *slist;
145 	bool is_excluded = true;
146 	int i;
147 
148 	rcu_read_lock();
149 
150 	slist = rcu_dereference(volume->servers);
151 	for (i = 0; i < slist->nr_servers; i++) {
152 		se = &slist->servers[i];
153 		if (op->server == se->server) {
154 			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
155 			break;
156 		}
157 	}
158 
159 	rcu_read_unlock();
160 	return is_excluded;
161 }
162 
163 /*
164  * Update the volume's server list when the creation time changes and see if
165  * the server we've just talked to is currently excluded.
166  */
167 static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
168 {
169 	int ret;
170 
171 	if (__afs_is_server_excluded(op, volume))
172 		return 1;
173 
174 	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
175 	ret = afs_check_volume_status(op->volume, op);
176 	if (ret < 0)
177 		return ret;
178 
179 	return __afs_is_server_excluded(op, volume);
180 }
181 
182 /*
183  * Handle a change to the volume creation time in the VolSync record.
184  */
185 static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
186 {
187 	unsigned int snap;
188 	time64_t cur = volume->creation_time;
189 	time64_t old = op->pre_volsync.creation;
190 	time64_t new = op->volsync.creation;
191 	int ret;
192 
193 	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
194 
195 	if (cur == TIME64_MIN) {
196 		volume->creation_time = new;
197 		return 0;
198 	}
199 
200 	if (new == cur)
201 		return 0;
202 
203 	/* Try to advance the creation timestamp from what we had before the
204 	 * operation to what we got back from the server.  This should
205 	 * hopefully ensure that in a race between multiple operations only one
206 	 * of them will do this.
207 	 */
208 	if (cur != old)
209 		return 0;
210 
211 	/* If the creation time changes in an unexpected way, we need to scrub
212 	 * our caches.  For a RW vol, this will only change if the volume is
213 	 * restored from a backup; for a RO/Backup vol, this will advance when
214 	 * the volume is updated to a new snapshot (eg. "vos release").
215 	 */
216 	if (volume->type == AFSVL_RWVOL)
217 		goto regressed;
218 	if (volume->type == AFSVL_BACKVOL) {
219 		if (new < old)
220 			goto regressed;
221 		goto advance;
222 	}
223 
224 	/* We have an RO volume, we need to query the VL server and look at the
225 	 * server flags to see if RW->RO replication is in progress.
226 	 */
227 	ret = afs_is_server_excluded(op, volume);
228 	if (ret < 0)
229 		return ret;
230 	if (ret > 0) {
231 		snap = atomic_read(&volume->cb_ro_snapshot);
232 		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
233 		return ret;
234 	}
235 
236 advance:
237 	snap = atomic_inc_return(&volume->cb_ro_snapshot);
238 	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
239 	volume->creation_time = new;
240 	return 0;
241 
242 regressed:
243 	atomic_inc(&volume->cb_scrub);
244 	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
245 	volume->creation_time = new;
246 	return 0;
247 }
248 
249 /*
250  * Handle a change to the volume update time in the VolSync record.
251  */
252 static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
253 {
254 	enum afs_cb_break_reason reason = afs_cb_break_no_break;
255 	time64_t cur = volume->update_time;
256 	time64_t old = op->pre_volsync.update;
257 	time64_t new = op->volsync.update;
258 
259 	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
260 
261 	if (cur == TIME64_MIN) {
262 		volume->update_time = new;
263 		return;
264 	}
265 
266 	if (new == cur)
267 		return;
268 
269 	/* If the volume update time changes in an unexpected way, we need to
270 	 * scrub our caches.  For a RW vol, this will advance on every
271 	 * modification op; for a RO/Backup vol, this will advance when the
272 	 * volume is updated to a new snapshot (eg. "vos release").
273 	 */
274 	if (new < old)
275 		reason = afs_cb_break_for_update_regress;
276 
277 	/* Try to advance the update timestamp from what we had before the
278 	 * operation to what we got back from the server.  This should
279 	 * hopefully ensure that in a race between multiple operations only one
280 	 * of them will do this.
281 	 */
282 	if (cur == old) {
283 		if (reason == afs_cb_break_for_update_regress) {
284 			atomic_inc(&volume->cb_scrub);
285 			trace_afs_cb_v_break(volume->vid, 0, reason);
286 		}
287 		volume->update_time = new;
288 	}
289 }
290 
291 static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
292 {
293 	int ret = 0;
294 
295 	if (likely(op->volsync.creation == volume->creation_time &&
296 		   op->volsync.update == volume->update_time))
297 		return 0;
298 
299 	mutex_lock(&volume->volsync_lock);
300 	if (op->volsync.creation != volume->creation_time) {
301 		ret = afs_update_volume_creation_time(op, volume);
302 		if (ret < 0)
303 			goto out;
304 	}
305 	if (op->volsync.update != volume->update_time)
306 		afs_update_volume_update_time(op, volume);
307 out:
308 	mutex_unlock(&volume->volsync_lock);
309 	return ret;
310 }
311 
312 /*
313  * Update the state of a volume, including recording the expiration time of the
314  * callback promise.  Returns 1 to redo the operation from the start.
315  */
316 int afs_update_volume_state(struct afs_operation *op)
317 {
318 	struct afs_server_list *slist = op->server_list;
319 	struct afs_server_entry *se = &slist->servers[op->server_index];
320 	struct afs_callback *cb = &op->file[0].scb.callback;
321 	struct afs_volume *volume = op->volume;
322 	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
323 	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
324 	int ret;
325 
326 	_enter("%llx", op->volume->vid);
327 
328 	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
329 		ret = afs_update_volume_times(op, volume);
330 		if (ret != 0) {
331 			_leave(" = %d", ret);
332 			return ret;
333 		}
334 	}
335 
336 	if (op->cb_v_break == cb_v_break &&
337 	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
338 		time64_t expires_at = cb->expires_at;
339 
340 		if (!op->file[0].scb.have_cb)
341 			expires_at = op->file[1].scb.callback.expires_at;
342 
343 		se->cb_expires_at = expires_at;
344 		volume->cb_expires_at = expires_at;
345 	}
346 	if (cb_v_check < op->cb_v_break)
347 		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
348 	return 0;
349 }
350 
351 /*
352  * mark the data attached to an inode as obsolete due to a write on the server
353  * - might also want to ditch all the outstanding writes and dirty pages
354  */
355 static void afs_zap_data(struct afs_vnode *vnode)
356 {
357 	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
358 
359 	afs_invalidate_cache(vnode, 0);
360 
361 	/* nuke all the non-dirty pages that aren't locked, mapped or being
362 	 * written back in a regular file and completely discard the pages in a
363 	 * directory or symlink */
364 	if (S_ISREG(vnode->netfs.inode.i_mode))
365 		invalidate_remote_inode(&vnode->netfs.inode);
366 	else
367 		invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
368 }
369 
370 /*
371  * validate a vnode/inode
372  * - there are several things we need to check
373  *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
374  *     symlink)
375  *   - parent dir metadata changed (security changes)
376  *   - dentry data changed (write, truncate)
377  *   - dentry metadata changed (security changes)
378  */
379 int afs_validate(struct afs_vnode *vnode, struct key *key)
380 {
381 	struct afs_volume *volume = vnode->volume;
382 	unsigned int cb_ro_snapshot, cb_scrub;
383 	time64_t deadline = ktime_get_real_seconds() + 10;
384 	bool zap = false, locked_vol = false;
385 	int ret;
386 
387 	_enter("{v={%llx:%llu} fl=%lx},%x",
388 	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
389 	       key_serial(key));
390 
391 	if (afs_check_validity(vnode))
392 		return 0;
393 
394 	ret = down_write_killable(&vnode->validate_lock);
395 	if (ret < 0)
396 		goto error;
397 
398 	/* Validate a volume after the v_break has changed or the volume
399 	 * callback expired.  We only want to do this once per volume per
400 	 * v_break change.  The actual work will be done when parsing the
401 	 * status fetch reply.
402 	 */
403 	if (volume->cb_expires_at <= deadline ||
404 	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
405 		ret = mutex_lock_interruptible(&volume->cb_check_lock);
406 		if (ret < 0)
407 			goto error_unlock;
408 		locked_vol = true;
409 	}
410 
411 	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
412 	cb_scrub = atomic_read(&volume->cb_scrub);
413 	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
414 	    vnode->cb_scrub	  != cb_scrub)
415 		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
416 
417 	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
418 	    vnode->cb_scrub	  != cb_scrub ||
419 	    volume->cb_expires_at <= deadline ||
420 	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
421 	    atomic64_read(&vnode->cb_expires_at) <= deadline
422 	    ) {
423 		ret = afs_fetch_status(vnode, key, false, NULL);
424 		if (ret < 0) {
425 			if (ret == -ENOENT) {
426 				set_bit(AFS_VNODE_DELETED, &vnode->flags);
427 				ret = -ESTALE;
428 			}
429 			goto error_unlock;
430 		}
431 
432 		_debug("new promise [fl=%lx]", vnode->flags);
433 	}
434 
435 	/* We can drop the volume lock now as. */
436 	if (locked_vol) {
437 		mutex_unlock(&volume->cb_check_lock);
438 		locked_vol = false;
439 	}
440 
441 	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
442 	cb_scrub = atomic_read(&volume->cb_scrub);
443 	_debug("vnode inval %x==%x %x==%x",
444 	       vnode->cb_ro_snapshot, cb_ro_snapshot,
445 	       vnode->cb_scrub, cb_scrub);
446 	if (vnode->cb_scrub != cb_scrub)
447 		zap = true;
448 	vnode->cb_ro_snapshot = cb_ro_snapshot;
449 	vnode->cb_scrub = cb_scrub;
450 
451 	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
452 		_debug("file already deleted");
453 		ret = -ESTALE;
454 		goto error_unlock;
455 	}
456 
457 	/* if the vnode's data version number changed then its contents are
458 	 * different */
459 	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
460 	if (zap)
461 		afs_zap_data(vnode);
462 	up_write(&vnode->validate_lock);
463 	_leave(" = 0");
464 	return 0;
465 
466 error_unlock:
467 	if (locked_vol)
468 		mutex_unlock(&volume->cb_check_lock);
469 	up_write(&vnode->validate_lock);
470 error:
471 	_leave(" = %d", ret);
472 	return ret;
473 }
474