// SPDX-License-Identifier: GPL-2.0-or-later
/* vnode and volume validity verification.
 *
 * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/sched.h>
#include "internal.h"

/*
 * Data validation is managed through a number of mechanisms from the server:
 *
 *  (1) On first contact with a server (such as if it has just been rebooted),
 *      the server sends us a CB.InitCallBackState* request.
 *
 *  (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
 *      calls, the server maintains a time-limited per-vnode promise that it
 *      will send us a CB.CallBack request if a third party alters the vnodes
 *      accessed.
 *
 *      Note that vnode-level callbacks may also be sent for other reasons,
 *      such as filelock release.
 *
 *  (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
 *      calls, each server maintains a time-limited per-volume promise that it
 *      will send us a CB.CallBack request if the RO volume is updated to a
 *      snapshot of the RW volume ("vos release").  This is an atomic event
 *      that cuts over all instances of the RO volume across multiple servers
 *      simultaneously.
 *
 *      Note that volume-level callbacks may also be sent for other reasons,
 *      such as the volumeserver taking over control of the volume from the
 *      fileserver.
 *
 *      Note also that each server maintains an independent time limit on an
 *      independent callback.
 *
 *  (4) Certain RPC calls include a volume information record "VolSync" in
 *      their reply.  This contains a creation date for the volume that should
 *      remain unchanged for a RW volume (but will be changed if the volume is
 *      restored from backup) or will be bumped to the time of snapshotting
 *      when a RO volume is released.
 *
 * In order to track these events, the following are provided:
 *
 *      ->cb_v_break.  A counter of events that might mean that the contents of
 *      a volume have been altered since we last checked a vnode.
 *
 *      ->cb_v_check.  A counter of the number of events that we've sent a
 *      query to the server for.  Everything's up to date if this equals
 *      cb_v_break.
 *
 *      ->cb_scrub.  A counter of the number of regression events for which we
 *      have to completely wipe the cache.
 *
 *      ->cb_ro_snapshot.  A counter of the number of times that we've
 *      recognised that a RO volume has been updated.
 *
 *      ->cb_break.  A counter of events that might mean that the contents of a
 *      vnode have been altered.
 *
 *      ->cb_expires_at.  The time at which the callback promise expires or
 *      AFS_NO_CB_PROMISE if we have no promise.
 *
 * The way we manage things is:
 *
 *  (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
 *      the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
 *      volume and volume's server record.
 *
 *  (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
 *      callback break on all the volumes that have been using that server
 *      (ie. increment ->cb_v_break and reset ->cb_expires_at).
 *
 *  (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
 *      vnode and reset its ->cb_expires_at.  If the vnode is mmapped, we also
 *      dispatch a work item to unmap all PTEs to the vnode's pagecache to
 *      force reentry to the filesystem for revalidation.
 *
 *  (4) When entering the filesystem, we call afs_validate() to check the
 *      validity of a vnode.  This first checks to see if ->cb_v_check and
 *      ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
 *      exclusively and perform an FS.FetchStatus on the vnode.
 *
 *      After checking the volume, we check the vnode.  If there's a mismatch
 *      between the volume counters and the vnode's mirrors of those counters,
 *      we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
 *
 *  (5) When the reply from FS.FetchStatus arrives, the VolSync record is
 *      parsed:
 *
 *      (A) If the Creation timestamp has changed on a RW volume or regressed
 *          on a RO volume, we try to increment ->cb_scrub; if it advances on a
 *          RO volume, we assume "vos release" happened and try to increment
 *          ->cb_ro_snapshot.
 *
 *      (B) If the Update timestamp has regressed, we try to increment
 *          ->cb_scrub.
 *
 *      Note that in both of these cases, we only do the increment if we can
 *      cmpxchg the value of the timestamp from the value we noted before the
 *      op.  This tries to prevent parallel ops from fighting one another.
 *
 *      volume->cb_v_check is then set to ->cb_v_break.
 *
 *  (6) The AFSCallBack record included in the FS.FetchStatus reply is also
 *      parsed and used to set the promise in ->cb_expires_at for the vnode,
 *      the volume and the volume's server record.
 *
 *  (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
 *      the vnode.
 */

/*
 * Check the validity of a vnode/inode and its parent volume.
 */
bool afs_check_validity(const struct afs_vnode *vnode)
{
	const struct afs_volume *volume = vnode->volume;
	enum afs_vnode_invalid_trace trace = afs_vnode_valid_trace;
	time64_t cb_expires_at = atomic64_read(&vnode->cb_expires_at);
	time64_t deadline = ktime_get_real_seconds() + 10;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
		return true;

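	/* Compare the vnode's counters and callback promise against the
	 * volume's; the first mismatch found below is recorded through a
	 * tracepoint and the vnode is reported as needing revalidation.
	 */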
	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break))
		trace = afs_vnode_invalid_trace_cb_v_break;
	else if (cb_expires_at == AFS_NO_CB_PROMISE)
		trace = afs_vnode_invalid_trace_no_cb_promise;
	else if (cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_expired;
	else if (volume->cb_expires_at <= deadline)
		trace = afs_vnode_invalid_trace_vol_expired;
	else if (vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot))
		trace = afs_vnode_invalid_trace_cb_ro_snapshot;
	else if (vnode->cb_scrub != atomic_read(&volume->cb_scrub))
		trace = afs_vnode_invalid_trace_cb_scrub;
	else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
		trace = afs_vnode_invalid_trace_zap_data;
	else
		return true;
	trace_afs_vnode_invalid(vnode, trace);
	return false;
}

/*
 * See if the server we've just talked to is currently excluded.
 */
static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	const struct afs_server_entry *se;
	const struct afs_server_list *slist;
	bool is_excluded = true;
	int i;

	rcu_read_lock();

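	/* Walk the volume's current server list looking for the server this
	 * op just talked to; if it is no longer in the list, treat it as
	 * excluded.
	 */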
	slist = rcu_dereference(volume->servers);
	for (i = 0; i < slist->nr_servers; i++) {
		se = &slist->servers[i];
		if (op->server == se->server) {
			is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
			break;
		}
	}

	rcu_read_unlock();
	return is_excluded;
}

/*
 * Update the volume's server list when the creation time changes and see if
 * the server we've just talked to is currently excluded.
 */
static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
{
	int ret;

	if (__afs_is_server_excluded(op, volume))
		return 1;

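	/* Force a refresh of the volume record from the VL server and then
	 * recheck the exclusion state against the updated server list.
	 */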
	set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
	ret = afs_check_volume_status(op->volume, op);
	if (ret < 0)
		return ret;

	return __afs_is_server_excluded(op, volume);
}

/*
 * Handle a change to the volume creation time in the VolSync record.
 */
static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
{
	unsigned int snap;
	time64_t cur = volume->creation_time;
	time64_t old = op->pre_volsync.creation;
	time64_t new = op->volsync.creation;
	int ret;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->creation_time = new;
		return 0;
	}

	if (new == cur)
		return 0;

	/* Try to advance the creation timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur != old)
		return 0;

	/* If the creation time changes in an unexpected way, we need to scrub
	 * our caches.  For a RW vol, this will only change if the volume is
	 * restored from a backup; for a RO/Backup vol, this will advance when
	 * the volume is updated to a new snapshot (eg. "vos release").
	 */
	if (volume->type == AFSVL_RWVOL)
		goto regressed;
	if (volume->type == AFSVL_BACKVOL) {
		if (new < old)
			goto regressed;
		goto advance;
	}

	/* We have an RO volume, we need to query the VL server and look at the
	 * server flags to see if RW->RO replication is in progress.
	 */
	ret = afs_is_server_excluded(op, volume);
	if (ret < 0)
		return ret;
	if (ret > 0) {
		snap = atomic_read(&volume->cb_ro_snapshot);
		trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
		return ret;
	}

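	/* The creation time moved forwards on a RO/Backup volume: note a new
	 * snapshot.  Vnodes still carrying the old ->cb_ro_snapshot value will
	 * then be revalidated against the server rather than scrubbed.
	 */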
advance:
	snap = atomic_inc_return(&volume->cb_ro_snapshot);
	trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
	volume->creation_time = new;
	return 0;

regressed:
	atomic_inc(&volume->cb_scrub);
	trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
	volume->creation_time = new;
	return 0;
}

/*
 * Handle a change to the volume update time in the VolSync record.
 */
static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
{
	enum afs_cb_break_reason reason = afs_cb_break_no_break;
	time64_t cur = volume->update_time;
	time64_t old = op->pre_volsync.update;
	time64_t new = op->volsync.update;

	_enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);

	if (cur == TIME64_MIN) {
		volume->update_time = new;
		return;
	}

	if (new == cur)
		return;

	/* If the volume update time changes in an unexpected way, we need to
	 * scrub our caches.  For a RW vol, this will advance on every
	 * modification op; for a RO/Backup vol, this will advance when the
	 * volume is updated to a new snapshot (eg. "vos release").
	 */
	if (new < old)
		reason = afs_cb_break_for_update_regress;

	/* Try to advance the update timestamp from what we had before the
	 * operation to what we got back from the server.  This should
	 * hopefully ensure that in a race between multiple operations only one
	 * of them will do this.
	 */
	if (cur == old) {
		if (reason == afs_cb_break_for_update_regress) {
			atomic_inc(&volume->cb_scrub);
			trace_afs_cb_v_break(volume->vid, 0, reason);
		}
		volume->update_time = new;
	}
}

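/*
 * Apply the creation and/or update times from the VolSync record to the
 * volume, serialised against other operations by ->volsync_lock.
 */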
static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
{
	int ret = 0;

	if (likely(op->volsync.creation == volume->creation_time &&
		   op->volsync.update == volume->update_time))
		return 0;

	mutex_lock(&volume->volsync_lock);
	if (op->volsync.creation != volume->creation_time) {
		ret = afs_update_volume_creation_time(op, volume);
		if (ret < 0)
			goto out;
	}
	if (op->volsync.update != volume->update_time)
		afs_update_volume_update_time(op, volume);
out:
	mutex_unlock(&volume->volsync_lock);
	return ret;
}

/*
 * Update the state of a volume, including recording the expiration time of the
 * callback promise.  Returns 1 to redo the operation from the start.
 */
int afs_update_volume_state(struct afs_operation *op)
{
	struct afs_server_list *slist = op->server_list;
	struct afs_server_entry *se = &slist->servers[op->server_index];
	struct afs_callback *cb = &op->file[0].scb.callback;
	struct afs_volume *volume = op->volume;
	unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
	unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
	int ret;

	_enter("%llx", op->volume->vid);

	if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
		ret = afs_update_volume_times(op, volume);
		if (ret != 0) {
			_leave(" = %d", ret);
			return ret;
		}
	}

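	/* Only record the new callback expiry time if no volume-level break
	 * occurred while the op was in flight and at least one of the files
	 * involved actually returned a callback.
	 */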
	if (op->cb_v_break == cb_v_break &&
	    (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
		time64_t expires_at = cb->expires_at;

		if (!op->file[0].scb.have_cb)
			expires_at = op->file[1].scb.callback.expires_at;

		se->cb_expires_at = expires_at;
		volume->cb_expires_at = expires_at;
	}
	if (cb_v_check < op->cb_v_break)
		atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
	return 0;
}

/*
 * mark the data attached to an inode as obsolete due to a write on the server
 * - might also want to ditch all the outstanding writes and dirty pages
 */
static void afs_zap_data(struct afs_vnode *vnode)
{
	_enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);

	afs_invalidate_cache(vnode, 0);

	/* nuke all the non-dirty pages that aren't locked, mapped or being
	 * written back in a regular file and completely discard the pages in a
	 * directory or symlink */
	if (S_ISREG(vnode->netfs.inode.i_mode))
		filemap_invalidate_inode(&vnode->netfs.inode, true, 0, LLONG_MAX);
	else
		filemap_invalidate_inode(&vnode->netfs.inode, false, 0, LLONG_MAX);
}

/*
 * validate a vnode/inode
 * - there are several things we need to check
 *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
 *     symlink)
 *   - parent dir metadata changed (security changes)
 *   - dentry data changed (write, truncate)
 *   - dentry metadata changed (security changes)
 */
int afs_validate(struct afs_vnode *vnode, struct key *key)
{
	struct afs_volume *volume = vnode->volume;
	unsigned int cb_ro_snapshot, cb_scrub;
	time64_t deadline = ktime_get_real_seconds() + 10;
	bool zap = false, locked_vol = false;
	int ret;

	_enter("{v={%llx:%llu} fl=%lx},%x",
	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
	       key_serial(key));

	if (afs_check_validity(vnode))
		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;

	ret = down_write_killable(&vnode->validate_lock);
	if (ret < 0)
		goto error;

	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
		ret = -ESTALE;
		goto error_unlock;
	}

	/* Validate a volume after the v_break has changed or the volume
	 * callback expired.  We only want to do this once per volume per
	 * v_break change.  The actual work will be done when parsing the
	 * status fetch reply.
	 */
	if (volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
		ret = mutex_lock_interruptible(&volume->cb_check_lock);
		if (ret < 0)
			goto error_unlock;
		locked_vol = true;
	}

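	/* If the volume has moved to a new RO snapshot or needs a scrub, zap
	 * all PTEs mapping the vnode's pagecache so that accesses re-enter
	 * the filesystem and get revalidated.
	 */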
	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub	  != cb_scrub)
		unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);

	if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
	    vnode->cb_scrub	  != cb_scrub ||
	    volume->cb_expires_at <= deadline ||
	    atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
	    atomic64_read(&vnode->cb_expires_at) <= deadline
	    ) {
		ret = afs_fetch_status(vnode, key, false, NULL);
		if (ret < 0) {
			if (ret == -ENOENT) {
				set_bit(AFS_VNODE_DELETED, &vnode->flags);
				ret = -ESTALE;
			}
			goto error_unlock;
		}

		_debug("new promise [fl=%lx]", vnode->flags);
	}

	/* We can drop the volume lock now that the volume has been checked. */
	if (locked_vol) {
		mutex_unlock(&volume->cb_check_lock);
		locked_vol = false;
	}

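	/* Re-sample the volume counters now that the status fetch is done and
	 * mirror them into the vnode so that afs_check_validity() compares
	 * against the values we have just validated.
	 */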
	cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
	cb_scrub = atomic_read(&volume->cb_scrub);
	_debug("vnode inval %x==%x %x==%x",
	       vnode->cb_ro_snapshot, cb_ro_snapshot,
	       vnode->cb_scrub, cb_scrub);
	if (vnode->cb_scrub != cb_scrub)
		zap = true;
	vnode->cb_ro_snapshot = cb_ro_snapshot;
	vnode->cb_scrub = cb_scrub;

	/* if the vnode's data version number changed then its contents are
	 * different */
	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
	if (zap)
		afs_zap_data(vnode);
	up_write(&vnode->validate_lock);
	_leave(" = 0");
	return 0;

error_unlock:
	if (locked_vol)
		mutex_unlock(&volume->cb_check_lock);
	up_write(&vnode->validate_lock);
error:
	_leave(" = %d", ret);
	return ret;
}
485