xref: /linux/fs/xfs/libxfs/xfs_defer.c (revision 001821b0e79716c4e17c71d8e053a23599a7a508)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright (C) 2016 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_defer.h"
14 #include "xfs_trans.h"
15 #include "xfs_buf_item.h"
16 #include "xfs_inode.h"
17 #include "xfs_inode_item.h"
18 #include "xfs_trace.h"
19 #include "xfs_icache.h"
20 #include "xfs_log.h"
21 #include "xfs_rmap.h"
22 #include "xfs_refcount.h"
23 #include "xfs_bmap.h"
24 #include "xfs_alloc.h"
25 #include "xfs_buf.h"
26 #include "xfs_da_format.h"
27 #include "xfs_da_btree.h"
28 #include "xfs_attr.h"
29 #include "xfs_trans_priv.h"
30 #include "xfs_exchmaps.h"
31 
32 static struct kmem_cache	*xfs_defer_pending_cache;
33 
34 /*
35  * Deferred Operations in XFS
36  *
37  * Due to the way locking rules work in XFS, certain transactions (block
38  * mapping and unmapping, typically) have permanent reservations so that
39  * we can roll the transaction to adhere to AG locking order rules and
40  * to unlock buffers between metadata updates.  Prior to rmap/reflink,
41  * the mapping code had a mechanism to perform these deferrals for
42  * extents that were going to be freed; this code makes that facility
43  * more generic.
44  *
45  * When adding the reverse mapping and reflink features, it became
46  * necessary to perform complex remapping multi-transactions to comply
47  * with AG locking order rules, and to be able to spread a single
48  * refcount update operation (an operation on an n-block extent can
49  * update as many as n records!) among multiple transactions.  XFS can
50  * roll a transaction to facilitate this, but using this facility
51  * requires us to log "intent" items in case log recovery needs to
52  * redo the operation, and to log "done" items to indicate that redo
53  * is not necessary.
54  *
55  * Deferred work is tracked in xfs_defer_pending items.  Each pending
56  * item tracks one type of deferred work.  Incoming work items (which
57  * have not yet had an intent logged) are attached to a pending item
58  * on the dop_intake list, where they wait for the caller to finish
59  * the deferred operations.
60  *
61  * Finishing a set of deferred operations is an involved process.  To
62  * start, we define "rolling a deferred-op transaction" as follows:
63  *
64  * > For each xfs_defer_pending item on the dop_intake list,
65  *   - Sort the work items in AG order.  XFS locking
66  *     order rules require us to lock buffers in AG order.
67  *   - Create a log intent item for that type.
68  *   - Attach it to the pending item.
69  *   - Move the pending item from the dop_intake list to the
70  *     dop_pending list.
71  * > Roll the transaction.
72  *
73  * NOTE: To avoid exceeding the transaction reservation, we limit the
74  * number of items that we attach to a given xfs_defer_pending.
75  *
76  * The actual finishing process looks like this:
77  *
78  * > For each xfs_defer_pending in the dop_pending list,
79  *   - Roll the deferred-op transaction as above.
80  *   - Create a log done item for that type, and attach it to the
81  *     log intent item.
82  *   - For each work item attached to the log intent item,
83  *     * Perform the described action.
84  *     * Attach the work item to the log done item.
85  *     * If the result of doing the work was -EAGAIN, ->finish_item
86  *       wants a new transaction.  See the "Requesting a Fresh
87  *       Transaction while Finishing Deferred Work" section below for
88  *       details.
89  *
90  * The key here is that we must log an intent item for all pending
91  * work items every time we roll the transaction, and that we must log
92  * a done item as soon as the work is completed.  With this mechanism
93  * we can perform complex remapping operations, chaining intent items
94  * as needed.
95  *
96  * Requesting a Fresh Transaction while Finishing Deferred Work
97  *
98  * If ->finish_item decides that it needs a fresh transaction to
99  * finish the work, it must ask its caller (xfs_defer_finish) for a
100  * continuation.  The most likely cause of this circumstance is the
101  * refcount adjust functions deciding that they've logged enough items
102  * to be at risk of exceeding the transaction reservation.
103  *
104  * To get a fresh transaction, we want to log the existing log done
105  * item to prevent the log intent item from replaying, immediately log
106  * a new log intent item with the unfinished work items, roll the
107  * transaction, and re-call ->finish_item wherever it left off.  The
108  * log done item and the new log intent item must be in the same
109  * transaction or atomicity cannot be guaranteed; defer_finish ensures
110  * that this happens.
111  *
112  * This requires some coordination between ->finish_item and
113  * defer_finish.  Upon deciding to request a new transaction,
114  * ->finish_item should update the current work item to reflect the
115  * unfinished work.  Next, it should reset the log done item's list
116  * count to the number of items finished, and return -EAGAIN.
117  * defer_finish sees the -EAGAIN, logs the new log intent item
118  * with the remaining work items, and leaves the xfs_defer_pending
119  * item at the head of the dop_work queue.  Then it rolls the
120  * transaction and picks up processing where it left off.  It is
121  * required that ->finish_item must be careful to leave enough
122  * transaction reservation to fit the new log intent item.
123  *
124  * This is an example of remapping the extent (E, E+B) into file X at
125  * offset A and dealing with the extent (C, C+B) already being mapped
126  * there:
127  * +-------------------------------------------------+
128  * | Unmap file X startblock C offset A length B     | t0
129  * | Intent to reduce refcount for extent (C, B)     |
130  * | Intent to remove rmap (X, C, A, B)              |
131  * | Intent to free extent (D, 1) (bmbt block)       |
132  * | Intent to map (X, A, B) at startblock E         |
133  * +-------------------------------------------------+
134  * | Map file X startblock E offset A length B       | t1
135  * | Done mapping (X, E, A, B)                       |
136  * | Intent to increase refcount for extent (E, B)   |
137  * | Intent to add rmap (X, E, A, B)                 |
138  * +-------------------------------------------------+
139  * | Reduce refcount for extent (C, B)               | t2
140  * | Done reducing refcount for extent (C, 9)        |
141  * | Intent to reduce refcount for extent (C+9, B-9) |
142  * | (ran out of space after 9 refcount updates)     |
143  * +-------------------------------------------------+
144  * | Reduce refcount for extent (C+9, B-9)           | t3
145  * | Done reducing refcount for extent (C+9, B-9)    |
146  * | Increase refcount for extent (E, B)             |
147  * | Done increasing refcount for extent (E, B)      |
148  * | Intent to free extent (C, B)                    |
149  * | Intent to free extent (F, 1) (refcountbt block) |
150  * | Intent to remove rmap (F, 1, REFC)              |
151  * +-------------------------------------------------+
152  * | Remove rmap (X, C, A, B)                        | t4
153  * | Done removing rmap (X, C, A, B)                 |
154  * | Add rmap (X, E, A, B)                           |
155  * | Done adding rmap (X, E, A, B)                   |
156  * | Remove rmap (F, 1, REFC)                        |
157  * | Done removing rmap (F, 1, REFC)                 |
158  * +-------------------------------------------------+
159  * | Free extent (C, B)                              | t5
160  * | Done freeing extent (C, B)                      |
161  * | Free extent (D, 1)                              |
162  * | Done freeing extent (D, 1)                      |
163  * | Free extent (F, 1)                              |
164  * | Done freeing extent (F, 1)                      |
165  * +-------------------------------------------------+
166  *
167  * If we should crash before t2 commits, log recovery replays
168  * the following intent items:
169  *
170  * - Intent to reduce refcount for extent (C, B)
171  * - Intent to remove rmap (X, C, A, B)
172  * - Intent to free extent (D, 1) (bmbt block)
173  * - Intent to increase refcount for extent (E, B)
174  * - Intent to add rmap (X, E, A, B)
175  *
176  * In the process of recovering, it should also generate and take care
177  * of these intent items:
178  *
179  * - Intent to free extent (C, B)
180  * - Intent to free extent (F, 1) (refcountbt block)
181  * - Intent to remove rmap (F, 1, REFC)
182  *
183  * Note that the continuation requested between t2 and t3 is likely to
184  * reoccur.
185  */
186 STATIC struct xfs_log_item *
187 xfs_defer_barrier_create_intent(
188 	struct xfs_trans		*tp,
189 	struct list_head		*items,
190 	unsigned int			count,
191 	bool				sort)
192 {
193 	return NULL;
194 }
195 
196 STATIC void
197 xfs_defer_barrier_abort_intent(
198 	struct xfs_log_item		*intent)
199 {
200 	/* empty */
201 }
202 
203 STATIC struct xfs_log_item *
204 xfs_defer_barrier_create_done(
205 	struct xfs_trans		*tp,
206 	struct xfs_log_item		*intent,
207 	unsigned int			count)
208 {
209 	return NULL;
210 }
211 
212 STATIC int
213 xfs_defer_barrier_finish_item(
214 	struct xfs_trans		*tp,
215 	struct xfs_log_item		*done,
216 	struct list_head		*item,
217 	struct xfs_btree_cur		**state)
218 {
219 	ASSERT(0);
220 	return -EFSCORRUPTED;
221 }
222 
223 STATIC void
224 xfs_defer_barrier_cancel_item(
225 	struct list_head		*item)
226 {
227 	ASSERT(0);
228 }
229 
230 static const struct xfs_defer_op_type xfs_barrier_defer_type = {
231 	.max_items	= 1,
232 	.create_intent	= xfs_defer_barrier_create_intent,
233 	.abort_intent	= xfs_defer_barrier_abort_intent,
234 	.create_done	= xfs_defer_barrier_create_done,
235 	.finish_item	= xfs_defer_barrier_finish_item,
236 	.cancel_item	= xfs_defer_barrier_cancel_item,
237 };
238 
239 /* Create a log intent done item for a log intent item. */
240 static inline void
241 xfs_defer_create_done(
242 	struct xfs_trans		*tp,
243 	struct xfs_defer_pending	*dfp)
244 {
245 	struct xfs_log_item		*lip;
246 
247 	/* If there is no log intent item, there can be no log done item. */
248 	if (!dfp->dfp_intent)
249 		return;
250 
251 	/*
252 	 * Mark the transaction dirty, even on error. This ensures the
253 	 * transaction is aborted, which:
254 	 *
255 	 * 1.) releases the log intent item and frees the log done item
256 	 * 2.) shuts down the filesystem
257 	 */
258 	tp->t_flags |= XFS_TRANS_DIRTY;
259 	lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
260 	if (!lip)
261 		return;
262 
263 	tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE;
264 	xfs_trans_add_item(tp, lip);
265 	set_bit(XFS_LI_DIRTY, &lip->li_flags);
266 	dfp->dfp_done = lip;
267 }
268 
269 /*
270  * Ensure there's a log intent item associated with this deferred work item if
271  * the operation must be restarted on crash.  Returns 1 if there's a log item;
272  * 0 if there isn't; or a negative errno.
273  */
274 static int
275 xfs_defer_create_intent(
276 	struct xfs_trans		*tp,
277 	struct xfs_defer_pending	*dfp,
278 	bool				sort)
279 {
280 	struct xfs_log_item		*lip;
281 
282 	if (dfp->dfp_intent)
283 		return 1;
284 
285 	lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count,
286 			sort);
287 	if (!lip)
288 		return 0;
289 	if (IS_ERR(lip))
290 		return PTR_ERR(lip);
291 
292 	tp->t_flags |= XFS_TRANS_DIRTY;
293 	xfs_trans_add_item(tp, lip);
294 	set_bit(XFS_LI_DIRTY, &lip->li_flags);
295 	dfp->dfp_intent = lip;
296 	return 1;
297 }
298 
299 /*
300  * For each pending item in the intake list, log its intent item and the
301  * associated extents, then add the entire intake list to the end of
302  * the pending list.
303  *
304  * Returns 1 if at least one log item was associated with the deferred work;
305  * 0 if there are no log items; or a negative errno.
306  */
307 static int
308 xfs_defer_create_intents(
309 	struct xfs_trans		*tp)
310 {
311 	struct xfs_defer_pending	*dfp;
312 	int				ret = 0;
313 
314 	list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
315 		int			ret2;
316 
317 		trace_xfs_defer_create_intent(tp->t_mountp, dfp);
318 		ret2 = xfs_defer_create_intent(tp, dfp, true);
319 		if (ret2 < 0)
320 			return ret2;
321 		ret |= ret2;
322 	}
323 	return ret;
324 }
325 
326 static inline void
327 xfs_defer_pending_abort(
328 	struct xfs_mount		*mp,
329 	struct xfs_defer_pending	*dfp)
330 {
331 	trace_xfs_defer_pending_abort(mp, dfp);
332 
333 	if (dfp->dfp_intent && !dfp->dfp_done) {
334 		dfp->dfp_ops->abort_intent(dfp->dfp_intent);
335 		dfp->dfp_intent = NULL;
336 	}
337 }
338 
339 static inline void
340 xfs_defer_pending_cancel_work(
341 	struct xfs_mount		*mp,
342 	struct xfs_defer_pending	*dfp)
343 {
344 	struct list_head		*pwi;
345 	struct list_head		*n;
346 
347 	trace_xfs_defer_cancel_list(mp, dfp);
348 
349 	list_del(&dfp->dfp_list);
350 	list_for_each_safe(pwi, n, &dfp->dfp_work) {
351 		list_del(pwi);
352 		dfp->dfp_count--;
353 		trace_xfs_defer_cancel_item(mp, dfp, pwi);
354 		dfp->dfp_ops->cancel_item(pwi);
355 	}
356 	ASSERT(dfp->dfp_count == 0);
357 	kmem_cache_free(xfs_defer_pending_cache, dfp);
358 }
359 
360 STATIC void
361 xfs_defer_pending_abort_list(
362 	struct xfs_mount		*mp,
363 	struct list_head		*dop_list)
364 {
365 	struct xfs_defer_pending	*dfp;
366 
367 	/* Abort intent items that don't have a done item. */
368 	list_for_each_entry(dfp, dop_list, dfp_list)
369 		xfs_defer_pending_abort(mp, dfp);
370 }
371 
372 /* Abort all the intents that were committed. */
373 STATIC void
374 xfs_defer_trans_abort(
375 	struct xfs_trans		*tp,
376 	struct list_head		*dop_pending)
377 {
378 	trace_xfs_defer_trans_abort(tp, _RET_IP_);
379 	xfs_defer_pending_abort_list(tp->t_mountp, dop_pending);
380 }
381 
382 /*
383  * Capture resources that the caller said not to release ("held") when the
384  * transaction commits.  Caller is responsible for zero-initializing @dres.
385  */
386 static int
387 xfs_defer_save_resources(
388 	struct xfs_defer_resources	*dres,
389 	struct xfs_trans		*tp)
390 {
391 	struct xfs_buf_log_item		*bli;
392 	struct xfs_inode_log_item	*ili;
393 	struct xfs_log_item		*lip;
394 
395 	BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
396 
397 	list_for_each_entry(lip, &tp->t_items, li_trans) {
398 		switch (lip->li_type) {
399 		case XFS_LI_BUF:
400 			bli = container_of(lip, struct xfs_buf_log_item,
401 					   bli_item);
402 			if (bli->bli_flags & XFS_BLI_HOLD) {
403 				if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
404 					ASSERT(0);
405 					return -EFSCORRUPTED;
406 				}
407 				if (bli->bli_flags & XFS_BLI_ORDERED)
408 					dres->dr_ordered |=
409 							(1U << dres->dr_bufs);
410 				else
411 					xfs_trans_dirty_buf(tp, bli->bli_buf);
412 				dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
413 			}
414 			break;
415 		case XFS_LI_INODE:
416 			ili = container_of(lip, struct xfs_inode_log_item,
417 					   ili_item);
418 			if (ili->ili_lock_flags == 0) {
419 				if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
420 					ASSERT(0);
421 					return -EFSCORRUPTED;
422 				}
423 				xfs_trans_log_inode(tp, ili->ili_inode,
424 						    XFS_ILOG_CORE);
425 				dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
426 			}
427 			break;
428 		default:
429 			break;
430 		}
431 	}
432 
433 	return 0;
434 }
435 
436 /* Attach the held resources to the transaction. */
437 static void
438 xfs_defer_restore_resources(
439 	struct xfs_trans		*tp,
440 	struct xfs_defer_resources	*dres)
441 {
442 	unsigned short			i;
443 
444 	/* Rejoin the joined inodes. */
445 	for (i = 0; i < dres->dr_inos; i++)
446 		xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
447 
448 	/* Rejoin the buffers and dirty them so the log moves forward. */
449 	for (i = 0; i < dres->dr_bufs; i++) {
450 		xfs_trans_bjoin(tp, dres->dr_bp[i]);
451 		if (dres->dr_ordered & (1U << i))
452 			xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
453 		xfs_trans_bhold(tp, dres->dr_bp[i]);
454 	}
455 }
456 
457 /* Roll a transaction so we can do some deferred op processing. */
458 STATIC int
459 xfs_defer_trans_roll(
460 	struct xfs_trans		**tpp)
461 {
462 	struct xfs_defer_resources	dres = { };
463 	int				error;
464 
465 	error = xfs_defer_save_resources(&dres, *tpp);
466 	if (error)
467 		return error;
468 
469 	trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
470 
471 	/*
472 	 * Roll the transaction.  Rolling always given a new transaction (even
473 	 * if committing the old one fails!) to hand back to the caller, so we
474 	 * join the held resources to the new transaction so that we always
475 	 * return with the held resources joined to @tpp, no matter what
476 	 * happened.
477 	 */
478 	error = xfs_trans_roll(tpp);
479 
480 	xfs_defer_restore_resources(*tpp, &dres);
481 
482 	if (error)
483 		trace_xfs_defer_trans_roll_error(*tpp, error);
484 	return error;
485 }
486 
487 /*
488  * Free up any items left in the list.
489  */
490 static void
491 xfs_defer_cancel_list(
492 	struct xfs_mount		*mp,
493 	struct list_head		*dop_list)
494 {
495 	struct xfs_defer_pending	*dfp;
496 	struct xfs_defer_pending	*pli;
497 
498 	/*
499 	 * Free the pending items.  Caller should already have arranged
500 	 * for the intent items to be released.
501 	 */
502 	list_for_each_entry_safe(dfp, pli, dop_list, dfp_list)
503 		xfs_defer_pending_cancel_work(mp, dfp);
504 }
505 
506 static inline void
507 xfs_defer_relog_intent(
508 	struct xfs_trans		*tp,
509 	struct xfs_defer_pending	*dfp)
510 {
511 	struct xfs_log_item		*lip;
512 
513 	xfs_defer_create_done(tp, dfp);
514 
515 	lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done);
516 	if (lip) {
517 		xfs_trans_add_item(tp, lip);
518 		set_bit(XFS_LI_DIRTY, &lip->li_flags);
519 	}
520 	dfp->dfp_done = NULL;
521 	dfp->dfp_intent = lip;
522 }
523 
524 /*
525  * Prevent a log intent item from pinning the tail of the log by logging a
526  * done item to release the intent item; and then log a new intent item.
527  * The caller should provide a fresh transaction and roll it after we're done.
528  */
529 static void
530 xfs_defer_relog(
531 	struct xfs_trans		**tpp,
532 	struct list_head		*dfops)
533 {
534 	struct xlog			*log = (*tpp)->t_mountp->m_log;
535 	struct xfs_defer_pending	*dfp;
536 	xfs_lsn_t			threshold_lsn = NULLCOMMITLSN;
537 
538 
539 	ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
540 
541 	list_for_each_entry(dfp, dfops, dfp_list) {
542 		/*
543 		 * If the log intent item for this deferred op is not a part of
544 		 * the current log checkpoint, relog the intent item to keep
545 		 * the log tail moving forward.  We're ok with this being racy
546 		 * because an incorrect decision means we'll be a little slower
547 		 * at pushing the tail.
548 		 */
549 		if (dfp->dfp_intent == NULL ||
550 		    xfs_log_item_in_current_chkpt(dfp->dfp_intent))
551 			continue;
552 
553 		/*
554 		 * Figure out where we need the tail to be in order to maintain
555 		 * the minimum required free space in the log.  Only sample
556 		 * the log threshold once per call.
557 		 */
558 		if (threshold_lsn == NULLCOMMITLSN) {
559 			threshold_lsn = xlog_grant_push_threshold(log, 0);
560 			if (threshold_lsn == NULLCOMMITLSN)
561 				break;
562 		}
563 		if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
564 			continue;
565 
566 		trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
567 		XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
568 
569 		xfs_defer_relog_intent(*tpp, dfp);
570 	}
571 }
572 
573 /*
574  * Log an intent-done item for the first pending intent, and finish the work
575  * items.
576  */
577 int
578 xfs_defer_finish_one(
579 	struct xfs_trans		*tp,
580 	struct xfs_defer_pending	*dfp)
581 {
582 	const struct xfs_defer_op_type	*ops = dfp->dfp_ops;
583 	struct xfs_btree_cur		*state = NULL;
584 	struct list_head		*li, *n;
585 	int				error;
586 
587 	trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
588 
589 	xfs_defer_create_done(tp, dfp);
590 	list_for_each_safe(li, n, &dfp->dfp_work) {
591 		list_del(li);
592 		dfp->dfp_count--;
593 		trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
594 		error = ops->finish_item(tp, dfp->dfp_done, li, &state);
595 		if (error == -EAGAIN) {
596 			int		ret;
597 
598 			/*
599 			 * Caller wants a fresh transaction; put the work item
600 			 * back on the list and log a new log intent item to
601 			 * replace the old one.  See "Requesting a Fresh
602 			 * Transaction while Finishing Deferred Work" above.
603 			 */
604 			list_add(li, &dfp->dfp_work);
605 			dfp->dfp_count++;
606 			dfp->dfp_done = NULL;
607 			dfp->dfp_intent = NULL;
608 			ret = xfs_defer_create_intent(tp, dfp, false);
609 			if (ret < 0)
610 				error = ret;
611 		}
612 
613 		if (error)
614 			goto out;
615 	}
616 
617 	/* Done with the dfp, free it. */
618 	list_del(&dfp->dfp_list);
619 	kmem_cache_free(xfs_defer_pending_cache, dfp);
620 out:
621 	if (ops->finish_cleanup)
622 		ops->finish_cleanup(tp, state, error);
623 	return error;
624 }
625 
626 /* Move all paused deferred work from @tp to @paused_list. */
627 static void
628 xfs_defer_isolate_paused(
629 	struct xfs_trans		*tp,
630 	struct list_head		*paused_list)
631 {
632 	struct xfs_defer_pending	*dfp;
633 	struct xfs_defer_pending	*pli;
634 
635 	list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) {
636 		if (!(dfp->dfp_flags & XFS_DEFER_PAUSED))
637 			continue;
638 
639 		list_move_tail(&dfp->dfp_list, paused_list);
640 		trace_xfs_defer_isolate_paused(tp->t_mountp, dfp);
641 	}
642 }
643 
644 /*
645  * Finish all the pending work.  This involves logging intent items for
646  * any work items that wandered in since the last transaction roll (if
647  * one has even happened), rolling the transaction, and finishing the
648  * work items in the first item on the logged-and-pending list.
649  *
650  * If an inode is provided, relog it to the new transaction.
651  */
652 int
653 xfs_defer_finish_noroll(
654 	struct xfs_trans		**tp)
655 {
656 	struct xfs_defer_pending	*dfp = NULL;
657 	int				error = 0;
658 	LIST_HEAD(dop_pending);
659 	LIST_HEAD(dop_paused);
660 
661 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
662 
663 	trace_xfs_defer_finish(*tp, _RET_IP_);
664 
665 	/* Until we run out of pending work to finish... */
666 	while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
667 		/*
668 		 * Deferred items that are created in the process of finishing
669 		 * other deferred work items should be queued at the head of
670 		 * the pending list, which puts them ahead of the deferred work
671 		 * that was created by the caller.  This keeps the number of
672 		 * pending work items to a minimum, which decreases the amount
673 		 * of time that any one intent item can stick around in memory,
674 		 * pinning the log tail.
675 		 */
676 		int has_intents = xfs_defer_create_intents(*tp);
677 
678 		xfs_defer_isolate_paused(*tp, &dop_paused);
679 
680 		list_splice_init(&(*tp)->t_dfops, &dop_pending);
681 
682 		if (has_intents < 0) {
683 			error = has_intents;
684 			goto out_shutdown;
685 		}
686 		if (has_intents || dfp) {
687 			error = xfs_defer_trans_roll(tp);
688 			if (error)
689 				goto out_shutdown;
690 
691 			/* Relog intent items to keep the log moving. */
692 			xfs_defer_relog(tp, &dop_pending);
693 			xfs_defer_relog(tp, &dop_paused);
694 
695 			if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
696 				error = xfs_defer_trans_roll(tp);
697 				if (error)
698 					goto out_shutdown;
699 			}
700 		}
701 
702 		dfp = list_first_entry_or_null(&dop_pending,
703 				struct xfs_defer_pending, dfp_list);
704 		if (!dfp)
705 			break;
706 		error = xfs_defer_finish_one(*tp, dfp);
707 		if (error && error != -EAGAIN)
708 			goto out_shutdown;
709 	}
710 
711 	/* Requeue the paused items in the outgoing transaction. */
712 	list_splice_tail_init(&dop_paused, &(*tp)->t_dfops);
713 
714 	trace_xfs_defer_finish_done(*tp, _RET_IP_);
715 	return 0;
716 
717 out_shutdown:
718 	list_splice_tail_init(&dop_paused, &dop_pending);
719 	xfs_defer_trans_abort(*tp, &dop_pending);
720 	xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
721 	trace_xfs_defer_finish_error(*tp, error);
722 	xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
723 	xfs_defer_cancel(*tp);
724 	return error;
725 }
726 
727 int
728 xfs_defer_finish(
729 	struct xfs_trans	**tp)
730 {
731 #ifdef DEBUG
732 	struct xfs_defer_pending *dfp;
733 #endif
734 	int			error;
735 
736 	/*
737 	 * Finish and roll the transaction once more to avoid returning to the
738 	 * caller with a dirty transaction.
739 	 */
740 	error = xfs_defer_finish_noroll(tp);
741 	if (error)
742 		return error;
743 	if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
744 		error = xfs_defer_trans_roll(tp);
745 		if (error) {
746 			xfs_force_shutdown((*tp)->t_mountp,
747 					   SHUTDOWN_CORRUPT_INCORE);
748 			return error;
749 		}
750 	}
751 
752 	/* Reset LOWMODE now that we've finished all the dfops. */
753 #ifdef DEBUG
754 	list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list)
755 		ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
756 #endif
757 	(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
758 	return 0;
759 }
760 
761 void
762 xfs_defer_cancel(
763 	struct xfs_trans	*tp)
764 {
765 	struct xfs_mount	*mp = tp->t_mountp;
766 
767 	trace_xfs_defer_cancel(tp, _RET_IP_);
768 	xfs_defer_trans_abort(tp, &tp->t_dfops);
769 	xfs_defer_cancel_list(mp, &tp->t_dfops);
770 }
771 
772 /*
773  * Return the last pending work item attached to this transaction if it matches
774  * the deferred op type.
775  */
776 static inline struct xfs_defer_pending *
777 xfs_defer_find_last(
778 	struct xfs_trans		*tp,
779 	const struct xfs_defer_op_type	*ops)
780 {
781 	struct xfs_defer_pending	*dfp = NULL;
782 
783 	/* No dfops at all? */
784 	if (list_empty(&tp->t_dfops))
785 		return NULL;
786 
787 	dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending,
788 			dfp_list);
789 
790 	/* Wrong type? */
791 	if (dfp->dfp_ops != ops)
792 		return NULL;
793 	return dfp;
794 }
795 
796 /*
797  * Decide if we can add a deferred work item to the last dfops item attached
798  * to the transaction.
799  */
800 static inline bool
801 xfs_defer_can_append(
802 	struct xfs_defer_pending	*dfp,
803 	const struct xfs_defer_op_type	*ops)
804 {
805 	/* Already logged? */
806 	if (dfp->dfp_intent)
807 		return false;
808 
809 	/* Paused items cannot absorb more work */
810 	if (dfp->dfp_flags & XFS_DEFER_PAUSED)
811 		return NULL;
812 
813 	/* Already full? */
814 	if (ops->max_items && dfp->dfp_count >= ops->max_items)
815 		return false;
816 
817 	return true;
818 }
819 
820 /* Create a new pending item at the end of the transaction list. */
821 static inline struct xfs_defer_pending *
822 xfs_defer_alloc(
823 	struct list_head		*dfops,
824 	const struct xfs_defer_op_type	*ops)
825 {
826 	struct xfs_defer_pending	*dfp;
827 
828 	dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
829 			GFP_KERNEL | __GFP_NOFAIL);
830 	dfp->dfp_ops = ops;
831 	INIT_LIST_HEAD(&dfp->dfp_work);
832 	list_add_tail(&dfp->dfp_list, dfops);
833 
834 	return dfp;
835 }
836 
837 /* Add an item for later deferred processing. */
838 struct xfs_defer_pending *
839 xfs_defer_add(
840 	struct xfs_trans		*tp,
841 	struct list_head		*li,
842 	const struct xfs_defer_op_type	*ops)
843 {
844 	struct xfs_defer_pending	*dfp = NULL;
845 
846 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
847 
848 	dfp = xfs_defer_find_last(tp, ops);
849 	if (!dfp || !xfs_defer_can_append(dfp, ops))
850 		dfp = xfs_defer_alloc(&tp->t_dfops, ops);
851 
852 	xfs_defer_add_item(dfp, li);
853 	trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
854 	return dfp;
855 }
856 
857 /*
858  * Add a defer ops barrier to force two otherwise adjacent deferred work items
859  * to be tracked separately and have separate log items.
860  */
861 void
862 xfs_defer_add_barrier(
863 	struct xfs_trans		*tp)
864 {
865 	struct xfs_defer_pending	*dfp;
866 
867 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
868 
869 	/* If the last defer op added was a barrier, we're done. */
870 	dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type);
871 	if (dfp)
872 		return;
873 
874 	xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type);
875 
876 	trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
877 }
878 
879 /*
880  * Create a pending deferred work item to replay the recovered intent item
881  * and add it to the list.
882  */
883 void
884 xfs_defer_start_recovery(
885 	struct xfs_log_item		*lip,
886 	struct list_head		*r_dfops,
887 	const struct xfs_defer_op_type	*ops)
888 {
889 	struct xfs_defer_pending	*dfp = xfs_defer_alloc(r_dfops, ops);
890 
891 	dfp->dfp_intent = lip;
892 }
893 
/*
 * Cancel a pending item that was created to recover a log intent item.  The
 * intent is aborted if it has no done item, the attached work is cancelled,
 * and @dfp is freed; callers must not touch @dfp once this returns.
 */
void
xfs_defer_cancel_recovery(
	struct xfs_mount		*mp,
	struct xfs_defer_pending	*dfp)
{
	/* Abort first: cancelling the work also frees @dfp. */
	xfs_defer_pending_abort(mp, dfp);
	xfs_defer_pending_cancel_work(mp, dfp);
}
906 
907 /* Replay the deferred work item created from a recovered log intent item. */
908 int
909 xfs_defer_finish_recovery(
910 	struct xfs_mount		*mp,
911 	struct xfs_defer_pending	*dfp,
912 	struct list_head		*capture_list)
913 {
914 	const struct xfs_defer_op_type	*ops = dfp->dfp_ops;
915 	int				error;
916 
917 	/* dfp is freed by recover_work and must not be accessed afterwards */
918 	error = ops->recover_work(dfp, capture_list);
919 	if (error)
920 		trace_xlog_intent_recovery_failed(mp, ops, error);
921 	return error;
922 }
923 
924 /*
925  * Move deferred ops from one transaction to another and reset the source to
926  * initial state. This is primarily used to carry state forward across
927  * transaction rolls with pending dfops.
928  */
929 void
930 xfs_defer_move(
931 	struct xfs_trans	*dtp,
932 	struct xfs_trans	*stp)
933 {
934 	list_splice_init(&stp->t_dfops, &dtp->t_dfops);
935 
936 	/*
937 	 * Low free space mode was historically controlled by a dfops field.
938 	 * This meant that low mode state potentially carried across multiple
939 	 * transaction rolls. Transfer low mode on a dfops move to preserve
940 	 * that behavior.
941 	 */
942 	dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
943 	stp->t_flags &= ~XFS_TRANS_LOWMODE;
944 }
945 
946 /*
947  * Prepare a chain of fresh deferred ops work items to be completed later.  Log
948  * recovery requires the ability to put off until later the actual finishing
949  * work so that it can process unfinished items recovered from the log in
950  * correct order.
951  *
952  * Create and log intent items for all the work that we're capturing so that we
953  * can be assured that the items will get replayed if the system goes down
954  * before log recovery gets a chance to finish the work it put off.  The entire
955  * deferred ops state is transferred to the capture structure and the
956  * transaction is then ready for the caller to commit it.  If there are no
957  * intent items to capture, this function returns NULL.
958  *
959  * If capture_ip is not NULL, the capture structure will obtain an extra
960  * reference to the inode.
961  */
962 static struct xfs_defer_capture *
963 xfs_defer_ops_capture(
964 	struct xfs_trans		*tp)
965 {
966 	struct xfs_defer_capture	*dfc;
967 	unsigned short			i;
968 	int				error;
969 
970 	if (list_empty(&tp->t_dfops))
971 		return NULL;
972 
973 	error = xfs_defer_create_intents(tp);
974 	if (error < 0)
975 		return ERR_PTR(error);
976 
977 	/* Create an object to capture the defer ops. */
978 	dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL);
979 	INIT_LIST_HEAD(&dfc->dfc_list);
980 	INIT_LIST_HEAD(&dfc->dfc_dfops);
981 
982 	/* Move the dfops chain and transaction state to the capture struct. */
983 	list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
984 	dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
985 	tp->t_flags &= ~XFS_TRANS_LOWMODE;
986 
987 	/* Capture the remaining block reservations along with the dfops. */
988 	dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
989 	dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
990 
991 	/* Preserve the log reservation size. */
992 	dfc->dfc_logres = tp->t_log_res;
993 
994 	error = xfs_defer_save_resources(&dfc->dfc_held, tp);
995 	if (error) {
996 		/*
997 		 * Resource capture should never fail, but if it does, we
998 		 * still have to shut down the log and release things
999 		 * properly.
1000 		 */
1001 		xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
1002 	}
1003 
1004 	/*
1005 	 * Grab extra references to the inodes and buffers because callers are
1006 	 * expected to release their held references after we commit the
1007 	 * transaction.
1008 	 */
1009 	for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
1010 		xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL);
1011 		ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
1012 	}
1013 
1014 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
1015 		xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
1016 
1017 	return dfc;
1018 }
1019 
1020 /* Release all resources that we used to capture deferred ops. */
1021 void
1022 xfs_defer_ops_capture_abort(
1023 	struct xfs_mount		*mp,
1024 	struct xfs_defer_capture	*dfc)
1025 {
1026 	unsigned short			i;
1027 
1028 	xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops);
1029 	xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
1030 
1031 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
1032 		xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
1033 
1034 	for (i = 0; i < dfc->dfc_held.dr_inos; i++)
1035 		xfs_irele(dfc->dfc_held.dr_ip[i]);
1036 
1037 	kfree(dfc);
1038 }
1039 
1040 /*
1041  * Capture any deferred ops and commit the transaction.  This is the last step
1042  * needed to finish a log intent item that we recovered from the log.  If any
1043  * of the deferred ops operate on an inode, the caller must pass in that inode
1044  * so that the reference can be transferred to the capture structure.  The
1045  * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
1046  * xfs_defer_ops_continue.
1047  */
1048 int
1049 xfs_defer_ops_capture_and_commit(
1050 	struct xfs_trans		*tp,
1051 	struct list_head		*capture_list)
1052 {
1053 	struct xfs_mount		*mp = tp->t_mountp;
1054 	struct xfs_defer_capture	*dfc;
1055 	int				error;
1056 
1057 	/* If we don't capture anything, commit transaction and exit. */
1058 	dfc = xfs_defer_ops_capture(tp);
1059 	if (IS_ERR(dfc)) {
1060 		xfs_trans_cancel(tp);
1061 		return PTR_ERR(dfc);
1062 	}
1063 	if (!dfc)
1064 		return xfs_trans_commit(tp);
1065 
1066 	/* Commit the transaction and add the capture structure to the list. */
1067 	error = xfs_trans_commit(tp);
1068 	if (error) {
1069 		xfs_defer_ops_capture_abort(mp, dfc);
1070 		return error;
1071 	}
1072 
1073 	list_add_tail(&dfc->dfc_list, capture_list);
1074 	return 0;
1075 }
1076 
1077 /*
1078  * Attach a chain of captured deferred ops to a new transaction and free the
1079  * capture structure.  If an inode was captured, it will be passed back to the
1080  * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
1081  * The caller now owns the inode reference.
1082  */
1083 void
1084 xfs_defer_ops_continue(
1085 	struct xfs_defer_capture	*dfc,
1086 	struct xfs_trans		*tp,
1087 	struct xfs_defer_resources	*dres)
1088 {
1089 	unsigned int			i;
1090 
1091 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1092 	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
1093 
1094 	/* Lock the captured resources to the new transaction. */
1095 	if (dfc->dfc_held.dr_inos > 2) {
1096 		xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos);
1097 		xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos,
1098 				XFS_ILOCK_EXCL);
1099 	} else if (dfc->dfc_held.dr_inos == 2)
1100 		xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
1101 				    dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
1102 	else if (dfc->dfc_held.dr_inos == 1)
1103 		xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
1104 
1105 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
1106 		xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
1107 
1108 	/* Join the captured resources to the new transaction. */
1109 	xfs_defer_restore_resources(tp, &dfc->dfc_held);
1110 	memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
1111 	dres->dr_bufs = 0;
1112 
1113 	/* Move captured dfops chain and state to the transaction. */
1114 	list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
1115 	tp->t_flags |= dfc->dfc_tpflags;
1116 
1117 	kfree(dfc);
1118 }
1119 
1120 /* Release the resources captured and continued during recovery. */
1121 void
1122 xfs_defer_resources_rele(
1123 	struct xfs_defer_resources	*dres)
1124 {
1125 	unsigned short			i;
1126 
1127 	for (i = 0; i < dres->dr_inos; i++) {
1128 		xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
1129 		xfs_irele(dres->dr_ip[i]);
1130 		dres->dr_ip[i] = NULL;
1131 	}
1132 
1133 	for (i = 0; i < dres->dr_bufs; i++) {
1134 		xfs_buf_relse(dres->dr_bp[i]);
1135 		dres->dr_bp[i] = NULL;
1136 	}
1137 
1138 	dres->dr_inos = 0;
1139 	dres->dr_bufs = 0;
1140 	dres->dr_ordered = 0;
1141 }
1142 
1143 static inline int __init
1144 xfs_defer_init_cache(void)
1145 {
1146 	xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
1147 			sizeof(struct xfs_defer_pending),
1148 			0, 0, NULL);
1149 
1150 	return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
1151 }
1152 
1153 static inline void
1154 xfs_defer_destroy_cache(void)
1155 {
1156 	kmem_cache_destroy(xfs_defer_pending_cache);
1157 	xfs_defer_pending_cache = NULL;
1158 }
1159 
1160 /* Set up caches for deferred work items. */
1161 int __init
1162 xfs_defer_init_item_caches(void)
1163 {
1164 	int				error;
1165 
1166 	error = xfs_defer_init_cache();
1167 	if (error)
1168 		return error;
1169 	error = xfs_rmap_intent_init_cache();
1170 	if (error)
1171 		goto err;
1172 	error = xfs_refcount_intent_init_cache();
1173 	if (error)
1174 		goto err;
1175 	error = xfs_bmap_intent_init_cache();
1176 	if (error)
1177 		goto err;
1178 	error = xfs_extfree_intent_init_cache();
1179 	if (error)
1180 		goto err;
1181 	error = xfs_attr_intent_init_cache();
1182 	if (error)
1183 		goto err;
1184 	error = xfs_exchmaps_intent_init_cache();
1185 	if (error)
1186 		goto err;
1187 
1188 	return 0;
1189 err:
1190 	xfs_defer_destroy_item_caches();
1191 	return error;
1192 }
1193 
/*
 * Destroy all the deferred work item caches, if they've been allocated.  The
 * caches are torn down in the reverse of the order in which
 * xfs_defer_init_item_caches() sets them up.
 */
void
xfs_defer_destroy_item_caches(void)
{
	/* Per-type intent caches first ... */
	xfs_exchmaps_intent_destroy_cache();
	xfs_attr_intent_destroy_cache();
	xfs_extfree_intent_destroy_cache();
	xfs_bmap_intent_destroy_cache();
	xfs_refcount_intent_destroy_cache();
	xfs_rmap_intent_destroy_cache();

	/* ... then the xfs_defer_pending cache, which was created first. */
	xfs_defer_destroy_cache();
}
1206 
1207 /*
1208  * Mark a deferred work item so that it will be requeued indefinitely without
1209  * being finished.  Caller must ensure there are no data dependencies on this
1210  * work item in the meantime.
1211  */
1212 void
1213 xfs_defer_item_pause(
1214 	struct xfs_trans		*tp,
1215 	struct xfs_defer_pending	*dfp)
1216 {
1217 	ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED));
1218 
1219 	dfp->dfp_flags |= XFS_DEFER_PAUSED;
1220 
1221 	trace_xfs_defer_item_pause(tp->t_mountp, dfp);
1222 }
1223 
1224 /*
1225  * Release a paused deferred work item so that it will be finished during the
1226  * next transaction roll.
1227  */
1228 void
1229 xfs_defer_item_unpause(
1230 	struct xfs_trans		*tp,
1231 	struct xfs_defer_pending	*dfp)
1232 {
1233 	ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
1234 
1235 	dfp->dfp_flags &= ~XFS_DEFER_PAUSED;
1236 
1237 	trace_xfs_defer_item_unpause(tp->t_mountp, dfp);
1238 }
1239