xref: /linux/fs/xfs/libxfs/xfs_defer.c (revision c532de5a67a70f8533d495f8f2aaa9a0491c3ad0)
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright (C) 2016 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_defer.h"
14 #include "xfs_trans.h"
15 #include "xfs_trans_priv.h"
16 #include "xfs_buf_item.h"
17 #include "xfs_inode.h"
18 #include "xfs_inode_item.h"
19 #include "xfs_trace.h"
20 #include "xfs_icache.h"
21 #include "xfs_log.h"
22 #include "xfs_log_priv.h"
23 #include "xfs_rmap.h"
24 #include "xfs_refcount.h"
25 #include "xfs_bmap.h"
26 #include "xfs_alloc.h"
27 #include "xfs_buf.h"
28 #include "xfs_da_format.h"
29 #include "xfs_da_btree.h"
30 #include "xfs_attr.h"
31 #include "xfs_exchmaps.h"
32 
33 static struct kmem_cache	*xfs_defer_pending_cache;
34 
35 /*
36  * Deferred Operations in XFS
37  *
38  * Due to the way locking rules work in XFS, certain transactions (block
39  * mapping and unmapping, typically) have permanent reservations so that
40  * we can roll the transaction to adhere to AG locking order rules and
41  * to unlock buffers between metadata updates.  Prior to rmap/reflink,
42  * the mapping code had a mechanism to perform these deferrals for
43  * extents that were going to be freed; this code makes that facility
44  * more generic.
45  *
46  * When adding the reverse mapping and reflink features, it became
47  * necessary to perform complex remapping multi-transactions to comply
48  * with AG locking order rules, and to be able to spread a single
49  * refcount update operation (an operation on an n-block extent can
50  * update as many as n records!) among multiple transactions.  XFS can
51  * roll a transaction to facilitate this, but using this facility
52  * requires us to log "intent" items in case log recovery needs to
53  * redo the operation, and to log "done" items to indicate that redo
54  * is not necessary.
55  *
56  * Deferred work is tracked in xfs_defer_pending items.  Each pending
57  * item tracks one type of deferred work.  Incoming work items (which
58  * have not yet had an intent logged) are attached to a pending item
59  * on the dop_intake list, where they wait for the caller to finish
60  * the deferred operations.
61  *
62  * Finishing a set of deferred operations is an involved process.  To
63  * start, we define "rolling a deferred-op transaction" as follows:
64  *
65  * > For each xfs_defer_pending item on the dop_intake list,
66  *   - Sort the work items in AG order.  XFS locking
67  *     order rules require us to lock buffers in AG order.
68  *   - Create a log intent item for that type.
69  *   - Attach it to the pending item.
70  *   - Move the pending item from the dop_intake list to the
71  *     dop_pending list.
72  * > Roll the transaction.
73  *
74  * NOTE: To avoid exceeding the transaction reservation, we limit the
75  * number of items that we attach to a given xfs_defer_pending.
76  *
77  * The actual finishing process looks like this:
78  *
79  * > For each xfs_defer_pending in the dop_pending list,
80  *   - Roll the deferred-op transaction as above.
81  *   - Create a log done item for that type, and attach it to the
82  *     log intent item.
83  *   - For each work item attached to the log intent item,
84  *     * Perform the described action.
85  *     * Attach the work item to the log done item.
86  *     * If the result of doing the work was -EAGAIN, ->finish_item
87  *       wants a new transaction.  See the "Requesting a Fresh
88  *       Transaction while Finishing Deferred Work" section below for
89  *       details.
90  *
91  * The key here is that we must log an intent item for all pending
92  * work items every time we roll the transaction, and that we must log
93  * a done item as soon as the work is completed.  With this mechanism
94  * we can perform complex remapping operations, chaining intent items
95  * as needed.
96  *
97  * Requesting a Fresh Transaction while Finishing Deferred Work
98  *
99  * If ->finish_item decides that it needs a fresh transaction to
100  * finish the work, it must ask its caller (xfs_defer_finish) for a
101  * continuation.  The most likely cause of this circumstance are the
102  * refcount adjust functions deciding that they've logged enough items
103  * to be at risk of exceeding the transaction reservation.
104  *
105  * To get a fresh transaction, we want to log the existing log done
106  * item to prevent the log intent item from replaying, immediately log
107  * a new log intent item with the unfinished work items, roll the
108  * transaction, and re-call ->finish_item wherever it left off.  The
109  * log done item and the new log intent item must be in the same
110  * transaction or atomicity cannot be guaranteed; defer_finish ensures
111  * that this happens.
112  *
113  * This requires some coordination between ->finish_item and
114  * defer_finish.  Upon deciding to request a new transaction,
115  * ->finish_item should update the current work item to reflect the
116  * unfinished work.  Next, it should reset the log done item's list
117  * count to the number of items finished, and return -EAGAIN.
118  * defer_finish sees the -EAGAIN, logs the new log intent item
119  * with the remaining work items, and leaves the xfs_defer_pending
120  * item at the head of the dop_pending list.  Then it rolls the
121  * transaction and picks up processing where it left off.  It is
122  * required that ->finish_item must be careful to leave enough
123  * transaction reservation to fit the new log intent item.
124  *
125  * This is an example of remapping the extent (E, E+B) into file X at
126  * offset A and dealing with the extent (C, C+B) already being mapped
127  * there:
128  * +-------------------------------------------------+
129  * | Unmap file X startblock C offset A length B     | t0
130  * | Intent to reduce refcount for extent (C, B)     |
131  * | Intent to remove rmap (X, C, A, B)              |
132  * | Intent to free extent (D, 1) (bmbt block)       |
133  * | Intent to map (X, A, B) at startblock E         |
134  * +-------------------------------------------------+
135  * | Map file X startblock E offset A length B       | t1
136  * | Done mapping (X, E, A, B)                       |
137  * | Intent to increase refcount for extent (E, B)   |
138  * | Intent to add rmap (X, E, A, B)                 |
139  * +-------------------------------------------------+
140  * | Reduce refcount for extent (C, B)               | t2
141  * | Done reducing refcount for extent (C, 9)        |
142  * | Intent to reduce refcount for extent (C+9, B-9) |
143  * | (ran out of space after 9 refcount updates)     |
144  * +-------------------------------------------------+
145  * | Reduce refcount for extent (C+9, B-9)           | t3
146  * | Done reducing refcount for extent (C+9, B-9)    |
147  * | Increase refcount for extent (E, B)             |
148  * | Done increasing refcount for extent (E, B)      |
149  * | Intent to free extent (C, B)                    |
150  * | Intent to free extent (F, 1) (refcountbt block) |
151  * | Intent to remove rmap (F, 1, REFC)              |
152  * +-------------------------------------------------+
153  * | Remove rmap (X, C, A, B)                        | t4
154  * | Done removing rmap (X, C, A, B)                 |
155  * | Add rmap (X, E, A, B)                           |
156  * | Done adding rmap (X, E, A, B)                   |
157  * | Remove rmap (F, 1, REFC)                        |
158  * | Done removing rmap (F, 1, REFC)                 |
159  * +-------------------------------------------------+
160  * | Free extent (C, B)                              | t5
161  * | Done freeing extent (C, B)                      |
162  * | Free extent (D, 1)                              |
163  * | Done freeing extent (D, 1)                      |
164  * | Free extent (F, 1)                              |
165  * | Done freeing extent (F, 1)                      |
166  * +-------------------------------------------------+
167  *
168  * If we should crash before t2 commits, log recovery replays
169  * the following intent items:
170  *
171  * - Intent to reduce refcount for extent (C, B)
172  * - Intent to remove rmap (X, C, A, B)
173  * - Intent to free extent (D, 1) (bmbt block)
174  * - Intent to increase refcount for extent (E, B)
175  * - Intent to add rmap (X, E, A, B)
176  *
177  * In the process of recovering, it should also generate and take care
178  * of these intent items:
179  *
180  * - Intent to free extent (C, B)
181  * - Intent to free extent (F, 1) (refcountbt block)
182  * - Intent to remove rmap (F, 1, REFC)
183  *
184  * Note that the continuation requested between t2 and t3 is likely to
185  * reoccur.
186  */
187 STATIC struct xfs_log_item *
188 xfs_defer_barrier_create_intent(
189 	struct xfs_trans		*tp,
190 	struct list_head		*items,
191 	unsigned int			count,
192 	bool				sort)
193 {
194 	return NULL;
195 }
196 
197 STATIC void
198 xfs_defer_barrier_abort_intent(
199 	struct xfs_log_item		*intent)
200 {
201 	/* empty */
202 }
203 
204 STATIC struct xfs_log_item *
205 xfs_defer_barrier_create_done(
206 	struct xfs_trans		*tp,
207 	struct xfs_log_item		*intent,
208 	unsigned int			count)
209 {
210 	return NULL;
211 }
212 
213 STATIC int
214 xfs_defer_barrier_finish_item(
215 	struct xfs_trans		*tp,
216 	struct xfs_log_item		*done,
217 	struct list_head		*item,
218 	struct xfs_btree_cur		**state)
219 {
220 	ASSERT(0);
221 	return -EFSCORRUPTED;
222 }
223 
224 STATIC void
225 xfs_defer_barrier_cancel_item(
226 	struct list_head		*item)
227 {
228 	ASSERT(0);
229 }
230 
231 static const struct xfs_defer_op_type xfs_barrier_defer_type = {
232 	.max_items	= 1,
233 	.create_intent	= xfs_defer_barrier_create_intent,
234 	.abort_intent	= xfs_defer_barrier_abort_intent,
235 	.create_done	= xfs_defer_barrier_create_done,
236 	.finish_item	= xfs_defer_barrier_finish_item,
237 	.cancel_item	= xfs_defer_barrier_cancel_item,
238 };
239 
240 /* Create a log intent done item for a log intent item. */
241 static inline void
242 xfs_defer_create_done(
243 	struct xfs_trans		*tp,
244 	struct xfs_defer_pending	*dfp)
245 {
246 	struct xfs_log_item		*lip;
247 
248 	/* If there is no log intent item, there can be no log done item. */
249 	if (!dfp->dfp_intent)
250 		return;
251 
252 	/*
253 	 * Mark the transaction dirty, even on error. This ensures the
254 	 * transaction is aborted, which:
255 	 *
256 	 * 1.) releases the log intent item and frees the log done item
257 	 * 2.) shuts down the filesystem
258 	 */
259 	tp->t_flags |= XFS_TRANS_DIRTY;
260 	lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
261 	if (!lip)
262 		return;
263 
264 	tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE;
265 	xfs_trans_add_item(tp, lip);
266 	set_bit(XFS_LI_DIRTY, &lip->li_flags);
267 	dfp->dfp_done = lip;
268 }
269 
270 /*
271  * Ensure there's a log intent item associated with this deferred work item if
272  * the operation must be restarted on crash.  Returns 1 if there's a log item;
273  * 0 if there isn't; or a negative errno.
274  */
275 static int
276 xfs_defer_create_intent(
277 	struct xfs_trans		*tp,
278 	struct xfs_defer_pending	*dfp,
279 	bool				sort)
280 {
281 	struct xfs_log_item		*lip;
282 
283 	if (dfp->dfp_intent)
284 		return 1;
285 
286 	lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count,
287 			sort);
288 	if (!lip)
289 		return 0;
290 	if (IS_ERR(lip))
291 		return PTR_ERR(lip);
292 
293 	tp->t_flags |= XFS_TRANS_DIRTY;
294 	xfs_trans_add_item(tp, lip);
295 	set_bit(XFS_LI_DIRTY, &lip->li_flags);
296 	dfp->dfp_intent = lip;
297 	return 1;
298 }
299 
300 /*
301  * For each pending item in the intake list, log its intent item and the
302  * associated extents, then add the entire intake list to the end of
303  * the pending list.
304  *
305  * Returns 1 if at least one log item was associated with the deferred work;
306  * 0 if there are no log items; or a negative errno.
307  */
308 static int
309 xfs_defer_create_intents(
310 	struct xfs_trans		*tp)
311 {
312 	struct xfs_defer_pending	*dfp;
313 	int				ret = 0;
314 
315 	list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
316 		int			ret2;
317 
318 		trace_xfs_defer_create_intent(tp->t_mountp, dfp);
319 		ret2 = xfs_defer_create_intent(tp, dfp, true);
320 		if (ret2 < 0)
321 			return ret2;
322 		ret |= ret2;
323 	}
324 	return ret;
325 }
326 
327 static inline void
328 xfs_defer_pending_abort(
329 	struct xfs_mount		*mp,
330 	struct xfs_defer_pending	*dfp)
331 {
332 	trace_xfs_defer_pending_abort(mp, dfp);
333 
334 	if (dfp->dfp_intent && !dfp->dfp_done) {
335 		dfp->dfp_ops->abort_intent(dfp->dfp_intent);
336 		dfp->dfp_intent = NULL;
337 	}
338 }
339 
340 static inline void
341 xfs_defer_pending_cancel_work(
342 	struct xfs_mount		*mp,
343 	struct xfs_defer_pending	*dfp)
344 {
345 	struct list_head		*pwi;
346 	struct list_head		*n;
347 
348 	trace_xfs_defer_cancel_list(mp, dfp);
349 
350 	list_del(&dfp->dfp_list);
351 	list_for_each_safe(pwi, n, &dfp->dfp_work) {
352 		list_del(pwi);
353 		dfp->dfp_count--;
354 		trace_xfs_defer_cancel_item(mp, dfp, pwi);
355 		dfp->dfp_ops->cancel_item(pwi);
356 	}
357 	ASSERT(dfp->dfp_count == 0);
358 	kmem_cache_free(xfs_defer_pending_cache, dfp);
359 }
360 
361 STATIC void
362 xfs_defer_pending_abort_list(
363 	struct xfs_mount		*mp,
364 	struct list_head		*dop_list)
365 {
366 	struct xfs_defer_pending	*dfp;
367 
368 	/* Abort intent items that don't have a done item. */
369 	list_for_each_entry(dfp, dop_list, dfp_list)
370 		xfs_defer_pending_abort(mp, dfp);
371 }
372 
373 /* Abort all the intents that were committed. */
374 STATIC void
375 xfs_defer_trans_abort(
376 	struct xfs_trans		*tp,
377 	struct list_head		*dop_pending)
378 {
379 	trace_xfs_defer_trans_abort(tp, _RET_IP_);
380 	xfs_defer_pending_abort_list(tp->t_mountp, dop_pending);
381 }
382 
383 /*
384  * Capture resources that the caller said not to release ("held") when the
385  * transaction commits.  Caller is responsible for zero-initializing @dres.
386  */
387 static int
388 xfs_defer_save_resources(
389 	struct xfs_defer_resources	*dres,
390 	struct xfs_trans		*tp)
391 {
392 	struct xfs_buf_log_item		*bli;
393 	struct xfs_inode_log_item	*ili;
394 	struct xfs_log_item		*lip;
395 
396 	BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);
397 
398 	list_for_each_entry(lip, &tp->t_items, li_trans) {
399 		switch (lip->li_type) {
400 		case XFS_LI_BUF:
401 			bli = container_of(lip, struct xfs_buf_log_item,
402 					   bli_item);
403 			if (bli->bli_flags & XFS_BLI_HOLD) {
404 				if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
405 					ASSERT(0);
406 					return -EFSCORRUPTED;
407 				}
408 				if (bli->bli_flags & XFS_BLI_ORDERED)
409 					dres->dr_ordered |=
410 							(1U << dres->dr_bufs);
411 				else
412 					xfs_trans_dirty_buf(tp, bli->bli_buf);
413 				dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
414 			}
415 			break;
416 		case XFS_LI_INODE:
417 			ili = container_of(lip, struct xfs_inode_log_item,
418 					   ili_item);
419 			if (ili->ili_lock_flags == 0) {
420 				if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
421 					ASSERT(0);
422 					return -EFSCORRUPTED;
423 				}
424 				xfs_trans_log_inode(tp, ili->ili_inode,
425 						    XFS_ILOG_CORE);
426 				dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
427 			}
428 			break;
429 		default:
430 			break;
431 		}
432 	}
433 
434 	return 0;
435 }
436 
437 /* Attach the held resources to the transaction. */
438 static void
439 xfs_defer_restore_resources(
440 	struct xfs_trans		*tp,
441 	struct xfs_defer_resources	*dres)
442 {
443 	unsigned short			i;
444 
445 	/* Rejoin the joined inodes. */
446 	for (i = 0; i < dres->dr_inos; i++)
447 		xfs_trans_ijoin(tp, dres->dr_ip[i], 0);
448 
449 	/* Rejoin the buffers and dirty them so the log moves forward. */
450 	for (i = 0; i < dres->dr_bufs; i++) {
451 		xfs_trans_bjoin(tp, dres->dr_bp[i]);
452 		if (dres->dr_ordered & (1U << i))
453 			xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
454 		xfs_trans_bhold(tp, dres->dr_bp[i]);
455 	}
456 }
457 
458 /* Roll a transaction so we can do some deferred op processing. */
459 STATIC int
460 xfs_defer_trans_roll(
461 	struct xfs_trans		**tpp)
462 {
463 	struct xfs_defer_resources	dres = { };
464 	int				error;
465 
466 	error = xfs_defer_save_resources(&dres, *tpp);
467 	if (error)
468 		return error;
469 
470 	trace_xfs_defer_trans_roll(*tpp, _RET_IP_);
471 
472 	/*
473 	 * Roll the transaction.  Rolling always given a new transaction (even
474 	 * if committing the old one fails!) to hand back to the caller, so we
475 	 * join the held resources to the new transaction so that we always
476 	 * return with the held resources joined to @tpp, no matter what
477 	 * happened.
478 	 */
479 	error = xfs_trans_roll(tpp);
480 
481 	xfs_defer_restore_resources(*tpp, &dres);
482 
483 	if (error)
484 		trace_xfs_defer_trans_roll_error(*tpp, error);
485 	return error;
486 }
487 
488 /*
489  * Free up any items left in the list.
490  */
491 static void
492 xfs_defer_cancel_list(
493 	struct xfs_mount		*mp,
494 	struct list_head		*dop_list)
495 {
496 	struct xfs_defer_pending	*dfp;
497 	struct xfs_defer_pending	*pli;
498 
499 	/*
500 	 * Free the pending items.  Caller should already have arranged
501 	 * for the intent items to be released.
502 	 */
503 	list_for_each_entry_safe(dfp, pli, dop_list, dfp_list)
504 		xfs_defer_pending_cancel_work(mp, dfp);
505 }
506 
507 static inline void
508 xfs_defer_relog_intent(
509 	struct xfs_trans		*tp,
510 	struct xfs_defer_pending	*dfp)
511 {
512 	struct xfs_log_item		*lip;
513 
514 	xfs_defer_create_done(tp, dfp);
515 
516 	lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done);
517 	if (lip) {
518 		xfs_trans_add_item(tp, lip);
519 		set_bit(XFS_LI_DIRTY, &lip->li_flags);
520 	}
521 	dfp->dfp_done = NULL;
522 	dfp->dfp_intent = lip;
523 }
524 
525 /*
526  * Prevent a log intent item from pinning the tail of the log by logging a
527  * done item to release the intent item; and then log a new intent item.
528  * The caller should provide a fresh transaction and roll it after we're done.
529  */
530 static void
531 xfs_defer_relog(
532 	struct xfs_trans		**tpp,
533 	struct list_head		*dfops)
534 {
535 	struct xlog			*log = (*tpp)->t_mountp->m_log;
536 	struct xfs_defer_pending	*dfp;
537 	xfs_lsn_t			threshold_lsn = NULLCOMMITLSN;
538 
539 
540 	ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
541 
542 	list_for_each_entry(dfp, dfops, dfp_list) {
543 		/*
544 		 * If the log intent item for this deferred op is not a part of
545 		 * the current log checkpoint, relog the intent item to keep
546 		 * the log tail moving forward.  We're ok with this being racy
547 		 * because an incorrect decision means we'll be a little slower
548 		 * at pushing the tail.
549 		 */
550 		if (dfp->dfp_intent == NULL ||
551 		    xfs_log_item_in_current_chkpt(dfp->dfp_intent))
552 			continue;
553 
554 		/*
555 		 * Figure out where we need the tail to be in order to maintain
556 		 * the minimum required free space in the log.  Only sample
557 		 * the log threshold once per call.
558 		 */
559 		if (threshold_lsn == NULLCOMMITLSN) {
560 			threshold_lsn = xfs_ail_get_push_target(log->l_ailp);
561 			if (threshold_lsn == NULLCOMMITLSN)
562 				break;
563 		}
564 		if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
565 			continue;
566 
567 		trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
568 		XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
569 
570 		xfs_defer_relog_intent(*tpp, dfp);
571 	}
572 }
573 
574 /*
575  * Log an intent-done item for the first pending intent, and finish the work
576  * items.
577  */
578 int
579 xfs_defer_finish_one(
580 	struct xfs_trans		*tp,
581 	struct xfs_defer_pending	*dfp)
582 {
583 	const struct xfs_defer_op_type	*ops = dfp->dfp_ops;
584 	struct xfs_btree_cur		*state = NULL;
585 	struct list_head		*li, *n;
586 	int				error;
587 
588 	trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
589 
590 	xfs_defer_create_done(tp, dfp);
591 	list_for_each_safe(li, n, &dfp->dfp_work) {
592 		list_del(li);
593 		dfp->dfp_count--;
594 		trace_xfs_defer_finish_item(tp->t_mountp, dfp, li);
595 		error = ops->finish_item(tp, dfp->dfp_done, li, &state);
596 		if (error == -EAGAIN) {
597 			int		ret;
598 
599 			/*
600 			 * Caller wants a fresh transaction; put the work item
601 			 * back on the list and log a new log intent item to
602 			 * replace the old one.  See "Requesting a Fresh
603 			 * Transaction while Finishing Deferred Work" above.
604 			 */
605 			list_add(li, &dfp->dfp_work);
606 			dfp->dfp_count++;
607 			dfp->dfp_done = NULL;
608 			dfp->dfp_intent = NULL;
609 			ret = xfs_defer_create_intent(tp, dfp, false);
610 			if (ret < 0)
611 				error = ret;
612 		}
613 
614 		if (error)
615 			goto out;
616 	}
617 
618 	/* Done with the dfp, free it. */
619 	list_del(&dfp->dfp_list);
620 	kmem_cache_free(xfs_defer_pending_cache, dfp);
621 out:
622 	if (ops->finish_cleanup)
623 		ops->finish_cleanup(tp, state, error);
624 	return error;
625 }
626 
627 /* Move all paused deferred work from @tp to @paused_list. */
628 static void
629 xfs_defer_isolate_paused(
630 	struct xfs_trans		*tp,
631 	struct list_head		*paused_list)
632 {
633 	struct xfs_defer_pending	*dfp;
634 	struct xfs_defer_pending	*pli;
635 
636 	list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) {
637 		if (!(dfp->dfp_flags & XFS_DEFER_PAUSED))
638 			continue;
639 
640 		list_move_tail(&dfp->dfp_list, paused_list);
641 		trace_xfs_defer_isolate_paused(tp->t_mountp, dfp);
642 	}
643 }
644 
645 /*
646  * Finish all the pending work.  This involves logging intent items for
647  * any work items that wandered in since the last transaction roll (if
648  * one has even happened), rolling the transaction, and finishing the
649  * work items in the first item on the logged-and-pending list.
650  *
651  * If an inode is provided, relog it to the new transaction.
652  */
653 int
654 xfs_defer_finish_noroll(
655 	struct xfs_trans		**tp)
656 {
657 	struct xfs_defer_pending	*dfp = NULL;
658 	int				error = 0;
659 	LIST_HEAD(dop_pending);
660 	LIST_HEAD(dop_paused);
661 
662 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
663 
664 	trace_xfs_defer_finish(*tp, _RET_IP_);
665 
666 	/* Until we run out of pending work to finish... */
667 	while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
668 		/*
669 		 * Deferred items that are created in the process of finishing
670 		 * other deferred work items should be queued at the head of
671 		 * the pending list, which puts them ahead of the deferred work
672 		 * that was created by the caller.  This keeps the number of
673 		 * pending work items to a minimum, which decreases the amount
674 		 * of time that any one intent item can stick around in memory,
675 		 * pinning the log tail.
676 		 */
677 		int has_intents = xfs_defer_create_intents(*tp);
678 
679 		xfs_defer_isolate_paused(*tp, &dop_paused);
680 
681 		list_splice_init(&(*tp)->t_dfops, &dop_pending);
682 
683 		if (has_intents < 0) {
684 			error = has_intents;
685 			goto out_shutdown;
686 		}
687 		if (has_intents || dfp) {
688 			error = xfs_defer_trans_roll(tp);
689 			if (error)
690 				goto out_shutdown;
691 
692 			/* Relog intent items to keep the log moving. */
693 			xfs_defer_relog(tp, &dop_pending);
694 			xfs_defer_relog(tp, &dop_paused);
695 
696 			if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
697 				error = xfs_defer_trans_roll(tp);
698 				if (error)
699 					goto out_shutdown;
700 			}
701 		}
702 
703 		dfp = list_first_entry_or_null(&dop_pending,
704 				struct xfs_defer_pending, dfp_list);
705 		if (!dfp)
706 			break;
707 		error = xfs_defer_finish_one(*tp, dfp);
708 		if (error && error != -EAGAIN)
709 			goto out_shutdown;
710 	}
711 
712 	/* Requeue the paused items in the outgoing transaction. */
713 	list_splice_tail_init(&dop_paused, &(*tp)->t_dfops);
714 
715 	trace_xfs_defer_finish_done(*tp, _RET_IP_);
716 	return 0;
717 
718 out_shutdown:
719 	list_splice_tail_init(&dop_paused, &dop_pending);
720 	xfs_defer_trans_abort(*tp, &dop_pending);
721 	xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
722 	trace_xfs_defer_finish_error(*tp, error);
723 	xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
724 	xfs_defer_cancel(*tp);
725 	return error;
726 }
727 
728 int
729 xfs_defer_finish(
730 	struct xfs_trans	**tp)
731 {
732 #ifdef DEBUG
733 	struct xfs_defer_pending *dfp;
734 #endif
735 	int			error;
736 
737 	/*
738 	 * Finish and roll the transaction once more to avoid returning to the
739 	 * caller with a dirty transaction.
740 	 */
741 	error = xfs_defer_finish_noroll(tp);
742 	if (error)
743 		return error;
744 	if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
745 		error = xfs_defer_trans_roll(tp);
746 		if (error) {
747 			xfs_force_shutdown((*tp)->t_mountp,
748 					   SHUTDOWN_CORRUPT_INCORE);
749 			return error;
750 		}
751 	}
752 
753 	/* Reset LOWMODE now that we've finished all the dfops. */
754 #ifdef DEBUG
755 	list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list)
756 		ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
757 #endif
758 	(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
759 	return 0;
760 }
761 
762 void
763 xfs_defer_cancel(
764 	struct xfs_trans	*tp)
765 {
766 	struct xfs_mount	*mp = tp->t_mountp;
767 
768 	trace_xfs_defer_cancel(tp, _RET_IP_);
769 	xfs_defer_trans_abort(tp, &tp->t_dfops);
770 	xfs_defer_cancel_list(mp, &tp->t_dfops);
771 }
772 
773 /*
774  * Return the last pending work item attached to this transaction if it matches
775  * the deferred op type.
776  */
777 static inline struct xfs_defer_pending *
778 xfs_defer_find_last(
779 	struct xfs_trans		*tp,
780 	const struct xfs_defer_op_type	*ops)
781 {
782 	struct xfs_defer_pending	*dfp = NULL;
783 
784 	/* No dfops at all? */
785 	if (list_empty(&tp->t_dfops))
786 		return NULL;
787 
788 	dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending,
789 			dfp_list);
790 
791 	/* Wrong type? */
792 	if (dfp->dfp_ops != ops)
793 		return NULL;
794 	return dfp;
795 }
796 
797 /*
798  * Decide if we can add a deferred work item to the last dfops item attached
799  * to the transaction.
800  */
801 static inline bool
802 xfs_defer_can_append(
803 	struct xfs_defer_pending	*dfp,
804 	const struct xfs_defer_op_type	*ops)
805 {
806 	/* Already logged? */
807 	if (dfp->dfp_intent)
808 		return false;
809 
810 	/* Paused items cannot absorb more work */
811 	if (dfp->dfp_flags & XFS_DEFER_PAUSED)
812 		return NULL;
813 
814 	/* Already full? */
815 	if (ops->max_items && dfp->dfp_count >= ops->max_items)
816 		return false;
817 
818 	return true;
819 }
820 
821 /* Create a new pending item at the end of the transaction list. */
822 static inline struct xfs_defer_pending *
823 xfs_defer_alloc(
824 	struct list_head		*dfops,
825 	const struct xfs_defer_op_type	*ops)
826 {
827 	struct xfs_defer_pending	*dfp;
828 
829 	dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
830 			GFP_KERNEL | __GFP_NOFAIL);
831 	dfp->dfp_ops = ops;
832 	INIT_LIST_HEAD(&dfp->dfp_work);
833 	list_add_tail(&dfp->dfp_list, dfops);
834 
835 	return dfp;
836 }
837 
838 /* Add an item for later deferred processing. */
839 struct xfs_defer_pending *
840 xfs_defer_add(
841 	struct xfs_trans		*tp,
842 	struct list_head		*li,
843 	const struct xfs_defer_op_type	*ops)
844 {
845 	struct xfs_defer_pending	*dfp = NULL;
846 
847 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
848 
849 	dfp = xfs_defer_find_last(tp, ops);
850 	if (!dfp || !xfs_defer_can_append(dfp, ops))
851 		dfp = xfs_defer_alloc(&tp->t_dfops, ops);
852 
853 	xfs_defer_add_item(dfp, li);
854 	trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
855 	return dfp;
856 }
857 
858 /*
859  * Add a defer ops barrier to force two otherwise adjacent deferred work items
860  * to be tracked separately and have separate log items.
861  */
862 void
863 xfs_defer_add_barrier(
864 	struct xfs_trans		*tp)
865 {
866 	struct xfs_defer_pending	*dfp;
867 
868 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
869 
870 	/* If the last defer op added was a barrier, we're done. */
871 	dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type);
872 	if (dfp)
873 		return;
874 
875 	xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type);
876 
877 	trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
878 }
879 
880 /*
881  * Create a pending deferred work item to replay the recovered intent item
882  * and add it to the list.
883  */
884 void
885 xfs_defer_start_recovery(
886 	struct xfs_log_item		*lip,
887 	struct list_head		*r_dfops,
888 	const struct xfs_defer_op_type	*ops)
889 {
890 	struct xfs_defer_pending	*dfp = xfs_defer_alloc(r_dfops, ops);
891 
892 	dfp->dfp_intent = lip;
893 }
894 
/*
 * Cancel a deferred work item created to recover a log intent item: abort
 * its intent (if it has no done item yet), then cancel its work items and
 * free it.  @dfp has been freed by the time this function returns.
 */
void
xfs_defer_cancel_recovery(
	struct xfs_mount		*mp,
	struct xfs_defer_pending	*dfp)
{
	/* Release the intent first; the next call frees @dfp. */
	xfs_defer_pending_abort(mp, dfp);
	xfs_defer_pending_cancel_work(mp, dfp);
}
907 
908 /* Replay the deferred work item created from a recovered log intent item. */
909 int
910 xfs_defer_finish_recovery(
911 	struct xfs_mount		*mp,
912 	struct xfs_defer_pending	*dfp,
913 	struct list_head		*capture_list)
914 {
915 	const struct xfs_defer_op_type	*ops = dfp->dfp_ops;
916 	int				error;
917 
918 	/* dfp is freed by recover_work and must not be accessed afterwards */
919 	error = ops->recover_work(dfp, capture_list);
920 	if (error)
921 		trace_xlog_intent_recovery_failed(mp, ops, error);
922 	return error;
923 }
924 
925 /*
926  * Move deferred ops from one transaction to another and reset the source to
927  * initial state. This is primarily used to carry state forward across
928  * transaction rolls with pending dfops.
929  */
930 void
931 xfs_defer_move(
932 	struct xfs_trans	*dtp,
933 	struct xfs_trans	*stp)
934 {
935 	list_splice_init(&stp->t_dfops, &dtp->t_dfops);
936 
937 	/*
938 	 * Low free space mode was historically controlled by a dfops field.
939 	 * This meant that low mode state potentially carried across multiple
940 	 * transaction rolls. Transfer low mode on a dfops move to preserve
941 	 * that behavior.
942 	 */
943 	dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
944 	stp->t_flags &= ~XFS_TRANS_LOWMODE;
945 }
946 
947 /*
948  * Prepare a chain of fresh deferred ops work items to be completed later.  Log
949  * recovery requires the ability to put off until later the actual finishing
950  * work so that it can process unfinished items recovered from the log in
951  * correct order.
952  *
953  * Create and log intent items for all the work that we're capturing so that we
954  * can be assured that the items will get replayed if the system goes down
955  * before log recovery gets a chance to finish the work it put off.  The entire
956  * deferred ops state is transferred to the capture structure and the
957  * transaction is then ready for the caller to commit it.  If there are no
958  * intent items to capture, this function returns NULL.
959  *
960  * If capture_ip is not NULL, the capture structure will obtain an extra
961  * reference to the inode.
962  */
963 static struct xfs_defer_capture *
964 xfs_defer_ops_capture(
965 	struct xfs_trans		*tp)
966 {
967 	struct xfs_defer_capture	*dfc;
968 	unsigned short			i;
969 	int				error;
970 
971 	if (list_empty(&tp->t_dfops))
972 		return NULL;
973 
974 	error = xfs_defer_create_intents(tp);
975 	if (error < 0)
976 		return ERR_PTR(error);
977 
978 	/* Create an object to capture the defer ops. */
979 	dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL);
980 	INIT_LIST_HEAD(&dfc->dfc_list);
981 	INIT_LIST_HEAD(&dfc->dfc_dfops);
982 
983 	/* Move the dfops chain and transaction state to the capture struct. */
984 	list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
985 	dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
986 	tp->t_flags &= ~XFS_TRANS_LOWMODE;
987 
988 	/* Capture the remaining block reservations along with the dfops. */
989 	dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
990 	dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
991 
992 	/* Preserve the log reservation size. */
993 	dfc->dfc_logres = tp->t_log_res;
994 
995 	error = xfs_defer_save_resources(&dfc->dfc_held, tp);
996 	if (error) {
997 		/*
998 		 * Resource capture should never fail, but if it does, we
999 		 * still have to shut down the log and release things
1000 		 * properly.
1001 		 */
1002 		xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
1003 	}
1004 
1005 	/*
1006 	 * Grab extra references to the inodes and buffers because callers are
1007 	 * expected to release their held references after we commit the
1008 	 * transaction.
1009 	 */
1010 	for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
1011 		xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL);
1012 		ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
1013 	}
1014 
1015 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
1016 		xfs_buf_hold(dfc->dfc_held.dr_bp[i]);
1017 
1018 	return dfc;
1019 }
1020 
1021 /* Release all resources that we used to capture deferred ops. */
1022 void
1023 xfs_defer_ops_capture_abort(
1024 	struct xfs_mount		*mp,
1025 	struct xfs_defer_capture	*dfc)
1026 {
1027 	unsigned short			i;
1028 
1029 	xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops);
1030 	xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
1031 
1032 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
1033 		xfs_buf_relse(dfc->dfc_held.dr_bp[i]);
1034 
1035 	for (i = 0; i < dfc->dfc_held.dr_inos; i++)
1036 		xfs_irele(dfc->dfc_held.dr_ip[i]);
1037 
1038 	kfree(dfc);
1039 }
1040 
1041 /*
1042  * Capture any deferred ops and commit the transaction.  This is the last step
1043  * needed to finish a log intent item that we recovered from the log.  If any
1044  * of the deferred ops operate on an inode, the caller must pass in that inode
1045  * so that the reference can be transferred to the capture structure.  The
1046  * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
1047  * xfs_defer_ops_continue.
1048  */
1049 int
1050 xfs_defer_ops_capture_and_commit(
1051 	struct xfs_trans		*tp,
1052 	struct list_head		*capture_list)
1053 {
1054 	struct xfs_mount		*mp = tp->t_mountp;
1055 	struct xfs_defer_capture	*dfc;
1056 	int				error;
1057 
1058 	/* If we don't capture anything, commit transaction and exit. */
1059 	dfc = xfs_defer_ops_capture(tp);
1060 	if (IS_ERR(dfc)) {
1061 		xfs_trans_cancel(tp);
1062 		return PTR_ERR(dfc);
1063 	}
1064 	if (!dfc)
1065 		return xfs_trans_commit(tp);
1066 
1067 	/* Commit the transaction and add the capture structure to the list. */
1068 	error = xfs_trans_commit(tp);
1069 	if (error) {
1070 		xfs_defer_ops_capture_abort(mp, dfc);
1071 		return error;
1072 	}
1073 
1074 	list_add_tail(&dfc->dfc_list, capture_list);
1075 	return 0;
1076 }
1077 
1078 /*
1079  * Attach a chain of captured deferred ops to a new transaction and free the
1080  * capture structure.  If an inode was captured, it will be passed back to the
1081  * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
1082  * The caller now owns the inode reference.
1083  */
1084 void
1085 xfs_defer_ops_continue(
1086 	struct xfs_defer_capture	*dfc,
1087 	struct xfs_trans		*tp,
1088 	struct xfs_defer_resources	*dres)
1089 {
1090 	unsigned int			i;
1091 
1092 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
1093 	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
1094 
1095 	/* Lock the captured resources to the new transaction. */
1096 	if (dfc->dfc_held.dr_inos > 2) {
1097 		xfs_sort_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos);
1098 		xfs_lock_inodes(dfc->dfc_held.dr_ip, dfc->dfc_held.dr_inos,
1099 				XFS_ILOCK_EXCL);
1100 	} else if (dfc->dfc_held.dr_inos == 2)
1101 		xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
1102 				    dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
1103 	else if (dfc->dfc_held.dr_inos == 1)
1104 		xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);
1105 
1106 	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
1107 		xfs_buf_lock(dfc->dfc_held.dr_bp[i]);
1108 
1109 	/* Join the captured resources to the new transaction. */
1110 	xfs_defer_restore_resources(tp, &dfc->dfc_held);
1111 	memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
1112 	dres->dr_bufs = 0;
1113 
1114 	/* Move captured dfops chain and state to the transaction. */
1115 	list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
1116 	tp->t_flags |= dfc->dfc_tpflags;
1117 
1118 	kfree(dfc);
1119 }
1120 
1121 /* Release the resources captured and continued during recovery. */
1122 void
1123 xfs_defer_resources_rele(
1124 	struct xfs_defer_resources	*dres)
1125 {
1126 	unsigned short			i;
1127 
1128 	for (i = 0; i < dres->dr_inos; i++) {
1129 		xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
1130 		xfs_irele(dres->dr_ip[i]);
1131 		dres->dr_ip[i] = NULL;
1132 	}
1133 
1134 	for (i = 0; i < dres->dr_bufs; i++) {
1135 		xfs_buf_relse(dres->dr_bp[i]);
1136 		dres->dr_bp[i] = NULL;
1137 	}
1138 
1139 	dres->dr_inos = 0;
1140 	dres->dr_bufs = 0;
1141 	dres->dr_ordered = 0;
1142 }
1143 
1144 static inline int __init
1145 xfs_defer_init_cache(void)
1146 {
1147 	xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
1148 			sizeof(struct xfs_defer_pending),
1149 			0, 0, NULL);
1150 
1151 	return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
1152 }
1153 
1154 static inline void
1155 xfs_defer_destroy_cache(void)
1156 {
1157 	kmem_cache_destroy(xfs_defer_pending_cache);
1158 	xfs_defer_pending_cache = NULL;
1159 }
1160 
1161 /* Set up caches for deferred work items. */
1162 int __init
1163 xfs_defer_init_item_caches(void)
1164 {
1165 	int				error;
1166 
1167 	error = xfs_defer_init_cache();
1168 	if (error)
1169 		return error;
1170 	error = xfs_rmap_intent_init_cache();
1171 	if (error)
1172 		goto err;
1173 	error = xfs_refcount_intent_init_cache();
1174 	if (error)
1175 		goto err;
1176 	error = xfs_bmap_intent_init_cache();
1177 	if (error)
1178 		goto err;
1179 	error = xfs_extfree_intent_init_cache();
1180 	if (error)
1181 		goto err;
1182 	error = xfs_attr_intent_init_cache();
1183 	if (error)
1184 		goto err;
1185 	error = xfs_exchmaps_intent_init_cache();
1186 	if (error)
1187 		goto err;
1188 
1189 	return 0;
1190 err:
1191 	xfs_defer_destroy_item_caches();
1192 	return error;
1193 }
1194 
/*
 * Destroy all the deferred work item caches, if they've been allocated.  The
 * caches are destroyed in the reverse of the order in which
 * xfs_defer_init_item_caches sets them up.
 */
void
xfs_defer_destroy_item_caches(void)
{
	/* Per-intent caches first ... */
	xfs_exchmaps_intent_destroy_cache();
	xfs_attr_intent_destroy_cache();
	xfs_extfree_intent_destroy_cache();
	xfs_bmap_intent_destroy_cache();
	xfs_refcount_intent_destroy_cache();
	xfs_rmap_intent_destroy_cache();

	/* ... and the xfs_defer_pending cache last. */
	xfs_defer_destroy_cache();
}
1207 
1208 /*
1209  * Mark a deferred work item so that it will be requeued indefinitely without
1210  * being finished.  Caller must ensure there are no data dependencies on this
1211  * work item in the meantime.
1212  */
1213 void
1214 xfs_defer_item_pause(
1215 	struct xfs_trans		*tp,
1216 	struct xfs_defer_pending	*dfp)
1217 {
1218 	ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED));
1219 
1220 	dfp->dfp_flags |= XFS_DEFER_PAUSED;
1221 
1222 	trace_xfs_defer_item_pause(tp->t_mountp, dfp);
1223 }
1224 
1225 /*
1226  * Release a paused deferred work item so that it will be finished during the
1227  * next transaction roll.
1228  */
1229 void
1230 xfs_defer_item_unpause(
1231 	struct xfs_trans		*tp,
1232 	struct xfs_defer_pending	*dfp)
1233 {
1234 	ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
1235 
1236 	dfp->dfp_flags &= ~XFS_DEFER_PAUSED;
1237 
1238 	trace_xfs_defer_item_unpause(tp->t_mountp, dfp);
1239 }
1240