/*
 *   Copyright (C) International Business Machines Corp., 2000-2005
 *   Portions Copyright (C) Christoph Hellwig, 2001-2002
 *
 *   This program is free software;  you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
 *   the GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program;  if not, write to the Free Software
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

/*
 *      jfs_txnmgr.c: transaction manager
 *
 * notes:
 * transaction starts with txBegin() and ends with txCommit()
 * or txAbort().
 *
 * tlock is acquired at the time of update;
 * (obviate scan at commit time for xtree and dtree)
 * tlock and mp point to each other;
 * (no hashlist for mp -> tlock).
 *
 * special cases:
 * tlock on in-memory inode:
 * in-place tlock in the in-memory inode itself;
 * converted to page lock by iWrite() at commit time.
 *
 * tlock during write()/mmap() under anonymous transaction (tid = 0):
 * transferred (?) to transaction at commit time.
 *
 * use the page itself to update allocation maps
 * (obviate intermediate replication of allocation/deallocation data)
 * hold on to mp+lock thru update of maps
 */
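
/*
 * Lifecycle sketch (illustrative only; locking, error handling and the
 * update step itself are schematic, not from the original source):
 *
 *	tid_t tid = txBegin(sb, 0);
 *	tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW);
 *	... record the update in the tlock's linelock ...
 *	txCommit(tid, 1, &ip, 0);
 *	txEnd(tid);
 *
 * An anonymous update (e.g. from write()) calls txBeginAnon(sb) instead
 * and leaves its tlocks on the inode's anonymous tlock list under
 * tid = 0; a later txCommit() inherits them.
 */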

#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/smp_lock.h>
#include <linux/completion.h>
#include <linux/suspend.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include "jfs_incore.h"
#include "jfs_inode.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
#include "jfs_dinode.h"
#include "jfs_imap.h"
#include "jfs_dmap.h"
#include "jfs_superblock.h"
#include "jfs_debug.h"

/*
 *      transaction management structures
 */
static struct {
	int freetid;		/* index of a free tid structure */
	int freelock;		/* index of first free lock word */
	wait_queue_head_t freewait;	/* eventlist of free tblock */
	wait_queue_head_t freelockwait;	/* eventlist of free tlock */
	wait_queue_head_t lowlockwait;	/* eventlist of ample tlocks */
	int tlocksInUse;	/* Number of tlocks in use */
	spinlock_t LazyLock;	/* synchronize sync_queue & unlock_queue */
/*	struct tblock *sync_queue; * Transactions waiting for data sync */
	struct list_head unlock_queue;	/* Txns waiting to be released */
	struct list_head anon_list;	/* inodes having anonymous txns */
	struct list_head anon_list2;	/* inodes having anonymous txns
					   that couldn't be sync'ed */
} TxAnchor;

int jfs_tlocks_low;		/* Indicates low number of available tlocks */

#ifdef CONFIG_JFS_STATISTICS
static struct {
	uint txBegin;
	uint txBegin_barrier;
	uint txBegin_lockslow;
	uint txBegin_freetid;
	uint txBeginAnon;
	uint txBeginAnon_barrier;
	uint txBeginAnon_lockslow;
	uint txLockAlloc;
	uint txLockAlloc_freelock;
} TxStat;
#endif

static int nTxBlock = -1;	/* number of transaction blocks */
module_param(nTxBlock, int, 0);
MODULE_PARM_DESC(nTxBlock,
		 "Number of transaction blocks (max:65536)");

static int nTxLock = -1;	/* number of transaction locks */
module_param(nTxLock, int, 0);
MODULE_PARM_DESC(nTxLock,
		 "Number of transaction locks (max:65536)");

struct tblock *TxBlock;	        /* transaction block table */
static int TxLockLWM;		/* Low water mark for number of txLocks used */
static int TxLockHWM;		/* High water mark for number of txLocks used */
static int TxLockVHWM;		/* Very High water mark */
struct tlock *TxLock;           /* transaction lock table */

/*
 *      transaction management lock
 */
static DEFINE_SPINLOCK(jfsTxnLock);

#define TXN_LOCK()              spin_lock(&jfsTxnLock)
#define TXN_UNLOCK()            spin_unlock(&jfsTxnLock)

#define LAZY_LOCK_INIT()	spin_lock_init(&TxAnchor.LazyLock);
#define LAZY_LOCK(flags)	spin_lock_irqsave(&TxAnchor.LazyLock, flags)
#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags)

DECLARE_WAIT_QUEUE_HEAD(jfs_sync_thread_wait);
DECLARE_WAIT_QUEUE_HEAD(jfs_commit_thread_wait);
static int jfs_commit_thread_waking;
/*
 * Retry logic exists outside these macros to protect against spurious wakeups.
 */
static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue(event, &wait);
	set_current_state(TASK_UNINTERRUPTIBLE);
	TXN_UNLOCK();
	schedule();
	current->state = TASK_RUNNING;
	remove_wait_queue(event, &wait);
}

#define TXN_SLEEP(event)\
{\
	TXN_SLEEP_DROP_LOCK(event);\
	TXN_LOCK();\
}

#define TXN_WAKEUP(event) wake_up_all(event)
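
/*
 * Typical caller pattern (illustrative; this is the idiom used by
 * txLockAlloc() below): the wait condition is re-tested after every
 * wakeup, so a spurious wakeup simply loops back to sleep,
 *
 *	while (!(lid = TxAnchor.freelock))
 *		TXN_SLEEP(&TxAnchor.freelockwait);
 */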

/*
 *      statistics
 */
static struct {
	tid_t maxtid;		/* 4: biggest tid ever used */
	lid_t maxlid;		/* 4: biggest lid ever used */
	int ntid;		/* 4: # of transactions performed */
	int nlid;		/* 4: # of tlocks acquired */
	int waitlock;		/* 4: # of tlock wait */
} stattx;

/*
 * forward references
 */
static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck, struct commit * cd);
static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck);
static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck);
static void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
		struct tlock * tlck);
static void txAllocPMap(struct inode *ip, struct maplock * maplock,
		struct tblock * tblk);
static void txForce(struct tblock * tblk);
static int txLog(struct jfs_log * log, struct tblock * tblk,
		struct commit * cd);
static void txUpdateMap(struct tblock * tblk);
static void txRelease(struct tblock * tblk);
static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	   struct tlock * tlck);
static void LogSyncRelease(struct metapage * mp);

/*
 *              transaction block/lock management
 *              ---------------------------------
 */

/*
 * Get a transaction lock from the free list.  If the number in use is
 * greater than the high water mark, wake up the sync daemon.  This should
 * free some anonymous transaction locks.  (TXN_LOCK must be held.)
 */
static lid_t txLockAlloc(void)
{
	lid_t lid;

	INCREMENT(TxStat.txLockAlloc);
	if (!TxAnchor.freelock) {
		INCREMENT(TxStat.txLockAlloc_freelock);
	}

	while (!(lid = TxAnchor.freelock))
		TXN_SLEEP(&TxAnchor.freelockwait);
	TxAnchor.freelock = TxLock[lid].next;
	HIGHWATERMARK(stattx.maxlid, lid);
	if ((++TxAnchor.tlocksInUse > TxLockHWM) && (jfs_tlocks_low == 0)) {
		jfs_info("txLockAlloc tlocks low");
		jfs_tlocks_low = 1;
		wake_up(&jfs_sync_thread_wait);
	}

	return lid;
}

static void txLockFree(lid_t lid)
{
	TxLock[lid].tid = 0;
	TxLock[lid].next = TxAnchor.freelock;
	TxAnchor.freelock = lid;
	TxAnchor.tlocksInUse--;
	if (jfs_tlocks_low && (TxAnchor.tlocksInUse < TxLockLWM)) {
		jfs_info("txLockFree jfs_tlocks_low no more");
		jfs_tlocks_low = 0;
		TXN_WAKEUP(&TxAnchor.lowlockwait);
	}
	TXN_WAKEUP(&TxAnchor.freelockwait);
}

/*
 * NAME:        txInit()
 *
 * FUNCTION:    initialize transaction management structures
 *
 * RETURN:
 *
 * serialization: single thread at jfs_init()
 */
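/*
 * Worked example of the default sizing below (illustrative, assuming
 * 4K pages): with si.totalram == 131072 pages (512 MB), nTxLock
 * defaults to 131072 >> 2 = 32768 and nTxBlock to 32768 >> 3 = 4096;
 * the watermarks then come out to TxLockLWM = 13107, TxLockHWM = 22937
 * and TxLockVHWM = 26214.
 */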
int txInit(void)
{
	int k, size;
	struct sysinfo si;

	/* Set defaults for nTxLock and nTxBlock if unset */

	if (nTxLock == -1) {
		if (nTxBlock == -1) {
			/* Base default on memory size */
			si_meminfo(&si);
			if (si.totalram > (256 * 1024)) /* 1 GB */
				nTxLock = 64 * 1024;
			else
				nTxLock = si.totalram >> 2;
		} else if (nTxBlock > (8 * 1024))
			nTxLock = 64 * 1024;
		else
			nTxLock = nTxBlock << 3;
	}
	if (nTxBlock == -1)
		nTxBlock = nTxLock >> 3;

	/* Verify tunable parameters */
	if (nTxBlock < 16)
		nTxBlock = 16;	/* No one should set it this low */
	if (nTxBlock > 65536)
		nTxBlock = 65536;
	if (nTxLock < 256)
		nTxLock = 256;	/* No one should set it this low */
	if (nTxLock > 65536)
		nTxLock = 65536;

	printk(KERN_INFO "JFS: nTxBlock = %d, nTxLock = %d\n",
	       nTxBlock, nTxLock);
	/*
	 * initialize transaction block (tblock) table
	 *
	 * transaction id (tid) = tblock index
	 * tid = 0 is reserved.
	 */
	TxLockLWM = (nTxLock * 4) / 10;
	TxLockHWM = (nTxLock * 7) / 10;
	TxLockVHWM = (nTxLock * 8) / 10;

	size = sizeof(struct tblock) * nTxBlock;
	TxBlock = (struct tblock *) vmalloc(size);
	if (TxBlock == NULL)
		return -ENOMEM;

	for (k = 1; k < nTxBlock - 1; k++) {
		TxBlock[k].next = k + 1;
		init_waitqueue_head(&TxBlock[k].gcwait);
		init_waitqueue_head(&TxBlock[k].waitor);
	}
	TxBlock[k].next = 0;
	init_waitqueue_head(&TxBlock[k].gcwait);
	init_waitqueue_head(&TxBlock[k].waitor);

	TxAnchor.freetid = 1;
	init_waitqueue_head(&TxAnchor.freewait);

	stattx.maxtid = 1;	/* statistics */

	/*
	 * initialize transaction lock (tlock) table
	 *
	 * transaction lock id = tlock index
	 * tlock id = 0 is reserved.
	 */
	size = sizeof(struct tlock) * nTxLock;
	TxLock = (struct tlock *) vmalloc(size);
	if (TxLock == NULL) {
		vfree(TxBlock);
		return -ENOMEM;
	}

	/* initialize tlock table */
	for (k = 1; k < nTxLock - 1; k++)
		TxLock[k].next = k + 1;
	TxLock[k].next = 0;
	init_waitqueue_head(&TxAnchor.freelockwait);
	init_waitqueue_head(&TxAnchor.lowlockwait);

	TxAnchor.freelock = 1;
	TxAnchor.tlocksInUse = 0;
	INIT_LIST_HEAD(&TxAnchor.anon_list);
	INIT_LIST_HEAD(&TxAnchor.anon_list2);

	LAZY_LOCK_INIT();
	INIT_LIST_HEAD(&TxAnchor.unlock_queue);

	stattx.maxlid = 1;	/* statistics */

	return 0;
}

/*
 * NAME:        txExit()
 *
 * FUNCTION:    clean up when module is unloaded
 */
void txExit(void)
{
	vfree(TxLock);
	TxLock = NULL;
	vfree(TxBlock);
	TxBlock = NULL;
}

/*
 * NAME:        txBegin()
 *
 * FUNCTION:    start a transaction.
 *
 * PARAMETER:   sb	- superblock
 *              flag	- force for nested tx;
 *
 * RETURN:	tid	- transaction id
 *
 * note: the force flag allows starting a tx for a nested tx,
 * to prevent deadlock on the logsync barrier;
 */
tid_t txBegin(struct super_block *sb, int flag)
{
	tid_t t;
	struct tblock *tblk;
	struct jfs_log *log;

	jfs_info("txBegin: flag = 0x%x", flag);
	log = JFS_SBI(sb)->log;

	TXN_LOCK();

	INCREMENT(TxStat.txBegin);

      retry:
	if (!(flag & COMMIT_FORCE)) {
		/*
		 * synchronize with logsync barrier
		 */
		if (test_bit(log_SYNCBARRIER, &log->flag) ||
		    test_bit(log_QUIESCE, &log->flag)) {
			INCREMENT(TxStat.txBegin_barrier);
			TXN_SLEEP(&log->syncwait);
			goto retry;
		}
	}
	if (flag == 0) {
		/*
		 * Don't begin transaction if we're getting starved for tlocks
		 * unless COMMIT_FORCE or COMMIT_INODE (which may ultimately
		 * free tlocks)
		 */
		if (TxAnchor.tlocksInUse > TxLockVHWM) {
			INCREMENT(TxStat.txBegin_lockslow);
			TXN_SLEEP(&TxAnchor.lowlockwait);
			goto retry;
		}
	}

	/*
	 * allocate transaction id/block
	 */
	if ((t = TxAnchor.freetid) == 0) {
		jfs_info("txBegin: waiting for free tid");
		INCREMENT(TxStat.txBegin_freetid);
		TXN_SLEEP(&TxAnchor.freewait);
		goto retry;
	}

	tblk = tid_to_tblock(t);

	if ((tblk->next == 0) && !(flag & COMMIT_FORCE)) {
		/* Don't let a non-forced transaction take the last tblk */
		jfs_info("txBegin: waiting for free tid");
		INCREMENT(TxStat.txBegin_freetid);
		TXN_SLEEP(&TxAnchor.freewait);
		goto retry;
	}

	TxAnchor.freetid = tblk->next;

	/*
	 * initialize transaction
	 */

	/*
	 * We can't zero the whole thing or we screw up another thread being
	 * awakened after sleeping on tblk->waitor
	 *
	 * memset(tblk, 0, sizeof(struct tblock));
	 */
	tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0;

	tblk->sb = sb;
	++log->logtid;
	tblk->logtid = log->logtid;

	++log->active;

	HIGHWATERMARK(stattx.maxtid, t);	/* statistics */
	INCREMENT(stattx.ntid);	/* statistics */

	TXN_UNLOCK();

	jfs_info("txBegin: returning tid = %d", t);

	return t;
}

/*
 * NAME:        txBeginAnon()
 *
 * FUNCTION:    start an anonymous transaction.
 *		Blocks if logsync or available tlocks are low to prevent
 *		anonymous tlocks from depleting supply.
 *
 * PARAMETER:   sb	- superblock
 *
 * RETURN:	none
 */
void txBeginAnon(struct super_block *sb)
{
	struct jfs_log *log;

	log = JFS_SBI(sb)->log;

	TXN_LOCK();
	INCREMENT(TxStat.txBeginAnon);

      retry:
	/*
	 * synchronize with logsync barrier
	 */
	if (test_bit(log_SYNCBARRIER, &log->flag) ||
	    test_bit(log_QUIESCE, &log->flag)) {
		INCREMENT(TxStat.txBeginAnon_barrier);
		TXN_SLEEP(&log->syncwait);
		goto retry;
	}

	/*
	 * Don't begin transaction if we're getting starved for tlocks
	 */
	if (TxAnchor.tlocksInUse > TxLockVHWM) {
		INCREMENT(TxStat.txBeginAnon_lockslow);
		TXN_SLEEP(&TxAnchor.lowlockwait);
		goto retry;
	}
	TXN_UNLOCK();
}

/*
 *      txEnd()
 *
 * function: free specified transaction block.
 *
 *      logsync barrier processing:
 *
 * serialization:
 */
void txEnd(tid_t tid)
{
	struct tblock *tblk = tid_to_tblock(tid);
	struct jfs_log *log;

	jfs_info("txEnd: tid = %d", tid);
	TXN_LOCK();

	/*
	 * wakeup transactions waiting on the page locked
	 * by the current transaction
	 */
	TXN_WAKEUP(&tblk->waitor);

	log = JFS_SBI(tblk->sb)->log;

	/*
	 * Lazy commit thread can't free this guy until we mark it UNLOCKED,
	 * otherwise, we would be left with a transaction that may have been
	 * reused.
	 *
	 * Lazy commit thread will turn off tblkGC_LAZY before calling this
	 * routine.
	 */
	if (tblk->flag & tblkGC_LAZY) {
		jfs_info("txEnd called w/lazy tid: %d, tblk = 0x%p", tid, tblk);
		TXN_UNLOCK();

		spin_lock_irq(&log->gclock);	// LOGGC_LOCK
		tblk->flag |= tblkGC_UNLOCKED;
		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
		return;
	}

	jfs_info("txEnd: tid: %d, tblk = 0x%p", tid, tblk);

	assert(tblk->next == 0);

	/*
	 * insert tblock back on freelist
	 */
	tblk->next = TxAnchor.freetid;
	TxAnchor.freetid = tid;

	/*
	 * mark the tblock not active
	 */
	if (--log->active == 0) {
		clear_bit(log_FLUSH, &log->flag);

		/*
		 * synchronize with logsync barrier
		 */
		if (test_bit(log_SYNCBARRIER, &log->flag)) {
			jfs_info("log barrier off: 0x%x", log->lsn);

			/* allow new transactions to start */
			clear_bit(log_SYNCBARRIER, &log->flag);

			/* wake up all waiters on the logsync barrier */
			TXN_WAKEUP(&log->syncwait);

			TXN_UNLOCK();

			/* forward log syncpt */
			jfs_syncpt(log);

			goto wakeup;
		}
	}

	TXN_UNLOCK();
wakeup:
	/*
	 * wake up all waiters for a free tblock
	 */
	TXN_WAKEUP(&TxAnchor.freewait);
}

/*
 *      txLock()
 *
 * function: acquire a transaction lock on the specified <mp>
 *
 * parameter:
 *
 * return:      transaction lock id
 *
 * serialization:
 */
struct tlock *txLock(tid_t tid, struct inode *ip, struct metapage * mp,
		     int type)
{
	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
	int dir_xtree = 0;
	lid_t lid;
	tid_t xtid;
	struct tlock *tlck;
	struct xtlock *xtlck;
	struct linelock *linelock;
	xtpage_t *p;
	struct tblock *tblk;

	TXN_LOCK();

	if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) &&
	    !(mp->xflag & COMMIT_PAGE)) {
		/*
		 * Directory inode is special.  It can have both an xtree tlock
		 * and a dtree tlock associated with it.
		 */
		dir_xtree = 1;
		lid = jfs_ip->xtlid;
	} else
		lid = mp->lid;

	/* is page not locked by a transaction ? */
	if (lid == 0)
		goto allocateLock;

	jfs_info("txLock: tid:%d ip:0x%p mp:0x%p lid:%d", tid, ip, mp, lid);

	/* is page locked by the requester transaction ? */
	tlck = lid_to_tlock(lid);
	if ((xtid = tlck->tid) == tid) {
		TXN_UNLOCK();
		goto grantLock;
	}

	/*
	 * is page locked by anonymous transaction/lock ?
	 *
	 * (page update without transaction (i.e., file write) is
	 * locked under anonymous transaction tid = 0:
	 * anonymous tlocks maintained on anonymous tlock list of
	 * the inode of the page and available to all anonymous
	 * transactions until txCommit() time at which point
	 * they are transferred to the transaction tlock list of
	 * the committing transaction of the inode)
	 */
	if (xtid == 0) {
		tlck->tid = tid;
		TXN_UNLOCK();
		tblk = tid_to_tblock(tid);
		/*
		 * The order of the tlocks in the transaction is important
		 * (during truncate, child xtree pages must be freed before
		 * parent's tlocks change the working map).
		 * Take tlock off anonymous list and add to tail of
		 * transaction list
		 *
		 * Note:  We really need to get rid of the tid & lid and
		 * use list_head's.  This code is getting UGLY!
		 */
		if (jfs_ip->atlhead == lid) {
			if (jfs_ip->atltail == lid) {
				/* only anonymous txn.
				 * Remove from anon_list
				 */
				list_del_init(&jfs_ip->anon_inode_list);
			}
			jfs_ip->atlhead = tlck->next;
		} else {
			lid_t last;
			for (last = jfs_ip->atlhead;
			     lid_to_tlock(last)->next != lid;
			     last = lid_to_tlock(last)->next) {
				assert(last);
			}
			lid_to_tlock(last)->next = tlck->next;
			if (jfs_ip->atltail == lid)
				jfs_ip->atltail = last;
		}

		/* insert the tlock at tail of transaction tlock list */

		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;

		goto grantLock;
	}

	goto waitLock;

	/*
	 * allocate a tlock
	 */
      allocateLock:
	lid = txLockAlloc();
	tlck = lid_to_tlock(lid);

	/*
	 * initialize tlock
	 */
	tlck->tid = tid;

	TXN_UNLOCK();

	/* mark tlock for meta-data page */
	if (mp->xflag & COMMIT_PAGE) {

		tlck->flag = tlckPAGELOCK;

		/* mark the page dirty and nohomeok */
		metapage_nohomeok(mp);

		jfs_info("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p",
			 mp, mp->nohomeok, tid, tlck);

		/* if anonymous transaction, and buffer is on the group
		 * commit synclist, mark inode to show this.  This will
		 * prevent the buffer from being marked nohomeok for too
		 * long a time.
		 */
		if ((tid == 0) && mp->lsn)
			set_cflag(COMMIT_Synclist, ip);
	}
	/* mark tlock for in-memory inode */
	else
		tlck->flag = tlckINODELOCK;

	tlck->type = 0;

	/* bind the tlock and the page */
	tlck->ip = ip;
	tlck->mp = mp;
	if (dir_xtree)
		jfs_ip->xtlid = lid;
	else
		mp->lid = lid;

	/*
	 * enqueue transaction lock to transaction/inode
	 */
	/* insert the tlock at tail of transaction tlock list */
	if (tid) {
		tblk = tid_to_tblock(tid);
		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;
	}
	/* anonymous transaction:
	 * insert the tlock at head of inode anonymous tlock list
	 */
	else {
		tlck->next = jfs_ip->atlhead;
		jfs_ip->atlhead = lid;
		if (tlck->next == 0) {
			/* This inode's first anonymous transaction */
			jfs_ip->atltail = lid;
			TXN_LOCK();
			list_add_tail(&jfs_ip->anon_inode_list,
				      &TxAnchor.anon_list);
			TXN_UNLOCK();
		}
	}

	/* initialize type dependent area for linelock */
	linelock = (struct linelock *) & tlck->lock;
	linelock->next = 0;
	linelock->flag = tlckLINELOCK;
	linelock->maxcnt = TLOCKSHORT;
	linelock->index = 0;

	switch (type & tlckTYPE) {
	case tlckDTREE:
		linelock->l2linesize = L2DTSLOTSIZE;
		break;

	case tlckXTREE:
		linelock->l2linesize = L2XTSLOTSIZE;

		xtlck = (struct xtlock *) linelock;
		xtlck->header.offset = 0;
		xtlck->header.length = 2;

		if (type & tlckNEW) {
			xtlck->lwm.offset = XTENTRYSTART;
		} else {
			if (mp->xflag & COMMIT_PAGE)
				p = (xtpage_t *) mp->data;
			else
				p = &jfs_ip->i_xtroot;
			xtlck->lwm.offset =
			    le16_to_cpu(p->header.nextindex);
		}
		xtlck->lwm.length = 0;	/* ! */
		xtlck->twm.offset = 0;
		xtlck->hwm.offset = 0;

		xtlck->index = 2;
		break;

	case tlckINODE:
		linelock->l2linesize = L2INODESLOTSIZE;
		break;

	case tlckDATA:
		linelock->l2linesize = L2DATASLOTSIZE;
		break;

	default:
		jfs_err("UFO tlock:0x%p", tlck);
	}

	/*
	 * update tlock vector
	 */
      grantLock:
	tlck->type |= type;

	return tlck;

	/*
	 * page is being locked by another transaction:
	 */
      waitLock:
	/* Only locks on ipimap or ipaimap should reach here */
	/* assert(jfs_ip->fileset == AGGREGATE_I); */
	if (jfs_ip->fileset != AGGREGATE_I) {
		jfs_err("txLock: trying to lock locked page!");
		dump_mem("ip", ip, sizeof(struct inode));
		dump_mem("mp", mp, sizeof(struct metapage));
		dump_mem("Locker's tblk", tid_to_tblock(tid),
			 sizeof(struct tblock));
		dump_mem("Tlock", tlck, sizeof(struct tlock));
		BUG();
	}
	INCREMENT(stattx.waitlock);	/* statistics */
	TXN_UNLOCK();
	release_metapage(mp);
	TXN_LOCK();
	xtid = tlck->tid;	/* reacquire after dropping TXN_LOCK */

	jfs_info("txLock: in waitLock, tid = %d, xtid = %d, lid = %d",
		 tid, xtid, lid);

	/* Recheck everything since dropping TXN_LOCK */
	if (xtid && (tlck->mp == mp) && (mp->lid == lid))
		TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor);
	else
		TXN_UNLOCK();
	jfs_info("txLock: awakened     tid = %d, lid = %d", tid, lid);

	return NULL;
}

/*
 * NAME:        txRelease()
 *
 * FUNCTION:    Release buffers associated with transaction locks, but don't
 *		mark homeok yet.  This allows other transactions to modify
 *		buffers, but won't let them go to disk until the commit record
 *		actually gets written.
 *
 * PARAMETER:
 *              tblk    -
 *
 * RETURN:      Errors from subroutines.
 */
static void txRelease(struct tblock * tblk)
{
	struct metapage *mp;
	lid_t lid;
	struct tlock *tlck;

	TXN_LOCK();

	for (lid = tblk->next; lid; lid = tlck->next) {
		tlck = lid_to_tlock(lid);
		if ((mp = tlck->mp) != NULL &&
		    (tlck->type & tlckBTROOT) == 0) {
			assert(mp->xflag & COMMIT_PAGE);
			mp->lid = 0;
		}
	}

	/*
	 * wakeup transactions waiting on a page locked
	 * by the current transaction
	 */
	TXN_WAKEUP(&tblk->waitor);

	TXN_UNLOCK();
}

/*
 * NAME:        txUnlock()
 *
 * FUNCTION:    Initiates pageout of pages modified by tid in journalled
 *              objects and frees their lockwords.
 */
static void txUnlock(struct tblock * tblk)
{
	struct tlock *tlck;
	struct linelock *linelock;
	lid_t lid, next, llid, k;
	struct metapage *mp;
	struct jfs_log *log;
	int difft, diffp;
	unsigned long flags;

	jfs_info("txUnlock: tblk = 0x%p", tblk);
	log = JFS_SBI(tblk->sb)->log;

	/*
	 * mark page under tlock homeok (its log has been written):
	 */
	for (lid = tblk->next; lid; lid = next) {
		tlck = lid_to_tlock(lid);
		next = tlck->next;

		jfs_info("unlocking lid = %d, tlck = 0x%p", lid, tlck);

		/* unbind page from tlock */
		if ((mp = tlck->mp) != NULL &&
		    (tlck->type & tlckBTROOT) == 0) {
			assert(mp->xflag & COMMIT_PAGE);

			/* hold buffer
			 */
			hold_metapage(mp);

			assert(mp->nohomeok > 0);
			_metapage_homeok(mp);

			/* inherit younger/larger clsn */
			LOGSYNC_LOCK(log, flags);
			if (mp->clsn) {
				logdiff(difft, tblk->clsn, log);
				logdiff(diffp, mp->clsn, log);
				if (difft > diffp)
					mp->clsn = tblk->clsn;
			} else
				mp->clsn = tblk->clsn;
			LOGSYNC_UNLOCK(log, flags);

			assert(!(tlck->flag & tlckFREEPAGE));

			put_metapage(mp);
		}

		/* insert tlock, and linelock(s) of the tlock if any,
		 * at head of freelist
		 */
		TXN_LOCK();

		llid = ((struct linelock *) & tlck->lock)->next;
		while (llid) {
			linelock = (struct linelock *) lid_to_tlock(llid);
			k = linelock->next;
			txLockFree(llid);
			llid = k;
		}
		txLockFree(lid);

		TXN_UNLOCK();
	}
	tblk->next = tblk->last = 0;

	/*
	 * remove tblock from logsynclist
	 * (allocation map pages inherit the lsn of tblk and
	 * have been inserted in the logsync list at txUpdateMap())
	 */
	if (tblk->lsn) {
		LOGSYNC_LOCK(log, flags);
		log->count--;
		list_del(&tblk->synclist);
		LOGSYNC_UNLOCK(log, flags);
	}
}

/*
 *      txMaplock()
 *
 * function: allocate a transaction lock for freed page/entry;
 *      for freed page, maplock is used as xtlock/dtlock type;
 */
struct tlock *txMaplock(tid_t tid, struct inode *ip, int type)
{
	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
	lid_t lid;
	struct tblock *tblk;
	struct tlock *tlck;
	struct maplock *maplock;

	TXN_LOCK();

	/*
	 * allocate a tlock
	 */
	lid = txLockAlloc();
	tlck = lid_to_tlock(lid);

	/*
	 * initialize tlock
	 */
	tlck->tid = tid;

	/* bind the tlock and the object */
	tlck->flag = tlckINODELOCK;
	tlck->ip = ip;
	tlck->mp = NULL;

	tlck->type = type;

	/*
	 * enqueue transaction lock to transaction/inode
	 */
	/* insert the tlock at tail of transaction tlock list */
	if (tid) {
		tblk = tid_to_tblock(tid);
		if (tblk->next)
			lid_to_tlock(tblk->last)->next = lid;
		else
			tblk->next = lid;
		tlck->next = 0;
		tblk->last = lid;
	}
	/* anonymous transaction:
	 * insert the tlock at head of inode anonymous tlock list
	 */
	else {
		tlck->next = jfs_ip->atlhead;
		jfs_ip->atlhead = lid;
		if (tlck->next == 0) {
			/* This inode's first anonymous transaction */
			jfs_ip->atltail = lid;
			list_add_tail(&jfs_ip->anon_inode_list,
				      &TxAnchor.anon_list);
		}
	}

	TXN_UNLOCK();

	/* initialize type dependent area for maplock */
	maplock = (struct maplock *) & tlck->lock;
	maplock->next = 0;
	maplock->maxcnt = 0;
	maplock->index = 0;

	return tlck;
}

/*
 *      txLinelock()
 *
 * function: allocate a transaction lock for log vector list
 */
struct linelock *txLinelock(struct linelock * tlock)
{
	lid_t lid;
	struct tlock *tlck;
	struct linelock *linelock;

	TXN_LOCK();

	/* allocate a TxLock structure */
	lid = txLockAlloc();
	tlck = lid_to_tlock(lid);

	TXN_UNLOCK();

	/* initialize linelock */
	linelock = (struct linelock *) tlck;
	linelock->next = 0;
	linelock->flag = tlckLINELOCK;
	linelock->maxcnt = TLOCKLONG;
	linelock->index = 0;

	/* append linelock after tlock */
	linelock->next = tlock->next;
	tlock->next = lid;

	return linelock;
}
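
/*
 * Usage sketch (illustrative, not from this file): callers chain an
 * overflow linelock when a tlock's built-in one fills up, along the
 * lines of
 *
 *	if (linelock->index >= linelock->maxcnt)
 *		linelock = txLinelock(linelock);
 */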

/*
 *              transaction commit management
 *              -----------------------------
 */

/*
 * NAME:        txCommit()
 *
 * FUNCTION:    commit the changes to the objects specified in
 *              clist.  For journalled segments only the
 *              changes of the caller are committed, i.e. by tid.
 *              for non-journalled segments the data are flushed to
 *              disk and then the change to the disk inode and indirect
 *              blocks committed (so blocks newly allocated to the
 *              segment will be made a part of the segment atomically).
 *
 *              all of the segments specified in clist must be in
 *              one file system. no more than 6 segments are needed
 *              to handle all unix svcs.
 *
 *              if the i_nlink field (i.e. disk inode link count)
 *              is zero, and the type of inode is a regular file or
 *              directory, or symbolic link, the inode is truncated
 *              to zero length. the truncation is committed but the
 *              VM resources are unaffected until it is closed (see
 *              iput and iclose).
 *
 * PARAMETER:
 *
 * RETURN:
 *
 * serialization:
 *              on entry the inode lock on each segment is assumed
 *              to be held.
 *
 * i/o error:
 */
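/*
 * Note on the commit flags (informal summary): if neither COMMIT_FORCE
 * nor COMMIT_SYNC is passed, the tblock is marked COMMIT_LAZY below and
 * the map-update/unlock work is deferred to the lazy commit thread
 * (see the tblkGC_LAZY handling in txEnd() above); e.g.
 * txCommit(tid, 1, &ip, COMMIT_SYNC) keeps everything in the calling
 * thread.
 */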
int txCommit(tid_t tid,		/* transaction identifier */
	     int nip,		/* number of inodes to commit */
	     struct inode **iplist,	/* list of inode to commit */
	     int flag)
{
	int rc = 0;
	struct commit cd;
	struct jfs_log *log;
	struct tblock *tblk;
	struct lrd *lrd;
	int lsn;
	struct inode *ip;
	struct jfs_inode_info *jfs_ip;
	int k, n;
	ino_t top;
	struct super_block *sb;

	jfs_info("txCommit, tid = %d, flag = %d", tid, flag);
	/* is read-only file system ? */
	if (isReadOnly(iplist[0])) {
		rc = -EROFS;
		goto TheEnd;
	}

	sb = cd.sb = iplist[0]->i_sb;
	cd.tid = tid;

	if (tid == 0)
		tid = txBegin(sb, 0);
	tblk = tid_to_tblock(tid);

	/*
	 * initialize commit structure
	 */
	log = JFS_SBI(sb)->log;
	cd.log = log;

	/* initialize log record descriptor in commit */
	lrd = &cd.lrd;
	lrd->logtid = cpu_to_le32(tblk->logtid);
	lrd->backchain = 0;

	tblk->xflag |= flag;

	if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0)
		tblk->xflag |= COMMIT_LAZY;
	/*
	 *      prepare non-journaled objects for commit
	 *
	 * flush data pages of non-journaled file
	 * to prevent the file from getting non-initialized disk blocks
	 * in case of crash.
	 * (new blocks - )
	 */
	cd.iplist = iplist;
	cd.nip = nip;

	/*
	 *      acquire transaction lock on (on-disk) inodes
	 *
	 * update on-disk inode from in-memory inode
	 * acquiring transaction locks for AFTER records
	 * on the on-disk inode of file object
	 *
	 * sort the inodes array by inode number in descending order
	 * to prevent deadlock when acquiring transaction lock
	 * of on-disk inodes on multiple on-disk inode pages by
	 * multiple concurrent transactions
	 */
	for (k = 0; k < cd.nip; k++) {
		top = (cd.iplist[k])->i_ino;
		for (n = k + 1; n < cd.nip; n++) {
			ip = cd.iplist[n];
			if (ip->i_ino > top) {
				top = ip->i_ino;
				cd.iplist[n] = cd.iplist[k];
				cd.iplist[k] = ip;
			}
		}

		ip = cd.iplist[k];
		jfs_ip = JFS_IP(ip);

		/*
		 * BUGBUG - This code has temporarily been removed.  The
		 * intent is to ensure that any file data is written before
		 * the metadata is committed to the journal.  This prevents
		 * uninitialized data from appearing in a file after the
		 * journal has been replayed.  (The uninitialized data
		 * could be sensitive data removed by another user.)
		 *
		 * The problem now is that we are holding the IWRITELOCK
		 * on the inode, and calling filemap_fdatawrite on an
		 * unmapped page will cause a deadlock in jfs_get_block.
		 *
		 * The long term solution is to pare down the use of
		 * IWRITELOCK.  We are currently holding it too long.
		 * We could also be smarter about which data pages need
		 * to be written before the transaction is committed and
		 * when we don't need to worry about it at all.
		 *
		 * if ((!S_ISDIR(ip->i_mode))
		 *    && (tblk->flag & COMMIT_DELETE) == 0) {
		 *	filemap_fdatawrite(ip->i_mapping);
		 *	filemap_fdatawait(ip->i_mapping);
		 * }
		 */

		/*
		 * Mark inode as not dirty.  It will still be on the dirty
		 * inode list, but we'll know not to commit it again unless
		 * it gets marked dirty again
		 */
		clear_cflag(COMMIT_Dirty, ip);

		/* inherit anonymous tlock(s) of inode */
		if (jfs_ip->atlhead) {
			lid_to_tlock(jfs_ip->atltail)->next = tblk->next;
			tblk->next = jfs_ip->atlhead;
			if (!tblk->last)
				tblk->last = jfs_ip->atltail;
			jfs_ip->atlhead = jfs_ip->atltail = 0;
			TXN_LOCK();
			list_del_init(&jfs_ip->anon_inode_list);
			TXN_UNLOCK();
		}

		/*
		 * acquire transaction lock on on-disk inode page
		 * (become first tlock of the tblk's tlock list)
		 */
		if (((rc = diWrite(tid, ip))))
			goto out;
	}

	/*
	 *      write log records from transaction locks
	 *
	 * txUpdateMap() resets XAD_NEW in XAD.
	 */
	if ((rc = txLog(log, tblk, &cd)))
		goto TheEnd;

	/*
	 * Ensure that inode isn't reused before
	 * lazy commit thread finishes processing
	 */
	if (tblk->xflag & COMMIT_DELETE) {
		atomic_inc(&tblk->u.ip->i_count);
		/*
		 * Avoid a rare deadlock
		 *
		 * If the inode is locked, we may be blocked in
		 * jfs_commit_inode.  If so, we don't want the
		 * lazy_commit thread doing the last iput() on the inode
		 * since that may block on the locked inode.  Instead,
		 * commit the transaction synchronously, so the last iput
		 * will be done by the calling thread (or later)
		 */
		if (tblk->u.ip->i_state & I_LOCK)
			tblk->xflag &= ~COMMIT_LAZY;
	}

	ASSERT((!(tblk->xflag & COMMIT_DELETE)) ||
	       ((tblk->u.ip->i_nlink == 0) &&
		!test_cflag(COMMIT_Nolink, tblk->u.ip)));

	/*
	 *      write COMMIT log record
	 */
	lrd->type = cpu_to_le16(LOG_COMMIT);
	lrd->length = 0;
	lsn = lmLog(log, tblk, lrd, NULL);

	lmGroupCommit(log, tblk);

	/*
	 *      - transaction is now committed -
	 */

	/*
	 * force pages in careful update
	 * (imap addressing structure update)
	 */
	if (flag & COMMIT_FORCE)
		txForce(tblk);

	/*
	 *      update allocation map.
	 *
	 * update inode allocation map and inode:
	 * free pager lock on memory object of inode if any.
	 * update block allocation map.
	 *
	 * txUpdateMap() resets XAD_NEW in XAD.
	 */
	if (tblk->xflag & COMMIT_FORCE)
		txUpdateMap(tblk);

	/*
	 *      free transaction locks and pageout/free pages
	 */
	txRelease(tblk);

	if ((tblk->flag & tblkGC_LAZY) == 0)
		txUnlock(tblk);


	/*
	 *      reset in-memory object state
	 */
	for (k = 0; k < cd.nip; k++) {
		ip = cd.iplist[k];
		jfs_ip = JFS_IP(ip);

		/*
		 * reset in-memory inode state
		 */
		jfs_ip->bxflag = 0;
		jfs_ip->blid = 0;
	}

      out:
	if (rc != 0)
		txAbort(tid, 1);

      TheEnd:
	jfs_info("txCommit: tid = %d, returning %d", tid, rc);
	return rc;
}

/*
 * NAME:        txLog()
 *
 * FUNCTION:    Writes AFTER log records for all lines modified
 *              by tid for segments specified by inodes in comdata.
 *              Code assumes only WRITELOCKS are recorded in lockwords.
 *
 * PARAMETERS:
 *
 * RETURN :
 */
static int txLog(struct jfs_log * log, struct tblock * tblk, struct commit * cd)
{
	int rc = 0;
	struct inode *ip;
	lid_t lid;
	struct tlock *tlck;
	struct lrd *lrd = &cd->lrd;

	/*
	 * write log record(s) for each tlock of transaction,
	 */
	for (lid = tblk->next; lid; lid = tlck->next) {
		tlck = lid_to_tlock(lid);

		tlck->flag |= tlckLOG;

		/* initialize lrd common */
		ip = tlck->ip;
		lrd->aggregate = cpu_to_le32(JFS_SBI(ip->i_sb)->aggregate);
		lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset);
		lrd->log.redopage.inode = cpu_to_le32(ip->i_ino);

		/* write log record of page from the tlock */
		switch (tlck->type & tlckTYPE) {
		case tlckXTREE:
			xtLog(log, tblk, lrd, tlck);
			break;

		case tlckDTREE:
			dtLog(log, tblk, lrd, tlck);
			break;

		case tlckINODE:
			diLog(log, tblk, lrd, tlck, cd);
			break;

		case tlckMAP:
			mapLog(log, tblk, lrd, tlck);
			break;

		case tlckDATA:
			dataLog(log, tblk, lrd, tlck);
			break;

		default:
			jfs_err("UFO tlock:0x%p", tlck);
		}
	}

	return rc;
}

/*
 *      diLog()
 *
 * function:    log inode tlock and format maplock to update bmap;
 */
static int diLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	  struct tlock * tlck, struct commit * cd)
{
	int rc = 0;
	struct metapage *mp;
	pxd_t *pxd;
	struct pxd_lock *pxdlock;

	mp = tlck->mp;

	/* initialize as REDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_INODE);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE);

	pxd = &lrd->log.redopage.pxd;

	/*
	 *      inode after image
	 */
	if (tlck->type & tlckENTRY) {
		/* log after-image for logredo(): */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(pxd, mp->index);
		PXDlength(pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
	} else if (tlck->type & tlckFREE) {
		/*
		 *      free inode extent
		 *
		 * (pages of the freed inode extent have been invalidated and
		 * a maplock for free of the extent has been formatted at
		 * txLock() time);
		 *
		 * the tlock had been acquired on the inode allocation map page
		 * (iag) that specifies the freed extent, even though the map
		 * page is not itself logged, to prevent pageout of the map
		 * page before the log;
		 */

		/* log LOG_NOREDOINOEXT of the freed inode extent for
		 * logredo() to start NoRedoPage filters, and to update
		 * imap and bmap for free of the extent;
		 */
		lrd->type = cpu_to_le16(LOG_NOREDOINOEXT);
		/*
		 * For the LOG_NOREDOINOEXT record, we need
		 * to pass the IAG number and inode extent
		 * index (within that IAG) from which the
		 * extent is being released.  These have been
		 * passed to us in the iplist[1] and iplist[2].
		 */
		lrd->log.noredoinoext.iagnum =
		    cpu_to_le32((u32) (size_t) cd->iplist[1]);
		lrd->log.noredoinoext.inoext_idx =
		    cpu_to_le32((u32) (size_t) cd->iplist[2]);

		pxdlock = (struct pxd_lock *) & tlck->lock;
		*pxd = pxdlock->pxd;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));

		/* update bmap */
		tlck->flag |= tlckUPDATEMAP;

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
	} else
		jfs_err("diLog: UFO type tlck:0x%p", tlck);
#ifdef  _JFS_WIP
	/*
	 *      alloc/free external EA extent
	 *
	 * a maplock for txUpdateMap() to update bPWMAP for alloc/free
	 * of the extent has been formatted at txLock() time;
	 */
	else {
		assert(tlck->type & tlckEA);

		/* log LOG_UPDATEMAP for logredo() to update bmap for
		 * alloc of new (and free of old) external EA extent;
		 */
		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
		pxdlock = (struct pxd_lock *) & tlck->lock;
		nlock = pxdlock->index;
		for (i = 0; i < nlock; i++, pxdlock++) {
			if (pxdlock->flag & mlckALLOCPXD)
				lrd->log.updatemap.type =
				    cpu_to_le16(LOG_ALLOCPXD);
			else
				lrd->log.updatemap.type =
				    cpu_to_le16(LOG_FREEPXD);
			lrd->log.updatemap.nxd = cpu_to_le16(1);
			lrd->log.updatemap.pxd = pxdlock->pxd;
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
		}

		/* update bmap */
		tlck->flag |= tlckUPDATEMAP;
	}
#endif				/* _JFS_WIP */

	return rc;
}

/*
 *      dataLog()
 *
 * function:    log data tlock
 */
static int dataLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	    struct tlock * tlck)
{
	struct metapage *mp;
	pxd_t *pxd;

	mp = tlck->mp;

	/* initialize as REDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_DATA);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE);

	pxd = &lrd->log.redopage.pxd;

	/* log after-image for logredo(): */
	lrd->type = cpu_to_le16(LOG_REDOPAGE);

	if (jfs_dirtable_inline(tlck->ip)) {
		/*
		 * The table has been truncated; we must have deleted
		 * the last entry, so don't bother logging this
		 */
		mp->lid = 0;
		grab_metapage(mp);
		metapage_homeok(mp);
		discard_metapage(mp);
		tlck->mp = NULL;
		return 0;
	}

	PXDaddress(pxd, mp->index);
	PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits);

	lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

	/* mark page as homeward bound */
	tlck->flag |= tlckWRITEPAGE;

	return 0;
}

/*
 *      dtLog()
 *
 * function:    log dtree tlock and format maplock to update bmap;
 */
static void dtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	   struct tlock * tlck)
{
	struct metapage *mp;
	struct pxd_lock *pxdlock;
	pxd_t *pxd;

	mp = tlck->mp;

	/* initialize as REDOPAGE/NOREDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_DTREE);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE);

	pxd = &lrd->log.redopage.pxd;

	if (tlck->type & tlckBTROOT)
		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);

	/*
	 *      page extension via relocation: entry insertion;
	 *      page extension in-place: entry insertion;
	 *      new right page from page split, reinitialized in-line
	 *      root from root page split: entry insertion;
	 */
	if (tlck->type & (tlckNEW | tlckEXTEND)) {
		/* log after-image of the new page for logredo():
		 * mark log (LOG_NEW) for logredo() to initialize
		 * freelist and update bmap for alloc of the new page;
		 */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		if (tlck->type & tlckEXTEND)
			lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND);
		else
			lrd->log.redopage.type |= cpu_to_le16(LOG_NEW);
		PXDaddress(pxd, mp->index);
		PXDlength(pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* format a maplock for txUpdateMap() to update bPMAP for
		 * alloc of the new page;
		 */
		if (tlck->type & tlckBTROOT)
			return;
		tlck->flag |= tlckUPDATEMAP;
		pxdlock = (struct pxd_lock *) & tlck->lock;
		pxdlock->flag = mlckALLOCPXD;
		pxdlock->pxd = *pxd;

		pxdlock->index = 1;

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
		return;
	}

	/*
	 *      entry insertion/deletion,
	 *      sibling page link update (old right page before split);
	 */
	if (tlck->type & (tlckENTRY | tlckRELINK)) {
		/* log after-image for logredo(): */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(pxd, mp->index);
		PXDlength(pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;
		return;
	}

	/*
	 *      page deletion: page has been invalidated
	 *      page relocation: source extent
	 *
	 *      a maplock for free of the page has been formatted
	 *      at txLock() time);
	 */
	if (tlck->type & (tlckFREE | tlckRELOCATE)) {
		/* log LOG_NOREDOPAGE of the deleted page for logredo()
		 * to start NoRedoPage filter and to update bmap for free
		 * of the deleted page
		 */
		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
		pxdlock = (struct pxd_lock *) & tlck->lock;
		*pxd = pxdlock->pxd;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));

		/* a maplock for txUpdateMap() for free of the page
		 * has been formatted at txLock() time;
		 */
		tlck->flag |= tlckUPDATEMAP;
	}
	return;
}

/*
 *      xtLog()
 *
 * function:    log xtree tlock and format maplock to update bmap;
 */
static void xtLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
	   struct tlock * tlck)
{
	struct inode *ip;
	struct metapage *mp;
	xtpage_t *p;
	struct xtlock *xtlck;
	struct maplock *maplock;
	struct xdlistlock *xadlock;
	struct pxd_lock *pxdlock;
	pxd_t *page_pxd;
	int next, lwm, hwm;

	ip = tlck->ip;
	mp = tlck->mp;

	/* initialize as REDOPAGE/NOREDOPAGE record format */
	lrd->log.redopage.type = cpu_to_le16(LOG_XTREE);
	lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE);

	page_pxd = &lrd->log.redopage.pxd;

	if (tlck->type & tlckBTROOT) {
		lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT);
		p = &JFS_IP(ip)->i_xtroot;
		if (S_ISDIR(ip->i_mode))
			lrd->log.redopage.type |=
			    cpu_to_le16(LOG_DIR_XTREE);
	} else
		p = (xtpage_t *) mp->data;
	next = le16_to_cpu(p->header.nextindex);

	xtlck = (struct xtlock *) & tlck->lock;

	maplock = (struct maplock *) & tlck->lock;
	xadlock = (struct xdlistlock *) maplock;

	/*
	 *      entry insertion/extension;
	 *      sibling page link update (old right page before split);
	 */
	if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) {
		/* log after-image for logredo():
		 * logredo() will update bmap for alloc of new/extended
		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
		 * after-image of XADlist;
		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
		 * applying the after-image to the meta-data page.
		 */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(page_pxd, mp->index);
		PXDlength(page_pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* format a maplock for txUpdateMap() to update bPMAP
		 * for alloc of new/extended extents of XAD[lwm:next)
		 * from the page itself;
		 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
		 */
		lwm = xtlck->lwm.offset;
		if (lwm == 0)
			lwm = XTPAGEMAXSLOT;

		if (lwm == next)
			goto out;
		if (lwm > next) {
			jfs_err("xtLog: lwm > next");
			goto out;
		}
		tlck->flag |= tlckUPDATEMAP;
		xadlock->flag = mlckALLOCXADLIST;
		xadlock->count = next - lwm;
		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
			int i;
			pxd_t *pxd;
			/*
			 * Lazy commit may allow xtree to be modified before
			 * txUpdateMap runs.  Copy xad into linelock to
			 * preserve correct data.
			 *
			 * We can fit twice as many pxd's as xads in the lock
			 */
			xadlock->flag = mlckALLOCPXDLIST;
			pxd = xadlock->xdlist = &xtlck->pxdlock;
			for (i = 0; i < xadlock->count; i++) {
				PXDaddress(pxd, addressXAD(&p->xad[lwm + i]));
				PXDlength(pxd, lengthXAD(&p->xad[lwm + i]));
				p->xad[lwm + i].flag &=
				    ~(XAD_NEW | XAD_EXTENDED);
				pxd++;
			}
		} else {
			/*
			 * xdlist will point into the inode's xtree, ensure
			 * that transaction is not committed lazily.
			 */
			xadlock->flag = mlckALLOCXADLIST;
			xadlock->xdlist = &p->xad[lwm];
			tblk->xflag &= ~COMMIT_LAZY;
		}
		jfs_info("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d "
			 "count:%d", tlck->ip, mp, tlck, lwm, xadlock->count);

		maplock->index = 1;

	      out:
		/* mark page as homeward bound */
		tlck->flag |= tlckWRITEPAGE;

		return;
	}

	/*
	 *      page deletion: file deletion/truncation (ref. xtTruncate())
	 *
	 * (page will be invalidated after log is written and bmap
	 * is updated from the page);
	 */
	if (tlck->type & tlckFREE) {
1805 		 * if page free from file delete, NoRedoFile filter from
1806 		 * inode image of zero link count will subsume NoRedoPage
1807 		 * filters for each page;
1808 		 * if page free from file truncattion, write NoRedoPage
1809 		 * filter;
1810 		 *
1811 		 * upadte of block allocation map for the page itself:
1812 		 * if page free from deletion and truncation, LOG_UPDATEMAP
1813 		 * log for the page itself is generated from processing
1814 		 * its parent page xad entries;
1815 		 */
		/* if page free from file truncation, log LOG_NOREDOPAGE
		 * of the deleted page for logredo() to start NoRedoPage
		 * filter for the page;
		 */
		if (tblk->xflag & COMMIT_TRUNCATE) {
			/* write NOREDOPAGE for the page */
			lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
			PXDaddress(page_pxd, mp->index);
			PXDlength(page_pxd,
				  mp->logical_size >> tblk->sb->
				  s_blocksize_bits);
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));

			if (tlck->type & tlckBTROOT) {
				/* Empty xtree must be logged */
				lrd->type = cpu_to_le16(LOG_REDOPAGE);
				lrd->backchain =
				    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
			}
		}

		/* init LOG_UPDATEMAP of the freed extents
		 * XAD[XTENTRYSTART:hwm) from the deleted page itself
		 * for logredo() to update bmap;
		 */
		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST);
		xtlck = (struct xtlock *) & tlck->lock;
		hwm = xtlck->hwm.offset;
		lrd->log.updatemap.nxd =
		    cpu_to_le16(hwm - XTENTRYSTART + 1);
		/* reformat linelock for lmLog() */
		xtlck->header.offset = XTENTRYSTART;
		xtlck->header.length = hwm - XTENTRYSTART + 1;
		xtlck->index = 1;
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/* format a maplock for txUpdateMap() to update bmap
		 * to free extents of XAD[XTENTRYSTART:hwm) from the
		 * deleted page itself;
		 */
		tlck->flag |= tlckUPDATEMAP;
		xadlock->count = hwm - XTENTRYSTART + 1;
		if ((xadlock->count <= 4) && (tblk->xflag & COMMIT_LAZY)) {
			int i;
			pxd_t *pxd;
			/*
			 * Lazy commit may allow xtree to be modified before
			 * txUpdateMap runs.  Copy xad into linelock to
			 * preserve correct data.
			 *
			 * We can fit twice as many pxd's as xads in the lock
			 */
			xadlock->flag = mlckFREEPXDLIST;
			pxd = xadlock->xdlist = &xtlck->pxdlock;
			for (i = 0; i < xadlock->count; i++) {
				PXDaddress(pxd,
					addressXAD(&p->xad[XTENTRYSTART + i]));
				PXDlength(pxd,
					lengthXAD(&p->xad[XTENTRYSTART + i]));
				pxd++;
			}
		} else {
			/*
			 * xdlist will point into the inode's xtree, ensure
			 * that transaction is not committed lazily.
			 */
			xadlock->flag = mlckFREEXADLIST;
			xadlock->xdlist = &p->xad[XTENTRYSTART];
			tblk->xflag &= ~COMMIT_LAZY;
		}
		jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2",
			 tlck->ip, mp, xadlock->count);

		maplock->index = 1;

		/* mark page as invalid */
		if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode))
		    && !(tlck->type & tlckBTROOT))
			tlck->flag |= tlckFREEPAGE;
		/*
		   else (tblk->xflag & COMMIT_PMAP)
		   ? release the page;
		 */
		return;
	}

	/*
	 *      page/entry truncation: file truncation (ref. xtTruncate())
	 *
	 *     |----------+------+------+---------------|
	 *                |      |      |
	 *                |      |     hwm - hwm before truncation
	 *                |     next - truncation point
	 *               lwm - lwm before truncation
	 * header ?
	 */
	if (tlck->type & tlckTRUNCATE) {
		pxd_t pxd;	/* truncated extent of xad */
		int twm;

		/*
		 * For truncation the entire linelock may be used, so it would
		 * be difficult to store xad list in linelock itself.
		 * Therefore, we'll just force transaction to be committed
		 * synchronously, so that xtree pages won't be changed before
		 * txUpdateMap runs.
		 */
		tblk->xflag &= ~COMMIT_LAZY;
		lwm = xtlck->lwm.offset;
		if (lwm == 0)
			lwm = XTPAGEMAXSLOT;
		hwm = xtlck->hwm.offset;
		twm = xtlck->twm.offset;

		/*
		 *      write log records
		 */
		/* log after-image for logredo():
		 *
		 * logredo() will update bmap for alloc of new/extended
		 * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from
		 * after-image of XADlist;
		 * logredo() resets (XAD_NEW|XAD_EXTEND) flag when
		 * applying the after-image to the meta-data page.
		 */
		lrd->type = cpu_to_le16(LOG_REDOPAGE);
		PXDaddress(page_pxd, mp->index);
		PXDlength(page_pxd,
			  mp->logical_size >> tblk->sb->s_blocksize_bits);
		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck));

		/*
		 * truncate entry XAD[twm == next - 1]:
		 */
		if (twm == next - 1) {
			/* init LOG_UPDATEMAP for logredo() to update bmap for
			 * free of truncated delta extent of the truncated
			 * entry XAD[next - 1]:
			 * (xtlck->pxdlock = truncated delta extent);
			 */
			pxdlock = (struct pxd_lock *) & xtlck->pxdlock;
			/* assert(pxdlock->type & tlckTRUNCATE); */
			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
			lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
			lrd->log.updatemap.nxd = cpu_to_le16(1);
			lrd->log.updatemap.pxd = pxdlock->pxd;
			pxd = pxdlock->pxd;	/* save to format maplock */
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
		}

		/*
		 * free entries XAD[next:hwm]:
		 */
		if (hwm >= next) {
			/* init LOG_UPDATEMAP of the freed extents
			 * XAD[next:hwm] from the deleted page itself
			 * for logredo() to update bmap;
			 */
			lrd->type = cpu_to_le16(LOG_UPDATEMAP);
			lrd->log.updatemap.type =
			    cpu_to_le16(LOG_FREEXADLIST);
			xtlck = (struct xtlock *) & tlck->lock;
			hwm = xtlck->hwm.offset;
			lrd->log.updatemap.nxd =
			    cpu_to_le16(hwm - next + 1);
			/* reformat linelock for lmLog() */
			xtlck->header.offset = next;
			xtlck->header.length = hwm - next + 1;
			xtlck->index = 1;
			lrd->backchain =
			    cpu_to_le32(lmLog(log, tblk, lrd, tlck));
		}

		/*
		 *      format maplock(s) for txUpdateMap() to update bmap
		 */
		maplock->index = 0;

		/*
		 * allocate entries XAD[lwm:next):
		 */
		if (lwm < next) {
			/* format a maplock for txUpdateMap() to update bPMAP
			 * for alloc of new/extended extents of XAD[lwm:next)
			 * from the page itself;
			 * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag.
			 */
			tlck->flag |= tlckUPDATEMAP;
			xadlock->flag = mlckALLOCXADLIST;
			xadlock->count = next - lwm;
			xadlock->xdlist = &p->xad[lwm];

			jfs_info("xtLog: alloc ip:0x%p mp:0x%p count:%d "
				 "lwm:%d next:%d",
				 tlck->ip, mp, xadlock->count, lwm, next);
			maplock->index++;
			xadlock++;
		}

		/*
		 * truncate entry XAD[twm == next - 1]:
		 */
		if (twm == next - 1) {
			struct pxd_lock *pxdlock;

			/* format a maplock for txUpdateMap() to update bmap
			 * to free truncated delta extent of the truncated
			 * entry XAD[next - 1];
			 * (xtlck->pxdlock = truncated delta extent);
			 */
			tlck->flag |= tlckUPDATEMAP;
			pxdlock = (struct pxd_lock *) xadlock;
			pxdlock->flag = mlckFREEPXD;
			pxdlock->count = 1;
			pxdlock->pxd = pxd;

			jfs_info("xtLog: truncate ip:0x%p mp:0x%p count:%d "
2036 				 "hwm:%d", ip, mp, pxdlock->count, hwm);
2037 			maplock->index++;
2038 			xadlock++;
2039 		}
2040 
2041 		/*
2042 		 * free entries XAD[next:hwm]:
2043 		 */
2044 		if (hwm >= next) {
2045 			/* format a maplock for txUpdateMap() to update bmap
2046 			 * to free extents of XAD[next:hwm] from the deleted
2047 			 * page itself;
2048 			 */
2049 			tlck->flag |= tlckUPDATEMAP;
2050 			xadlock->flag = mlckFREEXADLIST;
2051 			xadlock->count = hwm - next + 1;
2052 			xadlock->xdlist = &p->xad[next];
2053 
2054 			jfs_info("xtLog: free ip:0x%p mp:0x%p count:%d "
2055 				 "next:%d hwm:%d",
2056 				 tlck->ip, mp, xadlock->count, next, hwm);
2057 			maplock->index++;
2058 		}
2059 
2060 		/* mark page as homeward bound */
2061 		tlck->flag |= tlckWRITEPAGE;
2062 	}
2063 	return;
2064 }
2065 
2066 /*
2067  *      mapLog()
2068  *
2069  * function:    log the freed data extents recorded in the maplock;
2070  */
2071 void mapLog(struct jfs_log * log, struct tblock * tblk, struct lrd * lrd,
2072 	    struct tlock * tlck)
2073 {
2074 	struct pxd_lock *pxdlock;
2075 	int i, nlock;
2076 	pxd_t *pxd;
2077 
2078 	/*
2079 	 *      page relocation: free the source page extent
2080 	 *
2081 	 * a maplock for txUpdateMap() for free of the page
2082 	 * has been formatted at txLock() time saving the src
2083 	 * relocated page address;
2084 	 */
2085 	if (tlck->type & tlckRELOCATE) {
2086 		/* log LOG_NOREDOPAGE of the old relocated page
2087 		 * for logredo() to start NoRedoPage filter;
2088 		 */
2089 		lrd->type = cpu_to_le16(LOG_NOREDOPAGE);
2090 		pxdlock = (struct pxd_lock *) & tlck->lock;
2091 		pxd = &lrd->log.redopage.pxd;
2092 		*pxd = pxdlock->pxd;
2093 		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2094 
2095 		/* (N.B. currently, logredo() does NOT update bmap
2096 		 * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE);
2097 		 * if page free from relocation, LOG_UPDATEMAP log is
2098 		 * specifically generated now for logredo()
2099 		 * to update bmap for free of src relocated page;
2100 		 * (new flag LOG_RELOCATE may be introduced which will
2101 		 * inform logredo() to start NORedoPage filter and also
2102 		 * update block allocation map at the same time, thus
2103 		 * avoiding an extra log write);
2104 		 */
2105 		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2106 		lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD);
2107 		lrd->log.updatemap.nxd = cpu_to_le16(1);
2108 		lrd->log.updatemap.pxd = pxdlock->pxd;
2109 		lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2110 
2111 		/* a maplock for txUpdateMap() for free of the page
2112 		 * has been formatted at txLock() time;
2113 		 */
2114 		tlck->flag |= tlckUPDATEMAP;
2115 		return;
2116 	}
2117 	/*
2118 	 * Otherwise it's not a relocate request
2119 	 */
2122 	else {
2123 		/* log LOG_UPDATEMAP for logredo() to update bmap for
2124 		 * free of truncated/relocated delta extent of the data;
2125 		 * e.g.: external EA extent, relocated/truncated extent
2126 		 * from xtTailgate();
2127 		 */
2128 		lrd->type = cpu_to_le16(LOG_UPDATEMAP);
2129 		pxdlock = (struct pxd_lock *) & tlck->lock;
2130 		nlock = pxdlock->index;
2131 		for (i = 0; i < nlock; i++, pxdlock++) {
2132 			if (pxdlock->flag & mlckALLOCPXD)
2133 				lrd->log.updatemap.type =
2134 				    cpu_to_le16(LOG_ALLOCPXD);
2135 			else
2136 				lrd->log.updatemap.type =
2137 				    cpu_to_le16(LOG_FREEPXD);
2138 			lrd->log.updatemap.nxd = cpu_to_le16(1);
2139 			lrd->log.updatemap.pxd = pxdlock->pxd;
2140 			lrd->backchain =
2141 			    cpu_to_le32(lmLog(log, tblk, lrd, NULL));
2142 			jfs_info("mapLog: xaddr:0x%lx xlen:0x%x",
2143 				 (ulong) addressPXD(&pxdlock->pxd),
2144 				 lengthPXD(&pxdlock->pxd));
2145 		}
2146 
2147 		/* update bmap */
2148 		tlck->flag |= tlckUPDATEMAP;
2149 	}
2150 }
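
/*
 * A minimal sketch (hypothetical caller state, mirroring txEA() below):
 * the non-relocate path above expects one or more pxd_locks to have been
 * formatted into tlck->lock at txMaplock() time, e.g.
 *
 *	pxdlock = (struct pxd_lock *) &tlck->lock;
 *	pxdlock->flag = mlckFREEPXD;
 *	PXDaddress(&pxdlock->pxd, xaddr);
 *	PXDlength(&pxdlock->pxd, xlen);
 *	pxdlock->index = 1;
 *
 * mapLog() then emits one LOG_UPDATEMAP record per entry and flags the
 * tlock with tlckUPDATEMAP so that txUpdateMap() frees the extents.
 */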
2151 
2152 /*
2153  *      txEA()
2154  *
2155  * function:    acquire maplock for EA/ACL extents or
2156  *              set COMMIT_INLINE flag;
2157  */
2158 void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea)
2159 {
2160 	struct tlock *tlck = NULL;
2161 	struct pxd_lock *maplock = NULL, *pxdlock = NULL;
2162 
2163 	/*
2164 	 * format maplock for alloc of new EA extent
2165 	 */
2166 	if (newea) {
2167 		/* Since the newea could be a completely zeroed entry, we need to
2168 		 * check for the two flags which indicate we should actually
2169 		 * commit new EA data
2170 		 */
2171 		if (newea->flag & DXD_EXTENT) {
2172 			tlck = txMaplock(tid, ip, tlckMAP);
2173 			maplock = (struct pxd_lock *) & tlck->lock;
2174 			pxdlock = (struct pxd_lock *) maplock;
2175 			pxdlock->flag = mlckALLOCPXD;
2176 			PXDaddress(&pxdlock->pxd, addressDXD(newea));
2177 			PXDlength(&pxdlock->pxd, lengthDXD(newea));
2178 			pxdlock++;
2179 			maplock->index = 1;
2180 		} else if (newea->flag & DXD_INLINE) {
2181 			tlck = NULL;
2182 
2183 			set_cflag(COMMIT_Inlineea, ip);
2184 		}
2185 	}
2186 
2187 	/*
2188 	 * format maplock for free of old EA extent
2189 	 */
2190 	if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) {
2191 		if (tlck == NULL) {
2192 			tlck = txMaplock(tid, ip, tlckMAP);
2193 			maplock = (struct pxd_lock *) & tlck->lock;
2194 			pxdlock = (struct pxd_lock *) maplock;
2195 			maplock->index = 0;
2196 		}
2197 		pxdlock->flag = mlckFREEPXD;
2198 		PXDaddress(&pxdlock->pxd, addressDXD(oldea));
2199 		PXDlength(&pxdlock->pxd, lengthDXD(oldea));
2200 		maplock->index++;
2201 	}
2202 }
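
/*
 * Usage sketch (hypothetical dxd_t values; the real callers live in the
 * EA code): replacing an external EA extent with a new one records both
 * map updates in a single transaction:
 *
 *	txEA(tid, ip, &old_dxd, &new_dxd);
 *
 * With new_dxd.flag & DXD_EXTENT the new extent is queued for pmap
 * allocation; with old_dxd.flag & DXD_EXTENT the old extent is queued to
 * be freed; both are applied later by txUpdateMap().
 */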
2203 
2204 /*
2205  *      txForce()
2206  *
2207  * function: synchronously write pages locked by transaction
2208  *              after txLog() but before txUpdateMap();
2209  */
2210 void txForce(struct tblock * tblk)
2211 {
2212 	struct tlock *tlck;
2213 	lid_t lid, next;
2214 	struct metapage *mp;
2215 
2216 	/*
2217 	 * reverse the order of transaction tlocks in
2218 	 * careful update order of address index pages
2219 	 * (right to left, bottom up)
2220 	 */
2221 	tlck = lid_to_tlock(tblk->next);
2222 	lid = tlck->next;
2223 	tlck->next = 0;
2224 	while (lid) {
2225 		tlck = lid_to_tlock(lid);
2226 		next = tlck->next;
2227 		tlck->next = tblk->next;
2228 		tblk->next = lid;
2229 		lid = next;
2230 	}
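
	/*
	 * e.g. (illustration only): a tlock list A -> B -> C -> 0 hanging
	 * off tblk->next comes out of the loop above as C -> B -> A -> 0,
	 * so the writes below proceed in reverse (careful-update) order.
	 */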
2231 
2232 	/*
2233 	 * synchronously write the page, and
2234 	 * hold the page for txUpdateMap();
2235 	 */
2236 	for (lid = tblk->next; lid; lid = next) {
2237 		tlck = lid_to_tlock(lid);
2238 		next = tlck->next;
2239 
2240 		if ((mp = tlck->mp) != NULL &&
2241 		    (tlck->type & tlckBTROOT) == 0) {
2242 			assert(mp->xflag & COMMIT_PAGE);
2243 
2244 			if (tlck->flag & tlckWRITEPAGE) {
2245 				tlck->flag &= ~tlckWRITEPAGE;
2246 
2247 				/* do not release page to freelist */
2248 				force_metapage(mp);
2249 #if 0
2250 				/*
2251 				 * The "right" thing to do here is to
2252 				 * synchronously write the metadata.
2253 				 * With the current implementation this
2254 				 * is hard since write_metapage requires
2255 				 * us to kunmap & remap the page.  If we
2256 				 * have tlocks pointing into the metadata
2257 				 * pages, we don't want to do this.  I think
2258 				 * we can get by with synchronously writing
2259 				 * the pages when they are released.
2260 				 */
2261 				assert(mp->nohomeok);
2262 				set_bit(META_dirty, &mp->flag);
2263 				set_bit(META_sync, &mp->flag);
2264 #endif
2265 			}
2266 		}
2267 	}
2268 }
2269 
2270 /*
2271  *      txUpdateMap()
2272  *
2273  * function:    update persistent allocation map (and working map
2274  *              if appropriate);
2275  *
2276  * parameter:
2277  */
2278 static void txUpdateMap(struct tblock * tblk)
2279 {
2280 	struct inode *ip;
2281 	struct inode *ipimap;
2282 	lid_t lid;
2283 	struct tlock *tlck;
2284 	struct maplock *maplock;
2285 	struct pxd_lock pxdlock;
2286 	int maptype;
2287 	int k, nlock;
2288 	struct metapage *mp = NULL;
2289 
2290 	ipimap = JFS_SBI(tblk->sb)->ipimap;
2291 
2292 	maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP;
2293 
2295 	/*
2296 	 *      update block allocation map
2297 	 *
2298 	 * update allocation state in pmap (and wmap) and
2299 	 * update lsn of the pmap page;
2300 	 */
2301 	/*
2302 	 * scan each tlock/page of transaction for block allocation/free:
2303 	 *
2304 	 * for each tlock/page of transaction, update map.
2305 	 *  ? are there tlocks for pmap and pwmap at the same time ?
2306 	 */
2307 	for (lid = tblk->next; lid; lid = tlck->next) {
2308 		tlck = lid_to_tlock(lid);
2309 
2310 		if ((tlck->flag & tlckUPDATEMAP) == 0)
2311 			continue;
2312 
2313 		if (tlck->flag & tlckFREEPAGE) {
2314 			/*
2315 			 * Another thread may attempt to reuse freed space
2316 			 * immediately, so we want to get rid of the metapage
2317 			 * before anyone else has a chance to get it.
2318 			 * Lock metapage, update maps, then invalidate
2319 			 * the metapage.
2320 			 */
2321 			mp = tlck->mp;
2322 			ASSERT(mp->xflag & COMMIT_PAGE);
2323 			grab_metapage(mp);
2324 		}
2325 
2326 		/*
2327 		 * extent list:
2328 		 * . in-line PXD list:
2329 		 * . out-of-line XAD list:
2330 		 */
2331 		maplock = (struct maplock *) & tlck->lock;
2332 		nlock = maplock->index;
2333 
2334 		for (k = 0; k < nlock; k++, maplock++) {
2335 			/*
2336 			 * allocate blocks in persistent map:
2337 			 *
2338 			 * blocks have been allocated from wmap at alloc time;
2339 			 */
2340 			if (maplock->flag & mlckALLOC) {
2341 				txAllocPMap(ipimap, maplock, tblk);
2342 			}
2343 			/*
2344 			 * free blocks in persistent and working map:
2345 			 * blocks will be freed in pmap and then in wmap;
2346 			 *
2347 			 * ? tblock specifies the PMAP/PWMAP based upon
2348 			 * transaction
2349 			 *
2350 			 * free blocks in persistent map:
2351 			 * blocks will be freed from wmap at last reference
2352 			 * release of the object for regular files;
2353 			 *
2354 			 * Always free blocks from both persistent & working
2355 			 * maps for directories
2356 			 */
2357 			else {	/* (maplock->flag & mlckFREE) */
2358 
2359 				if (S_ISDIR(tlck->ip->i_mode))
2360 					txFreeMap(ipimap, maplock,
2361 						  tblk, COMMIT_PWMAP);
2362 				else
2363 					txFreeMap(ipimap, maplock,
2364 						  tblk, maptype);
2365 			}
2366 		}
2367 		if (tlck->flag & tlckFREEPAGE) {
2368 			if (!(tblk->flag & tblkGC_LAZY)) {
2369 				/* This is equivalent to txRelease */
2370 				ASSERT(mp->lid == lid);
2371 				tlck->mp->lid = 0;
2372 			}
2373 			assert(mp->nohomeok == 1);
2374 			metapage_homeok(mp);
2375 			discard_metapage(mp);
2376 			tlck->mp = NULL;
2377 		}
2378 	}
2379 	/*
2380 	 *      update inode allocation map
2381 	 *
2382 	 * update allocation state in pmap and
2383 	 * update lsn of the pmap page;
2384 	 * update in-memory inode flag/state
2385 	 *
2386 	 * unlock mapper/write lock
2387 	 */
2388 	if (tblk->xflag & COMMIT_CREATE) {
2389 		diUpdatePMap(ipimap, tblk->ino, FALSE, tblk);
2390 		ipimap->i_state |= I_DIRTY;
2391 		/* update persistent block allocation map
2392 		 * for the allocation of inode extent;
2393 		 */
2394 		pxdlock.flag = mlckALLOCPXD;
2395 		pxdlock.pxd = tblk->u.ixpxd;
2396 		pxdlock.index = 1;
2397 		txAllocPMap(ipimap, (struct maplock *) & pxdlock, tblk);
2398 	} else if (tblk->xflag & COMMIT_DELETE) {
2399 		ip = tblk->u.ip;
2400 		diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk);
2401 		ipimap->i_state |= I_DIRTY;
2402 		iput(ip);
2403 	}
2404 }
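
/*
 * N.B. per the txForce() header above, txUpdateMap() runs only after
 * txLog() has written the transaction's log records (and txForce() has
 * pushed any COMMIT_FORCE pages), so the on-disk allocation maps should
 * never get ahead of the log.
 */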
2405 
2406 /*
2407  *      txAllocPMap()
2408  *
2409  * function: allocate from persistent map;
2410  *
2411  * parameter:
2412  *      ipbmap  -
2413  *      maplock -
2414  *              xad list:
2415  *              pxd:
2416  *
2417  *      maptype -
2418  *              allocate from persistent map;
2419  *              free from persistent map;
2420  *              (e.g., tmp file - free from working map at release
2421  *               of last reference);
2422  *              free from persistent and working map;
2423  *
2424  *      lsn     - log sequence number;
2425  */
2426 static void txAllocPMap(struct inode *ip, struct maplock * maplock,
2427 			struct tblock * tblk)
2428 {
2429 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2430 	struct xdlistlock *xadlistlock;
2431 	xad_t *xad;
2432 	s64 xaddr;
2433 	int xlen;
2434 	struct pxd_lock *pxdlock;
2435 	struct xdlistlock *pxdlistlock;
2436 	pxd_t *pxd;
2437 	int n;
2438 
2439 	/*
2440 	 * allocate from persistent map;
2441 	 */
2442 	if (maplock->flag & mlckALLOCXADLIST) {
2443 		xadlistlock = (struct xdlistlock *) maplock;
2444 		xad = xadlistlock->xdlist;
2445 		for (n = 0; n < xadlistlock->count; n++, xad++) {
2446 			if (xad->flag & (XAD_NEW | XAD_EXTENDED)) {
2447 				xaddr = addressXAD(xad);
2448 				xlen = lengthXAD(xad);
2449 				dbUpdatePMap(ipbmap, FALSE, xaddr,
2450 					     (s64) xlen, tblk);
2451 				xad->flag &= ~(XAD_NEW | XAD_EXTENDED);
2452 				jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2453 					 (ulong) xaddr, xlen);
2454 			}
2455 		}
2456 	} else if (maplock->flag & mlckALLOCPXD) {
2457 		pxdlock = (struct pxd_lock *) maplock;
2458 		xaddr = addressPXD(&pxdlock->pxd);
2459 		xlen = lengthPXD(&pxdlock->pxd);
2460 		dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, tblk);
2461 		jfs_info("allocPMap: xaddr:0x%lx xlen:%d", (ulong) xaddr, xlen);
2462 	} else {		/* (maplock->flag & mlckALLOCPXDLIST) */
2463 
2464 		pxdlistlock = (struct xdlistlock *) maplock;
2465 		pxd = pxdlistlock->xdlist;
2466 		for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2467 			xaddr = addressPXD(pxd);
2468 			xlen = lengthPXD(pxd);
2469 			dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen,
2470 				     tblk);
2471 			jfs_info("allocPMap: xaddr:0x%lx xlen:%d",
2472 				 (ulong) xaddr, xlen);
2473 		}
2474 	}
2475 }
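
/*
 * N.B. in the XADLIST path above, xads that no longer carry
 * XAD_NEW | XAD_EXTENDED are assumed to have been made persistent by an
 * earlier commit, so they are skipped rather than having the same bits
 * set in the pmap twice.
 */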
2476 
2477 /*
2478  *      txFreeMap()
2479  *
2480  * function:    free from persistent and/or working map;
2481  *
2482  * todo: optimization
2483  */
2484 void txFreeMap(struct inode *ip,
2485 	       struct maplock * maplock, struct tblock * tblk, int maptype)
2486 {
2487 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
2488 	struct xdlistlock *xadlistlock;
2489 	xad_t *xad;
2490 	s64 xaddr;
2491 	int xlen;
2492 	struct pxd_lock *pxdlock;
2493 	struct xdlistlock *pxdlistlock;
2494 	pxd_t *pxd;
2495 	int n;
2496 
2497 	jfs_info("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x",
2498 		 tblk, maplock, maptype);
2499 
2500 	/*
2501 	 * free from persistent map;
2502 	 */
2503 	if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) {
2504 		if (maplock->flag & mlckFREEXADLIST) {
2505 			xadlistlock = (struct xdlistlock *) maplock;
2506 			xad = xadlistlock->xdlist;
2507 			for (n = 0; n < xadlistlock->count; n++, xad++) {
2508 				if (!(xad->flag & XAD_NEW)) {
2509 					xaddr = addressXAD(xad);
2510 					xlen = lengthXAD(xad);
2511 					dbUpdatePMap(ipbmap, TRUE, xaddr,
2512 						     (s64) xlen, tblk);
2513 					jfs_info("freePMap: xaddr:0x%lx "
2514 						 "xlen:%d",
2515 						 (ulong) xaddr, xlen);
2516 				}
2517 			}
2518 		} else if (maplock->flag & mlckFREEPXD) {
2519 			pxdlock = (struct pxd_lock *) maplock;
2520 			xaddr = addressPXD(&pxdlock->pxd);
2521 			xlen = lengthPXD(&pxdlock->pxd);
2522 			dbUpdatePMap(ipbmap, TRUE, xaddr, (s64) xlen,
2523 				     tblk);
2524 			jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2525 				 (ulong) xaddr, xlen);
2526 		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
2527 
2528 			pxdlistlock = (struct xdlistlock *) maplock;
2529 			pxd = pxdlistlock->xdlist;
2530 			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2531 				xaddr = addressPXD(pxd);
2532 				xlen = lengthPXD(pxd);
2533 				dbUpdatePMap(ipbmap, TRUE, xaddr,
2534 					     (s64) xlen, tblk);
2535 				jfs_info("freePMap: xaddr:0x%lx xlen:%d",
2536 					 (ulong) xaddr, xlen);
2537 			}
2538 		}
2539 	}
2540 
2541 	/*
2542 	 * free from working map;
2543 	 */
2544 	if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) {
2545 		if (maplock->flag & mlckFREEXADLIST) {
2546 			xadlistlock = (struct xdlistlock *) maplock;
2547 			xad = xadlistlock->xdlist;
2548 			for (n = 0; n < xadlistlock->count; n++, xad++) {
2549 				xaddr = addressXAD(xad);
2550 				xlen = lengthXAD(xad);
2551 				dbFree(ip, xaddr, (s64) xlen);
2552 				xad->flag = 0;
2553 				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2554 					 (ulong) xaddr, xlen);
2555 			}
2556 		} else if (maplock->flag & mlckFREEPXD) {
2557 			pxdlock = (struct pxd_lock *) maplock;
2558 			xaddr = addressPXD(&pxdlock->pxd);
2559 			xlen = lengthPXD(&pxdlock->pxd);
2560 			dbFree(ip, xaddr, (s64) xlen);
2561 			jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2562 				 (ulong) xaddr, xlen);
2563 		} else {	/* (maplock->flag & mlckFREEPXDLIST) */
2564 
2565 			pxdlistlock = (struct xdlistlock *) maplock;
2566 			pxd = pxdlistlock->xdlist;
2567 			for (n = 0; n < pxdlistlock->count; n++, pxd++) {
2568 				xaddr = addressPXD(pxd);
2569 				xlen = lengthPXD(pxd);
2570 				dbFree(ip, xaddr, (s64) xlen);
2571 				jfs_info("freeWMap: xaddr:0x%lx xlen:%d",
2572 					 (ulong) xaddr, xlen);
2573 			}
2574 		}
2575 	}
2576 }
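
/*
 * Usage sketch (hypothetical values; compare the stack pxd_lock built in
 * txUpdateMap() above): freeing a single extent from both the persistent
 * and working maps:
 *
 *	struct pxd_lock pxdlock;
 *
 *	pxdlock.flag = mlckFREEPXD;
 *	PXDaddress(&pxdlock.pxd, xaddr);
 *	PXDlength(&pxdlock.pxd, xlen);
 *	pxdlock.index = 1;
 *	txFreeMap(ip, (struct maplock *) &pxdlock, tblk, COMMIT_PWMAP);
 */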
2577 
2578 /*
2579  *      txFreelock()
2580  *
2581  * function:    remove tlock from inode anonymous locklist
2582  */
2583 void txFreelock(struct inode *ip)
2584 {
2585 	struct jfs_inode_info *jfs_ip = JFS_IP(ip);
2586 	struct tlock *xtlck, *tlck;
2587 	lid_t xlid = 0, lid;
2588 
2589 	if (!jfs_ip->atlhead)
2590 		return;
2591 
2592 	TXN_LOCK();
2593 	xtlck = (struct tlock *) &jfs_ip->atlhead;
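	/*
	 * (The cast above treats &atlhead as a dummy tlock so that
	 * writing xtlck->next in the loop below updates atlhead itself;
	 * this assumes next is the first member of struct tlock.)
	 */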
2594 
2595 	while ((lid = xtlck->next) != 0) {
2596 		tlck = lid_to_tlock(lid);
2597 		if (tlck->flag & tlckFREELOCK) {
2598 			xtlck->next = tlck->next;
2599 			txLockFree(lid);
2600 		} else {
2601 			xtlck = tlck;
2602 			xlid = lid;
2603 		}
2604 	}
2605 
2606 	if (jfs_ip->atlhead)
2607 		jfs_ip->atltail = xlid;
2608 	else {
2609 		jfs_ip->atltail = 0;
2610 		/*
2611 		 * If inode was on anon_list, remove it
2612 		 */
2613 		list_del_init(&jfs_ip->anon_inode_list);
2614 	}
2615 	TXN_UNLOCK();
2616 }
2617 
2618 /*
2619  *      txAbort()
2620  *
2621  * function: abort tx before commit;
2622  *
2623  * frees line-locks and segment locks for all
2624  * segments in the comdata structure.
2625  * Optionally sets the file-system state to FM_DIRTY in the super-block.
2626  * The log age of in-memory page frames for which the caller holds
2627  * tlocks is reset to 0 (to avoid logwrap).
2628  */
2629 void txAbort(tid_t tid, int dirty)
2630 {
2631 	lid_t lid, next;
2632 	struct metapage *mp;
2633 	struct tblock *tblk = tid_to_tblock(tid);
2634 	struct tlock *tlck;
2635 
2636 	/*
2637 	 * free tlocks of the transaction
2638 	 */
2639 	for (lid = tblk->next; lid; lid = next) {
2640 		tlck = lid_to_tlock(lid);
2641 		next = tlck->next;
2642 		mp = tlck->mp;
2643 		JFS_IP(tlck->ip)->xtlid = 0;
2644 
2645 		if (mp) {
2646 			mp->lid = 0;
2647 
2648 			/*
2649 			 * reset lsn of page to avoid logwrap:
2650 			 *
2651 			 * (page may have been previously committed by another
2652 			 * transaction(s) but has not been paged, i.e.,
2653 			 * it may be on logsync list even though it has not
2654 			 * been logged for the current tx.)
2655 			 */
2656 			if (mp->xflag & COMMIT_PAGE && mp->lsn)
2657 				LogSyncRelease(mp);
2658 		}
2659 		/* insert tlock at head of freelist */
2660 		TXN_LOCK();
2661 		txLockFree(lid);
2662 		TXN_UNLOCK();
2663 	}
2664 
2665 	/* caller will free the transaction block */
2666 
2667 	tblk->next = tblk->last = 0;
2668 
2669 	/*
2670 	 * mark filesystem dirty
2671 	 */
2672 	if (dirty)
2673 		jfs_error(tblk->sb, "txAbort");
2674 
2675 	return;
2676 }
2677 
2678 /*
2679  *      txLazyCommit(void)
2680  *
2681  *	All transactions except those changing ipimap (COMMIT_FORCE) are
2682  *	processed by this routine.  This ensures that the inode and block
2683  *	allocation maps are updated in order.  For synchronous transactions,
2684  *	let the user thread finish processing after txUpdateMap() is called.
2685  */
2686 static void txLazyCommit(struct tblock * tblk)
2687 {
2688 	struct jfs_log *log;
2689 
2690 	while (((tblk->flag & tblkGC_READY) == 0) &&
2691 	       ((tblk->flag & tblkGC_UNLOCKED) == 0)) {
2692 		/* We must have gotten ahead of the user thread
2693 		 */
2694 		jfs_info("jfs_lazycommit: tblk 0x%p not unlocked", tblk);
2695 		yield();
2696 	}
2697 
2698 	jfs_info("txLazyCommit: processing tblk 0x%p", tblk);
2699 
2700 	txUpdateMap(tblk);
2701 
2702 	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
2703 
2704 	spin_lock_irq(&log->gclock);	// LOGGC_LOCK
2705 
2706 	tblk->flag |= tblkGC_COMMITTED;
2707 
2708 	if (tblk->flag & tblkGC_READY)
2709 		log->gcrtc--;
2710 
2711 	wake_up_all(&tblk->gcwait);	// LOGGC_WAKEUP
2712 
2713 	/*
2714 	 * Can't release log->gclock until we've tested tblk->flag
2715 	 */
2716 	if (tblk->flag & tblkGC_LAZY) {
2717 		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2718 		txUnlock(tblk);
2719 		tblk->flag &= ~tblkGC_LAZY;
2720 		txEnd(tblk - TxBlock);	/* Convert back to tid */
2721 	} else
2722 		spin_unlock_irq(&log->gclock);	// LOGGC_UNLOCK
2723 
2724 	jfs_info("txLazyCommit: done: tblk = 0x%p", tblk);
2725 }
2726 
2727 /*
2728  *      jfs_lazycommit(void)
2729  *
2730  *	To be run as a kernel daemon.  If lbmIODone is called in an interrupt
2731  *	context, or where blocking is not wanted, this routine will process
2732  *	committed transactions from the unlock queue.
2733  */
2734 int jfs_lazycommit(void *arg)
2735 {
2736 	int WorkDone;
2737 	struct tblock *tblk;
2738 	unsigned long flags;
2739 	struct jfs_sb_info *sbi;
2740 
2741 	daemonize("jfsCommit");
2742 
2743 	complete(&jfsIOwait);
2744 
2745 	do {
2746 		LAZY_LOCK(flags);
2747 		jfs_commit_thread_waking = 0;	/* OK to wake another thread */
2748 		while (!list_empty(&TxAnchor.unlock_queue)) {
2749 			WorkDone = 0;
2750 			list_for_each_entry(tblk, &TxAnchor.unlock_queue,
2751 					    cqueue) {
2752 
2753 				sbi = JFS_SBI(tblk->sb);
2754 				/*
2755 				 * For each volume, the transactions must be
2756 				 * handled in order.  If another commit thread
2757 				 * is handling a tblk for this superblock,
2758 				 * skip it
2759 				 */
2760 				if (sbi->commit_state & IN_LAZYCOMMIT)
2761 					continue;
2762 
2763 				sbi->commit_state |= IN_LAZYCOMMIT;
2764 				WorkDone = 1;
2765 
2766 				/*
2767 				 * Remove transaction from queue
2768 				 */
2769 				list_del(&tblk->cqueue);
2770 
2771 				LAZY_UNLOCK(flags);
2772 				txLazyCommit(tblk);
2773 				LAZY_LOCK(flags);
2774 
2775 				sbi->commit_state &= ~IN_LAZYCOMMIT;
2776 				/*
2777 				 * Don't continue in the for loop.  (We can't
2778 				 * anyway, it's unsafe!)  We want to go back to
2779 				 * the beginning of the list.
2780 				 */
2781 				break;
2782 			}
2783 
2784 			/* If there was nothing to do, don't continue */
2785 			if (!WorkDone)
2786 				break;
2787 		}
2788 		/* In case a wakeup came while all threads were active */
2789 		jfs_commit_thread_waking = 0;
2790 
2791 		if (current->flags & PF_FREEZE) {
2792 			LAZY_UNLOCK(flags);
2793 			refrigerator(PF_FREEZE);
2794 		} else {
2795 			DECLARE_WAITQUEUE(wq, current);
2796 
2797 			add_wait_queue(&jfs_commit_thread_wait, &wq);
2798 			set_current_state(TASK_INTERRUPTIBLE);
2799 			LAZY_UNLOCK(flags);
2800 			schedule();
2801 			current->state = TASK_RUNNING;
2802 			remove_wait_queue(&jfs_commit_thread_wait, &wq);
2803 		}
2804 	} while (!jfs_stop_threads);
2805 
2806 	if (!list_empty(&TxAnchor.unlock_queue))
2807 		jfs_err("jfs_lazycommit being killed w/pending transactions!");
2808 	else
2809 		jfs_info("jfs_lazycommit being killed");
2810 	complete_and_exit(&jfsIOwait, 0);
2811 }
2812 
2813 void txLazyUnlock(struct tblock * tblk)
2814 {
2815 	unsigned long flags;
2816 
2817 	LAZY_LOCK(flags);
2818 
2819 	list_add_tail(&tblk->cqueue, &TxAnchor.unlock_queue);
2820 	/*
2821 	 * Don't wake up a commit thread if there is already one servicing
2822 	 * this superblock, or if the last one we woke up hasn't started yet.
2823 	 */
2824 	if (!(JFS_SBI(tblk->sb)->commit_state & IN_LAZYCOMMIT) &&
2825 	    !jfs_commit_thread_waking) {
2826 		jfs_commit_thread_waking = 1;
2827 		wake_up(&jfs_commit_thread_wait);
2828 	}
2829 	LAZY_UNLOCK(flags);
2830 }
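
/*
 * (txLazyUnlock is the producer side of TxAnchor.unlock_queue;
 * jfs_commit_thread_waking throttles wakeups so that at most one
 * not-yet-running commit thread is woken at a time, matching the reset
 * of the flag at the top of jfs_lazycommit()'s loop.)
 */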
2831 
2832 static void LogSyncRelease(struct metapage * mp)
2833 {
2834 	struct jfs_log *log = mp->log;
2835 
2836 	assert(mp->nohomeok);
2837 	assert(log);
2838 	metapage_homeok(mp);
2839 }
2840 
2841 /*
2842  *	txQuiesce
2843  *
2844  *	Block all new transactions and push anonymous transactions to
2845  *	completion
2846  *
2847  *	This does almost the same thing as jfs_sync below.  We don't
2848  *	worry about deadlocking when jfs_tlocks_low is set, since we would
2849  *	expect jfs_sync to get us out of that jam.
2850  */
2851 void txQuiesce(struct super_block *sb)
2852 {
2853 	struct inode *ip;
2854 	struct jfs_inode_info *jfs_ip;
2855 	struct jfs_log *log = JFS_SBI(sb)->log;
2856 	tid_t tid;
2857 
2858 	set_bit(log_QUIESCE, &log->flag);
2859 
2860 	TXN_LOCK();
2861 restart:
2862 	while (!list_empty(&TxAnchor.anon_list)) {
2863 		jfs_ip = list_entry(TxAnchor.anon_list.next,
2864 				    struct jfs_inode_info,
2865 				    anon_inode_list);
2866 		ip = &jfs_ip->vfs_inode;
2867 
2868 		/*
2869 		 * inode will be removed from anonymous list
2870 		 * when it is committed
2871 		 */
2872 		TXN_UNLOCK();
2873 		tid = txBegin(ip->i_sb, COMMIT_INODE | COMMIT_FORCE);
2874 		down(&jfs_ip->commit_sem);
2875 		txCommit(tid, 1, &ip, 0);
2876 		txEnd(tid);
2877 		up(&jfs_ip->commit_sem);
2878 		/*
2879 		 * Just to be safe.  I don't know how
2880 		 * long we can run without blocking
2881 		 */
2882 		cond_resched();
2883 		TXN_LOCK();
2884 	}
2885 
2886 	/*
2887 	 * If jfs_sync is running in parallel, there could be some inodes
2888 	 * on anon_list2.  Let's check.
2889 	 */
2890 	if (!list_empty(&TxAnchor.anon_list2)) {
2891 		list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2892 		INIT_LIST_HEAD(&TxAnchor.anon_list2);
2893 		goto restart;
2894 	}
2895 	TXN_UNLOCK();
2896 
2897 	/*
2898 	 * We may need to kick off the group commit
2899 	 */
2900 	jfs_flush_journal(log, 0);
2901 }
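
/*
 * Usage sketch (hypothetical freeze path; the real callers are in the
 * superblock code): txQuiesce()/txResume() bracket a period in which no
 * new transactions may start:
 *
 *	txQuiesce(sb);
 *	(filesystem is quiescent: snapshot, remount read-only, etc.)
 *	txResume(sb);
 */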
2902 
2903 /*
2904  * txResume()
2905  *
2906  * Allows transactions to start again following txQuiesce
2907  */
2908 void txResume(struct super_block *sb)
2909 {
2910 	struct jfs_log *log = JFS_SBI(sb)->log;
2911 
2912 	clear_bit(log_QUIESCE, &log->flag);
2913 	TXN_WAKEUP(&log->syncwait);
2914 }
2915 
2916 /*
2917  *      jfs_sync(void)
2918  *
2919  *	To be run as a kernel daemon.  This is awakened when tlocks run low.
2920  *	We write any inodes that have anonymous tlocks so they will become
2921  *	available.
2922  */
2923 int jfs_sync(void *arg)
2924 {
2925 	struct inode *ip;
2926 	struct jfs_inode_info *jfs_ip;
2927 	int rc;
2928 	tid_t tid;
2929 
2930 	daemonize("jfsSync");
2931 
2932 	complete(&jfsIOwait);
2933 
2934 	do {
2935 		/*
2936 		 * write each inode on the anonymous inode list
2937 		 */
2938 		TXN_LOCK();
2939 		while (jfs_tlocks_low && !list_empty(&TxAnchor.anon_list)) {
2940 			jfs_ip = list_entry(TxAnchor.anon_list.next,
2941 					    struct jfs_inode_info,
2942 					    anon_inode_list);
2943 			ip = &jfs_ip->vfs_inode;
2944 
2945 			if (! igrab(ip)) {
2946 				/*
2947 				 * Inode is being freed
2948 				 */
2949 				list_del_init(&jfs_ip->anon_inode_list);
2950 			} else if (! down_trylock(&jfs_ip->commit_sem)) {
2951 				/*
2952 				 * inode will be removed from anonymous list
2953 				 * when it is committed
2954 				 */
2955 				TXN_UNLOCK();
2956 				tid = txBegin(ip->i_sb, COMMIT_INODE);
2957 				rc = txCommit(tid, 1, &ip, 0);
2958 				txEnd(tid);
2959 				up(&jfs_ip->commit_sem);
2960 
2961 				iput(ip);
2962 				/*
2963 				 * Just to be safe.  I don't know how
2964 				 * long we can run without blocking
2965 				 */
2966 				cond_resched();
2967 				TXN_LOCK();
2968 			} else {
2969 				/* We can't get the commit semaphore.  It may
2970 				 * be held by a thread waiting for tlock's
2971 				 * so let's not block here.  Save it to
2972 				 * put back on the anon_list.
2973 				 */
2974 
2975 				/* Take off anon_list */
2976 				list_del(&jfs_ip->anon_inode_list);
2977 
2978 				/* Put on anon_list2 */
2979 				list_add(&jfs_ip->anon_inode_list,
2980 					 &TxAnchor.anon_list2);
2981 
2982 				TXN_UNLOCK();
2983 				iput(ip);
2984 				TXN_LOCK();
2985 			}
2986 		}
2987 		/* Add anon_list2 back to anon_list */
2988 		list_splice_init(&TxAnchor.anon_list2, &TxAnchor.anon_list);
2989 
2990 		if (current->flags & PF_FREEZE) {
2991 			TXN_UNLOCK();
2992 			refrigerator(PF_FREEZE);
2993 		} else {
2994 			DECLARE_WAITQUEUE(wq, current);
2995 
2996 			add_wait_queue(&jfs_sync_thread_wait, &wq);
2997 			set_current_state(TASK_INTERRUPTIBLE);
2998 			TXN_UNLOCK();
2999 			schedule();
3000 			current->state = TASK_RUNNING;
3001 			remove_wait_queue(&jfs_sync_thread_wait, &wq);
3002 		}
3003 	} while (!jfs_stop_threads);
3004 
3005 	jfs_info("jfs_sync being killed");
3006 	complete_and_exit(&jfsIOwait, 0);
3007 }
3008 
3009 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_DEBUG)
3010 int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length,
3011 		      int *eof, void *data)
3012 {
3013 	int len = 0;
3014 	off_t begin;
3015 	char *freewait;
3016 	char *freelockwait;
3017 	char *lowlockwait;
3018 
3019 	freewait =
3020 	    waitqueue_active(&TxAnchor.freewait) ? "active" : "empty";
3021 	freelockwait =
3022 	    waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty";
3023 	lowlockwait =
3024 	    waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty";
3025 
3026 	len += sprintf(buffer,
3027 		       "JFS TxAnchor\n"
3028 		       "============\n"
3029 		       "freetid = %d\n"
3030 		       "freewait = %s\n"
3031 		       "freelock = %d\n"
3032 		       "freelockwait = %s\n"
3033 		       "lowlockwait = %s\n"
3034 		       "tlocksInUse = %d\n"
3035 		       "jfs_tlocks_low = %d\n"
3036 		       "unlock_queue is %sempty\n",
3037 		       TxAnchor.freetid,
3038 		       freewait,
3039 		       TxAnchor.freelock,
3040 		       freelockwait,
3041 		       lowlockwait,
3042 		       TxAnchor.tlocksInUse,
3043 		       jfs_tlocks_low,
3044 		       list_empty(&TxAnchor.unlock_queue) ? "" : "not ");
3045 
3046 	begin = offset;
3047 	*start = buffer + begin;
3048 	len -= begin;
3049 
3050 	if (len > length)
3051 		len = length;
3052 	else
3053 		*eof = 1;
3054 
3055 	if (len < 0)
3056 		len = 0;
3057 
3058 	return len;
3059 }
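
/*
 * (The begin/offset arithmetic above follows the legacy read_proc
 * protocol: the whole report is formatted into buffer on each call,
 * *start and len are then adjusted so the caller sees only the slice at
 * offset, and *eof is set once the remainder fits within length.)
 */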
3060 #endif
3061 
3062 #if defined(CONFIG_PROC_FS) && defined(CONFIG_JFS_STATISTICS)
3063 int jfs_txstats_read(char *buffer, char **start, off_t offset, int length,
3064 		     int *eof, void *data)
3065 {
3066 	int len = 0;
3067 	off_t begin;
3068 
3069 	len += sprintf(buffer,
3070 		       "JFS TxStats\n"
3071 		       "===========\n"
3072 		       "calls to txBegin = %d\n"
3073 		       "txBegin blocked by sync barrier = %d\n"
3074 		       "txBegin blocked by tlocks low = %d\n"
3075 		       "txBegin blocked by no free tid = %d\n"
3076 		       "calls to txBeginAnon = %d\n"
3077 		       "txBeginAnon blocked by sync barrier = %d\n"
3078 		       "txBeginAnon blocked by tlocks low = %d\n"
3079 		       "calls to txLockAlloc = %d\n"
3080 		       "tLockAlloc blocked by no free lock = %d\n",
3081 		       TxStat.txBegin,
3082 		       TxStat.txBegin_barrier,
3083 		       TxStat.txBegin_lockslow,
3084 		       TxStat.txBegin_freetid,
3085 		       TxStat.txBeginAnon,
3086 		       TxStat.txBeginAnon_barrier,
3087 		       TxStat.txBeginAnon_lockslow,
3088 		       TxStat.txLockAlloc,
3089 		       TxStat.txLockAlloc_freelock);
3090 
3091 	begin = offset;
3092 	*start = buffer + begin;
3093 	len -= begin;
3094 
3095 	if (len > length)
3096 		len = length;
3097 	else
3098 		*eof = 1;
3099 
3100 	if (len < 0)
3101 		len = 0;
3102 
3103 	return len;
3104 }
3105 #endif
3106