/*
 *  linux/fs/ext3/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
 */

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/time.h>
#include <linux/ext3_jbd.h>
#include <linux/jbd.h>
#include <linux/smp_lock.h>
#include <linux/highuid.h>
#include <linux/pagemap.h>
#include <linux/quotaops.h>
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/writeback.h>
#include <linux/mpage.h>
#include <linux/uio.h>
#include "xattr.h"
#include "acl.h"

static int ext3_writepage_trans_blocks(struct inode *inode);

/*
 * Test whether an inode is a fast symlink.
 */
static inline int ext3_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT3_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) &&
		inode->i_blocks - ea_blocks == 0);
}
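
/*
 * (A fast symlink stores its target bytes directly in the inode's i_data,
 * so it owns no data blocks of its own.  i_blocks counts 512-byte sectors,
 * which is why one block of extended attributes accounts for
 * (s_blocksize >> 9) of them - e.g. 8 on a 4KB-block filesystem.)
 */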

/* The ext3 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (e.g. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */

int ext3_forget(handle_t *handle, int is_metadata,
		struct inode *inode, struct buffer_head *bh,
		int blocknr)
{
	int err;

	might_sleep();

	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks. */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext3_should_journal_data(inode))) {
		if (bh) {
			BUFFER_TRACE(bh, "call journal_forget");
			return ext3_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 */
	BUFFER_TRACE(bh, "call ext3_journal_revoke");
	err = ext3_journal_revoke(handle, blocknr, bh);
	if (err)
		ext3_abort(inode->i_sb, __FUNCTION__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}
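
/*
 * In other words, the decision made above is:
 *
 *	mount with data=journal			-> journal_forget
 *	data block, inode not journaling data	-> journal_forget
 *	metadata block				-> journal_revoke
 *	data block, inode journaling data	-> journal_revoke
 *
 * (with the first two degenerating to a plain "return 0" when bh is NULL).
 */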

/*
 * Work out how many blocks we need to progress with the next chunk of a
 * truncate transaction.
 */

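/*
 * As a worked example (a sketch, assuming 4KB blocks, i.e.
 * s_blocksize_bits == 12): an inode with i_blocks == 8192 512-byte
 * sectors covers 8192 >> 3 == 1024 file blocks; that count is clamped
 * to EXT3_MAX_TRANS_DATA below, and EXT3_DATA_TRANS_BLOCKS is added on
 * top for the transaction overhead.
 */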
static unsigned long blocks_for_truncate(struct inode *inode)
{
	unsigned long needed;

	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

	/* Give ourselves just enough room to cope with inodes in which
	 * i_blocks is corrupt: we've seen disk corruptions in the past
	 * which resulted in random data in an inode which looked enough
	 * like a regular file for ext3 to try to delete it.  Things
	 * will go a bit crazy if that happens, but at least we should
	 * try not to panic the whole kernel. */
	if (needed < 2)
		needed = 2;

	/* But we need to bound the transaction so we don't overflow the
	 * journal. */
	if (needed > EXT3_MAX_TRANS_DATA)
		needed = EXT3_MAX_TRANS_DATA;

	return EXT3_DATA_TRANS_BLOCKS + needed;
}

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */

static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext3_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext3_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
		return 0;
	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
{
	jbd_debug(2, "restarting handle %p\n", handle);
	return ext3_journal_restart(handle, blocks_for_truncate(inode));
}
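
/*
 * The expected calling pattern in the truncate path is then roughly:
 *
 *	if (try_to_extend_transaction(handle, inode)) {
 *		ext3_mark_inode_dirty(handle, inode);
 *		ext3_journal_test_restart(handle, inode);
 *	}
 *
 * i.e. keep working while credits remain, commit and restart otherwise.
 */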

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext3_delete_inode (struct inode * inode)
{
	handle_t *handle;

	if (is_bad_inode(inode))
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/* If we're going to skip the normal cleanup, we still
		 * need to make sure that the in-core orphan linked list
		 * is properly cleaned up. */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	EXT3_I(inode)->i_dtime = get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext3_free_inode(handle, inode);
	ext3_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}

static int ext3_alloc_block (handle_t *handle,
			struct inode * inode, unsigned long goal, int *err)
{
	unsigned long result;

	result = ext3_new_block(handle, inode, goal, err);
	return result;
}


typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}

/**
 *	ext3_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *	@boundary: set this non-zero if the referred-to block is likely to be
 *	       followed (on disk) by an indirect block.
 *
 *	To store the locations of file's data ext3 uses a data structure common
 *	for UNIX filesystems - tree of pointers anchored in the inode, with
 *	data blocks at leaves and indirect blocks in intermediate nodes.
 *	This function translates the block number into path in that tree -
 *	return value is the path length and @offsets[n] is the offset of
 *	pointer to (n+1)th node in the nth one. If @i_block is out of range
 *	(negative or too large), a warning is printed and zero is returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed. All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks. We might use long long, but that would
 * kill us on x86. Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */
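
/*
 * Worked example (a sketch, assuming 4KB blocks, so ptrs == 1024,
 * ptrs_bits == 10, and EXT3_NDIR_BLOCKS == 12 direct slots):
 *
 *	i_block 5    -> { 5 }				depth 1 (direct)
 *	i_block 20   -> { EXT3_IND_BLOCK, 8 }		depth 2
 *	i_block 5000 -> { EXT3_DIND_BLOCK, 3, 892 }	depth 3
 *
 * since 5000 - 12 - 1024 == 3964 == 3 * 1024 + 892.
 */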
static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning (inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		*boundary = (i_block & (ptrs - 1)) == (final - 1);
	return n;
}

/**
 *	ext3_get_branch - read the chain of indirect blocks leading to data
 *	@inode: inode in question
 *	@depth: depth of the chain (1 - direct pointer, etc.)
 *	@offsets: offsets of pointers in inode/indirect blocks
 *	@chain: place to store the result
 *	@err: here we store the error value
 *
 *	Function fills the array of triples <key, p, bh> and returns %NULL
 *	if everything went OK or the pointer to the last filled triple
 *	(incomplete one) otherwise. Upon the return chain[i].key contains
 *	the number of (i+1)-th block in the chain (as it is stored in memory,
 *	i.e. little-endian 32-bit), chain[i].p contains the address of that
 *	number (it points into struct inode for i==0 and into the bh->b_data
 *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *	block for i>0 and NULL for i==0. In other words, it holds the block
 *	numbers of the chain, addresses they were taken from (and where we can
 *	verify that chain did not change) and buffer_heads hosting these
 *	numbers.
 *
 *	Function stops when it stumbles upon zero pointer (absent block)
 *		(pointer to last triple returned, *@err == 0)
 *	or when it gets an IO error reading an indirect block
 *		(ditto, *@err == -EIO)
 *	or when it notices that chain had been changed while it was reading
 *		(ditto, *@err == -EAGAIN)
 *	or when it reads all @depth-1 indirect blocks successfully and finds
 *	the whole chain, all the way to the data (returns %NULL, *err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32*)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}
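
/*
 * So, for a successful depth-3 lookup the caller ends up with (a sketch of
 * the layout described above):
 *
 *	chain[0].p -> slot in EXT3_I(inode)->i_data, bh == NULL,
 *		      key == number of the double-indirect block
 *	chain[1].p -> slot in that double-indirect block (chain[1].bh),
 *		      key == number of the indirect block
 *	chain[2].p -> slot in that indirect block (chain[2].bh),
 *		      key == number of the data block itself
 */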

/**
 *	ext3_find_near - find a place for allocation with sufficient locality
 *	@inode: owner
 *	@ind: descriptor of indirect block.
 *
 *	This function returns the preferred place for block allocation.
 *	It is used when the heuristic for sequential allocation fails.
 *	Rules are:
 *	  + if there is a block to the left of our position - allocate near it.
 *	  + if pointer will live in indirect block - allocate near that block.
 *	  + if pointer will live in inode - allocate in the same
 *	    cylinder group.
 *
 *	In the latter case we colour the starting block by the caller's PID to
 *	prevent it from clashing with concurrent allocations for a different
 *	inode in the same block group.  The PID is used here so that
 *	functionally related files will be close-by on-disk.
 *
 *	Caller must make sure that @ind is valid and will stay that way.
 */

static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data;
	__le32 *p;
	unsigned long bg_start;
	unsigned long colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--)
		if (*p)
			return le32_to_cpu(*p);

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * Is it going to be referred to from the inode itself? OK, just put
	 * it into the same cylinder group then.
	 */
	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
	colour = (current->pid % 16) *
			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}
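
/*
 * Colour example (a sketch, assuming 4KB blocks and the usual 32768 blocks
 * per group): a caller with PID 1000 gets
 *
 *	colour = (1000 % 16) * (32768 / 16) = 8 * 2048 = 16384
 *
 * so up to sixteen concurrent allocators are spread evenly across the group.
 */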

/**
 *	ext3_find_goal - find a preferred place for allocation.
 *	@inode: owner
 *	@block:  block we want
 *	@chain:  chain of indirect blocks
 *	@partial: pointer to the last triple within a chain
 *
 *	Normally this function finds the preferred place for block allocation
 *	and returns it.
 */

static unsigned long ext3_find_goal(struct inode *inode, long block,
		Indirect chain[4], Indirect *partial)
{
	struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext3_find_near(inode, partial);
}
/**
 *	ext3_alloc_branch - allocate and set up a chain of blocks.
 *	@inode: owner
 *	@num: depth of the chain (number of blocks to allocate)
 *	@offsets: offsets (in the blocks) to store the pointers to next.
 *	@branch: place to store the chain in.
 *
 *	This function allocates @num blocks, zeroes out all but the last one,
 *	links them into chain and (if we are synchronous) writes them to disk.
 *	In other words, it prepares a branch that can be spliced onto the
 *	inode. It stores the information about that chain in the branch[], in
 *	the same format as ext3_get_branch() would do. We are calling it after
 *	we had read the existing part of chain and partial points to the last
 *	triple of that (one with zero ->key). Upon the exit we have the same
 *	picture as after the successful ext3_get_block(), except that in one
 *	place chain is disconnected - *branch->p is still zero (we did not
 *	set the last link), but branch->key contains the number that should
 *	be placed into *branch->p to fill that gap.
 *
 *	If allocation fails we free all blocks we've allocated (and forget
 *	their buffer_heads) and return the error value from the failed
 *	ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
 *	as described above and return 0.
 */

static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
			     int num,
			     unsigned long goal,
			     int *offsets,
			     Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int n = 0, keys = 0;
	int err = 0;
	int i;
	int parent = ext3_alloc_block(handle, inode, goal, &err);

	branch[0].key = cpu_to_le32(parent);
	if (parent) {
		keys = 1;	/* branch[0] must be freed too on failure */
		for (n = 1; n < num; n++) {
			struct buffer_head *bh;
			/* Allocate the next block */
			int nr = ext3_alloc_block(handle, inode, parent, &err);
			if (!nr)
				break;
			branch[n].key = cpu_to_le32(nr);

			/*
			 * Get buffer_head for parent block, zero it out
			 * and set the pointer to new one, then send
			 * parent to disk.
			 */
			bh = sb_getblk(inode->i_sb, parent);
			if (!bh) {
				/* sb_getblk() can fail under memory
				 * pressure: free the block we just
				 * allocated and let the cleanup below
				 * unwind the earlier ones. */
				ext3_free_blocks(handle, inode, nr, 1);
				err = -EIO;
				break;
			}
			keys = n+1;
			branch[n].bh = bh;
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			err = ext3_journal_get_create_access(handle, bh);
			if (err) {
				unlock_buffer(bh);
				brelse(bh);
				break;
			}

			memset(bh->b_data, 0, blocksize);
			branch[n].p = (__le32*) bh->b_data + offsets[n];
			*branch[n].p = branch[n].key;
			BUFFER_TRACE(bh, "marking uptodate");
			set_buffer_uptodate(bh);
			unlock_buffer(bh);

			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			err = ext3_journal_dirty_metadata(handle, bh);
			if (err)
				break;

			parent = nr;
		}
	}
	if (n == num)
		return 0;

	/* Allocation failed, free what we already allocated */
	for (i = 1; i < keys; i++) {
		BUFFER_TRACE(branch[i].bh, "call journal_forget");
		ext3_journal_forget(handle, branch[i].bh);
	}
	for (i = 0; i < keys; i++)
		ext3_free_blocks(handle, inode, le32_to_cpu(branch[i].key), 1);
	return err;
}

/**
 *	ext3_splice_branch - splice the allocated branch onto inode.
 *	@inode: owner
 *	@block: (logical) number of block we are adding
 *	@chain: chain of indirect blocks (with a missing link - see
 *		ext3_alloc_branch)
 *	@where: location of missing link
 *	@num:   number of blocks we are adding
 *
 *	This function fills the missing link and does all housekeeping needed
 *	in inode (->i_blocks, etc.). In case of success we end up with the
 *	full chain to new block and return 0.
 */

static int ext3_splice_branch(handle_t *handle, struct inode *inode, long block,
			      Indirect chain[4], Indirect *where, int num)
{
	int i;
	int err = 0;
	struct ext3_block_alloc_info *block_i = EXT3_I(inode)->i_block_alloc_info;

	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the most recently allocated logical & physical block
	 * in i_block_alloc_info, to assist in finding the proper goal block
	 * for the next allocation.
	 */
	if (block_i) {
		block_i->last_alloc_logical_block = block;
		block_i->last_alloc_physical_block = le32_to_cpu(where[num-1].key);
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);

	/* Had we spliced it onto an indirect block? */
	if (where->bh) {
		/*
		 * akpm: If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i < num; i++) {
		BUFFER_TRACE(where[i].bh, "call journal_forget");
		ext3_journal_forget(handle, where[i].bh);
	}
	return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
 * to tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * akpm: `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 */

static int
ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create, int extend_disksize)
{
	int err = -EIO;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	unsigned long goal;
	int left;
	int boundary = 0;
	const int depth = ext3_block_to_path(inode, iblock, offsets, &boundary);
	struct ext3_inode_info *ei = EXT3_I(inode);

	J_ASSERT(handle != NULL || create == 0);

	if (depth == 0)
		goto out;

	partial = ext3_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		clear_buffer_new(bh_result);
		goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO)
		goto cleanup;

	down(&ei->truncate_sem);

	/*
	 * If the indirect block is missing while we are reading
	 * the chain (ext3_get_branch() returns -EAGAIN), or
	 * if the chain has been changed after we grabbed the semaphore
	 * (either because another process truncated this branch, or
	 * another get_block allocated this branch), re-grab the chain to
	 * see if the requested block has been allocated or not.
	 *
	 * Since we already block the truncate/other get_block
	 * at this point, we will have the current copy of the chain when we
	 * splice the branch into the tree.
	 */
	if (err == -EAGAIN || !verify_chain(chain, partial)) {
		while (partial > chain) {
			brelse(partial->bh);
			partial--;
		}
		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
		if (!partial) {
			up(&ei->truncate_sem);
			if (err)
				goto cleanup;
			clear_buffer_new(bh_result);
			goto got_it;
		}
	}

	/*
	 * Okay, we need to do block allocation.  Lazily initialize the block
	 * allocation info here if necessary.
	 */
	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
		ext3_init_block_alloc_info(inode);

	goal = ext3_find_goal(inode, iblock, chain, partial);

	left = (chain + depth) - partial;

	/*
	 * Block out ext3_truncate while we alter the tree
	 */
	err = ext3_alloc_branch(handle, inode, left, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext3_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case.  --sct
	 */
	if (!err)
		err = ext3_splice_branch(handle, inode, iblock, chain,
					 partial, left);
	/*
	 * i_disksize growing is protected by truncate_sem.  Don't forget to
	 * protect it if you're about to implement concurrent
	 * ext3_get_block() -bzzz
	 */
	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
		ei->i_disksize = inode->i_size;
	up(&ei->truncate_sem);
	if (err)
		goto cleanup;

	set_buffer_new(bh_result);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (boundary)
		set_buffer_boundary(bh_result);
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
	BUFFER_TRACE(bh_result, "returned");
out:
	return err;
}

static int ext3_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create)
{
	handle_t *handle = NULL;
	int ret;

	if (create) {
		handle = ext3_journal_current_handle();
		J_ASSERT(handle != NULL);
	}
	ret = ext3_get_block_handle(handle, inode, iblock,
				bh_result, create, 1);
	return ret;
}

#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)

static int
ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
		unsigned long max_blocks, struct buffer_head *bh_result,
		int create)
{
	handle_t *handle = journal_current_handle();
	int ret = 0;

	if (!handle)
		goto get_block;		/* A read */

	if (handle->h_transaction->t_state == T_LOCKED) {
		/*
		 * Huge direct-io writes can hold off commits for long
		 * periods of time.  Let this commit run.
		 */
		ext3_journal_stop(handle);
		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle))
			ret = PTR_ERR(handle);
		goto get_block;
	}

	if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
		/*
		 * Getting low on buffer credits...
		 */
		ret = ext3_journal_extend(handle, DIO_CREDITS);
		if (ret > 0) {
			/*
			 * Couldn't extend the transaction.  Start a new one.
			 */
			ret = ext3_journal_restart(handle, DIO_CREDITS);
		}
	}

get_block:
	if (ret == 0)
		ret = ext3_get_block_handle(handle, inode, iblock,
					bh_result, create, 0);
	bh_result->b_size = (1 << inode->i_blkbits);
	return ret;
}
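
/*
 * So a huge O_DIRECT write proceeds, roughly, as: start with DIO_CREDITS
 * credits, and on each ext3_direct_io_get_blocks() call either yield to a
 * pending commit (stop, then start a fresh handle) or, when credits run
 * low, extend the handle - falling back to a restart if the extend fails.
 */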

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext3_getblk(handle_t *handle, struct inode * inode,
				long block, int create, int * errp)
{
	struct buffer_head dummy;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	buffer_trace_init(&dummy.b_history);
	*errp = ext3_get_block_handle(handle, inode, block, &dummy, create, 1);
	if (!*errp && buffer_mapped(&dummy)) {
		struct buffer_head *bh;
		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
		if (!bh) {
			/* sb_getblk() can fail under memory pressure */
			*errp = -EIO;
			return NULL;
		}
		if (buffer_new(&dummy)) {
			J_ASSERT(create != 0);
			J_ASSERT(handle != NULL);

			/* Now that we do not always journal data, we
			   should keep in mind whether this should
			   always journal the new buffer as metadata.
			   For now, regular file writes use
			   ext3_get_block instead, so it's not a
			   problem. */
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			fatal = ext3_journal_get_create_access(handle, bh);
			if (!fatal && !buffer_uptodate(bh)) {
				memset(bh->b_data, 0, inode->i_sb->s_blocksize);
				set_buffer_uptodate(bh);
			}
			unlock_buffer(bh);
			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			err = ext3_journal_dirty_metadata(handle, bh);
			if (!fatal)
				fatal = err;
		} else {
			BUFFER_TRACE(bh, "not a new buffer");
		}
		if (fatal) {
			*errp = fatal;
			brelse(bh);
			bh = NULL;
		}
		return bh;
	}
	return NULL;
}

struct buffer_head *ext3_bread(handle_t *handle, struct inode * inode,
			       int block, int create, int *err)
{
	struct buffer_head * bh;

	bh = ext3_getblk(handle, inode, block, create, err);
	if (!bh)
		return bh;
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	*err = -EIO;
	return NULL;
}

static int walk_page_buffers(	handle_t *handle,
				struct buffer_head *head,
				unsigned from,
				unsigned to,
				int *partial,
				int (*fn)(	handle_t *handle,
						struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (	bh = head, block_start = 0;
		ret == 0 && (bh != head || !block_start);
		block_start = block_end, bh = next)
	{
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
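
/*
 * Typical use (a sketch): apply a journalling operation to the byte range
 * of a page touched by a write, as ext3_prepare_write() below does:
 *
 *	walk_page_buffers(handle, page_buffers(page), from, to,
 *			NULL, do_journal_get_write_access);
 *
 * @partial, when non-NULL, reports back whether any buffer outside the
 * range was not uptodate, so the caller knows not to SetPageUptodate().
 */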

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext3_get_block()
 * and the commit_write().  So doing the journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext3_writepage() ->
 * block_write_full_page(). In that case, we *know* that ext3_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext3 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */

static int do_journal_get_write_access(handle_t *handle,
				       struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	return ext3_journal_get_write_access(handle, bh);
}

static int ext3_prepare_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
	handle_t *handle;
	int retries = 0;

retry:
	handle = ext3_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}
	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_prepare_write(page, from, to, ext3_get_block);
	else
		ret = block_prepare_write(page, from, to, ext3_get_block);
	if (ret)
		goto prepare_write_failed;

	if (ext3_should_journal_data(inode)) {
		ret = walk_page_buffers(handle, page_buffers(page),
				from, to, NULL, do_journal_get_write_access);
	}
prepare_write_failed:
	if (ret)
		ext3_journal_stop(handle);
	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

int
ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
	int err = journal_dirty_data(handle, bh);
	if (err)
		ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
						bh, handle, err);
	return err;
}

/* For commit_write() in data=journal mode */
static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	return ext3_journal_dirty_metadata(handle, bh);
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - e.g., when called from page_symlink().
 *
 * ext3 never places buffers on inode->i_mapping->private_list.  metadata
 * buffers are managed internally.
 */

static int ext3_ordered_commit_write(struct file *file, struct page *page,
				     unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;

	ret = walk_page_buffers(handle, page_buffers(page),
		from, to, NULL, ext3_journal_dirty_data);

	if (ret == 0) {
		/*
		 * generic_commit_write() will run mark_inode_dirty() if i_size
		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
		 * into that.
		 */
		loff_t new_i_size;

		new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
		if (new_i_size > EXT3_I(inode)->i_disksize)
			EXT3_I(inode)->i_disksize = new_i_size;
		ret = generic_commit_write(file, page, from, to);
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

static int ext3_writeback_commit_write(struct file *file, struct page *page,
				       unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	loff_t new_i_size;

	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	if (new_i_size > EXT3_I(inode)->i_disksize)
		EXT3_I(inode)->i_disksize = new_i_size;

	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_commit_write(file, page, from, to);
	else
		ret = generic_commit_write(file, page, from, to);

	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

static int ext3_journalled_commit_write(struct file *file,
			struct page *page, unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	loff_t pos;

	/*
	 * Here we duplicate the generic_commit_write() functionality
	 */
	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, commit_write_fn);
	if (!partial)
		SetPageUptodate(page);
	if (pos > inode->i_size)
		i_size_write(inode, pos);
	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
	if (inode->i_size > EXT3_I(inode)->i_disksize) {
		EXT3_I(inode)->i_disksize = inode->i_size;
		ret2 = ext3_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext3 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT3_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
		journal = EXT3_JOURNAL(inode);
		journal_lock_updates(journal);
		err = journal_flush(journal);
		journal_unlock_updates(journal);

		if (err)
			return 0;
	}

	return generic_block_bmap(mapping, block, ext3_get_block);
}

static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}

static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}

static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
	if (buffer_mapped(bh))
		return ext3_journal_dirty_data(handle, bh);
	return 0;
}

/*
 * Note that we always start a transaction even if we're not journalling
 * data.  This is to preserve ordering: any hole instantiation within
 * __block_write_full_page -> ext3_get_block() should be journalled
 * along with the data so we don't crash and then get metadata which
 * refers to old data.
 *
 * In all journalling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
 *	ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext3_writepage()
 *
 * Similar for:
 *
 *	ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext3_get_block().  We will deadlock on various things like
 * lock_journal and i_truncate_sem.
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *	    non-zero. We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied. So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: it affects the final partial page, for journalled
 *   data, where the file is subject to both write() and writepage() in the
 *   same transaction.  To fix it we'll need a custom block_write_full_page().
 *   We'll probably need that anyway for journalling writepage() output.
 *
 * We don't honour synchronous mounts for writepage().  That would be
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
 *
 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
 * we don't need to open a transaction here.
 */
|  | 1254 | static int ext3_ordered_writepage(struct page *page, | 
|  | 1255 | struct writeback_control *wbc) | 
|  | 1256 | { | 
|  | 1257 | struct inode *inode = page->mapping->host; | 
|  | 1258 | struct buffer_head *page_bufs; | 
|  | 1259 | handle_t *handle = NULL; | 
|  | 1260 | int ret = 0; | 
|  | 1261 | int err; | 
|  | 1262 |  | 
|  | 1263 | J_ASSERT(PageLocked(page)); | 
|  | 1264 |  | 
|  | 1265 | /* | 
|  | 1266 | * We give up here if we're reentered, because it might be for a | 
|  | 1267 | * different filesystem. | 
|  | 1268 | */ | 
|  | 1269 | if (ext3_journal_current_handle()) | 
|  | 1270 | goto out_fail; | 
|  | 1271 |  | 
|  | 1272 | handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); | 
|  | 1273 |  | 
|  | 1274 | if (IS_ERR(handle)) { | 
|  | 1275 | ret = PTR_ERR(handle); | 
|  | 1276 | goto out_fail; | 
|  | 1277 | } | 
|  | 1278 |  | 
|  | 1279 | if (!page_has_buffers(page)) { | 
|  | 1280 | create_empty_buffers(page, inode->i_sb->s_blocksize, | 
|  | 1281 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | 
|  | 1282 | } | 
|  | 1283 | page_bufs = page_buffers(page); | 
|  | 1284 | walk_page_buffers(handle, page_bufs, 0, | 
|  | 1285 | PAGE_CACHE_SIZE, NULL, bget_one); | 
|  | 1286 |  | 
|  | 1287 | ret = block_write_full_page(page, ext3_get_block, wbc); | 
|  | 1288 |  | 
|  | 1289 | /* | 
|  | 1290 | * The page can become unlocked at any point now, and | 
|  | 1291 | * truncate can then come in and change things.  So we | 
|  | 1292 | * can't touch *page from now on.  But *page_bufs is | 
|  | 1293 | * safe due to elevated refcount. | 
|  | 1294 | */ | 
|  | 1295 |  | 
|  | 1296 | /* | 
|  | 1297 | * And attach them to the current transaction.  But only if | 
|  | 1298 | * block_write_full_page() succeeded.  Otherwise they are unmapped, | 
|  | 1299 | * and generally junk. | 
|  | 1300 | */ | 
|  | 1301 | if (ret == 0) { | 
|  | 1302 | err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, | 
|  | 1303 | NULL, journal_dirty_data_fn); | 
|  | 1304 | if (!ret) | 
|  | 1305 | ret = err; | 
|  | 1306 | } | 
|  | 1307 | walk_page_buffers(handle, page_bufs, 0, | 
|  | 1308 | PAGE_CACHE_SIZE, NULL, bput_one); | 
|  | 1309 | err = ext3_journal_stop(handle); | 
|  | 1310 | if (!ret) | 
|  | 1311 | ret = err; | 
|  | 1312 | return ret; | 
|  | 1313 |  | 
|  | 1314 | out_fail: | 
|  | 1315 | redirty_page_for_writepage(wbc, page); | 
|  | 1316 | unlock_page(page); | 
|  | 1317 | return ret; | 
|  | 1318 | } | 
|  | 1319 |  | 
|  | 1320 | static int ext3_writeback_writepage(struct page *page, | 
|  | 1321 | struct writeback_control *wbc) | 
|  | 1322 | { | 
|  | 1323 | struct inode *inode = page->mapping->host; | 
|  | 1324 | handle_t *handle = NULL; | 
|  | 1325 | int ret = 0; | 
|  | 1326 | int err; | 
|  | 1327 |  | 
|  | 1328 | if (ext3_journal_current_handle()) | 
|  | 1329 | goto out_fail; | 
|  | 1330 |  | 
|  | 1331 | handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); | 
|  | 1332 | if (IS_ERR(handle)) { | 
|  | 1333 | ret = PTR_ERR(handle); | 
|  | 1334 | goto out_fail; | 
|  | 1335 | } | 
|  | 1336 |  | 
|  | 1337 | if (test_opt(inode->i_sb, NOBH)) | 
|  | 1338 | ret = nobh_writepage(page, ext3_get_block, wbc); | 
|  | 1339 | else | 
|  | 1340 | ret = block_write_full_page(page, ext3_get_block, wbc); | 
|  | 1341 |  | 
|  | 1342 | err = ext3_journal_stop(handle); | 
|  | 1343 | if (!ret) | 
|  | 1344 | ret = err; | 
|  | 1345 | return ret; | 
|  | 1346 |  | 
|  | 1347 | out_fail: | 
|  | 1348 | redirty_page_for_writepage(wbc, page); | 
|  | 1349 | unlock_page(page); | 
|  | 1350 | return ret; | 
|  | 1351 | } | 
|  | 1352 |  | 
|  | 1353 | static int ext3_journalled_writepage(struct page *page, | 
|  | 1354 | struct writeback_control *wbc) | 
|  | 1355 | { | 
|  | 1356 | struct inode *inode = page->mapping->host; | 
|  | 1357 | handle_t *handle = NULL; | 
|  | 1358 | int ret = 0; | 
|  | 1359 | int err; | 
|  | 1360 |  | 
|  | 1361 | if (ext3_journal_current_handle()) | 
|  | 1362 | goto no_write; | 
|  | 1363 |  | 
|  | 1364 | handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode)); | 
|  | 1365 | if (IS_ERR(handle)) { | 
|  | 1366 | ret = PTR_ERR(handle); | 
|  | 1367 | goto no_write; | 
|  | 1368 | } | 
|  | 1369 |  | 
|  | 1370 | if (!page_has_buffers(page) || PageChecked(page)) { | 
|  | 1371 | /* | 
|  | 1372 | * It's mmapped pagecache.  Add buffers and journal it.  There | 
|  | 1373 | * doesn't seem much point in redirtying the page here. | 
|  | 1374 | */ | 
|  | 1375 | ClearPageChecked(page); | 
|  | 1376 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | 
|  | 1377 | ext3_get_block); | 
|  | 1378 | if (ret != 0) | 
|  | 1379 | goto out_unlock; | 
|  | 1380 | ret = walk_page_buffers(handle, page_buffers(page), 0, | 
|  | 1381 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); | 
|  | 1382 |  | 
|  | 1383 | err = walk_page_buffers(handle, page_buffers(page), 0, | 
|  | 1384 | PAGE_CACHE_SIZE, NULL, commit_write_fn); | 
|  | 1385 | if (ret == 0) | 
|  | 1386 | ret = err; | 
|  | 1387 | EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; | 
|  | 1388 | unlock_page(page); | 
|  | 1389 | } else { | 
|  | 1390 | /* | 
|  | 1391 | * It may be a page full of checkpoint-mode buffers.  We don't | 
|  | 1392 | * really know unless we go poke around in the buffer_heads. | 
|  | 1393 | * But block_write_full_page will do the right thing. | 
|  | 1394 | */ | 
|  | 1395 | ret = block_write_full_page(page, ext3_get_block, wbc); | 
|  | 1396 | } | 
|  | 1397 | err = ext3_journal_stop(handle); | 
|  | 1398 | if (!ret) | 
|  | 1399 | ret = err; | 
|  | 1400 | out: | 
|  | 1401 | return ret; | 
|  | 1402 |  | 
|  | 1403 | no_write: | 
|  | 1404 | redirty_page_for_writepage(wbc, page); | 
|  | 1405 | out_unlock: | 
|  | 1406 | unlock_page(page); | 
|  | 1407 | goto out; | 
|  | 1408 | } | 
|  | 1409 |  | 
|  | 1410 | static int ext3_readpage(struct file *file, struct page *page) | 
|  | 1411 | { | 
|  | 1412 | return mpage_readpage(page, ext3_get_block); | 
|  | 1413 | } | 
|  | 1414 |  | 
|  | 1415 | static int | 
|  | 1416 | ext3_readpages(struct file *file, struct address_space *mapping, | 
|  | 1417 | struct list_head *pages, unsigned nr_pages) | 
|  | 1418 | { | 
|  | 1419 | return mpage_readpages(mapping, pages, nr_pages, ext3_get_block); | 
|  | 1420 | } | 
|  | 1421 |  | 
|  | 1422 | static int ext3_invalidatepage(struct page *page, unsigned long offset) | 
|  | 1423 | { | 
|  | 1424 | journal_t *journal = EXT3_JOURNAL(page->mapping->host); | 
|  | 1425 |  | 
|  | 1426 | /* | 
|  | 1427 | * If it's a full truncate we just forget about the pending dirtying | 
|  | 1428 | */ | 
|  | 1429 | if (offset == 0) | 
|  | 1430 | ClearPageChecked(page); | 
|  | 1431 |  | 
|  | 1432 | return journal_invalidatepage(journal, page, offset); | 
|  | 1433 | } | 
|  | 1434 |  | 
|  | 1435 | static int ext3_releasepage(struct page *page, int wait) | 
|  | 1436 | { | 
|  | 1437 | journal_t *journal = EXT3_JOURNAL(page->mapping->host); | 
|  | 1438 |  | 
|  | 1439 | WARN_ON(PageChecked(page)); | 
|  | 1440 | if (!page_has_buffers(page)) | 
|  | 1441 | return 0; | 
|  | 1442 | return journal_try_to_free_buffers(journal, page, wait); | 
|  | 1443 | } | 
|  | 1444 |  | 
|  | 1445 | /* | 
|  | 1446 | * If the O_DIRECT write will extend the file then add this inode to the | 
|  | 1447 | * orphan list.  So recovery will truncate it back to the original size | 
|  | 1448 | * if the machine crashes during the write. | 
|  | 1449 | * | 
|  | 1450 | * If the O_DIRECT write is instantiating holes inside i_size and the machine | 
|  | 1451 | * crashes then stale disk data _may_ be exposed inside the file. | 
|  | 1452 | */ | 
|  | 1453 | static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, | 
|  | 1454 | const struct iovec *iov, loff_t offset, | 
|  | 1455 | unsigned long nr_segs) | 
|  | 1456 | { | 
|  | 1457 | struct file *file = iocb->ki_filp; | 
|  | 1458 | struct inode *inode = file->f_mapping->host; | 
|  | 1459 | struct ext3_inode_info *ei = EXT3_I(inode); | 
|  | 1460 | handle_t *handle = NULL; | 
|  | 1461 | ssize_t ret; | 
|  | 1462 | int orphan = 0; | 
|  | 1463 | size_t count = iov_length(iov, nr_segs); | 
|  | 1464 |  | 
|  | 1465 | if (rw == WRITE) { | 
|  | 1466 | loff_t final_size = offset + count; | 
|  | 1467 |  | 
|  | 1468 | handle = ext3_journal_start(inode, DIO_CREDITS); | 
|  | 1469 | if (IS_ERR(handle)) { | 
|  | 1470 | ret = PTR_ERR(handle); | 
|  | 1471 | goto out; | 
|  | 1472 | } | 
|  | 1473 | if (final_size > inode->i_size) { | 
|  | 1474 | ret = ext3_orphan_add(handle, inode); | 
|  | 1475 | if (ret) | 
|  | 1476 | goto out_stop; | 
|  | 1477 | orphan = 1; | 
|  | 1478 | ei->i_disksize = inode->i_size; | 
|  | 1479 | } | 
|  | 1480 | } | 
|  | 1481 |  | 
|  | 1482 | ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, | 
|  | 1483 | offset, nr_segs, | 
|  | 1484 | ext3_direct_io_get_blocks, NULL); | 
|  | 1485 |  | 
|  | 1486 | /* | 
|  | 1487 | * Reacquire the handle: ext3_direct_io_get_block() can restart the | 
|  | 1488 | * transaction | 
|  | 1489 | */ | 
|  | 1490 | handle = journal_current_handle(); | 
|  | 1491 |  | 
|  | 1492 | out_stop: | 
|  | 1493 | if (handle) { | 
|  | 1494 | int err; | 
|  | 1495 |  | 
|  | 1496 | if (orphan && inode->i_nlink) | 
|  | 1497 | ext3_orphan_del(handle, inode); | 
|  | 1498 | if (orphan && ret > 0) { | 
|  | 1499 | loff_t end = offset + ret; | 
|  | 1500 | if (end > inode->i_size) { | 
|  | 1501 | ei->i_disksize = end; | 
|  | 1502 | i_size_write(inode, end); | 
|  | 1503 | /* | 
|  | 1504 | * We're going to return a positive `ret' | 
|  | 1505 | * here due to non-zero-length I/O, so there's | 
|  | 1506 | * no way of reporting error returns from | 
|  | 1507 | * ext3_mark_inode_dirty() to userspace.  So | 
|  | 1508 | * ignore it. | 
|  | 1509 | */ | 
|  | 1510 | ext3_mark_inode_dirty(handle, inode); | 
|  | 1511 | } | 
|  | 1512 | } | 
|  | 1513 | err = ext3_journal_stop(handle); | 
|  | 1514 | if (ret == 0) | 
|  | 1515 | ret = err; | 
|  | 1516 | } | 
|  | 1517 | out: | 
|  | 1518 | return ret; | 
|  | 1519 | } | 
|  | 1520 |  | 
|  | 1521 | /* | 
|  | 1522 | * Pages can be marked dirty completely asynchronously from ext3's journalling | 
|  | 1523 | * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do | 
|  | 1524 | * much here because ->set_page_dirty is called under VFS locks.  The page is | 
|  | 1525 | * not necessarily locked. | 
|  | 1526 | * | 
|  | 1527 | * We cannot just dirty the page and leave attached buffers clean, because the | 
|  | 1528 | * buffers' dirty state is "definitive".  We cannot just set the buffers dirty | 
|  | 1529 | * or jbddirty because all the journalling code will explode. | 
|  | 1530 | * | 
|  | 1531 | * So what we do is to mark the page "pending dirty" and next time writepage | 
|  | 1532 | * is called, propagate that into the buffers appropriately. | 
|  | 1533 | */ | 
|  | 1534 | static int ext3_journalled_set_page_dirty(struct page *page) | 
|  | 1535 | { | 
|  | 1536 | SetPageChecked(page); | 
|  | 1537 | return __set_page_dirty_nobuffers(page); | 
|  | 1538 | } | 
|  | 1539 |  | 
|  | 1540 | static struct address_space_operations ext3_ordered_aops = { | 
|  | 1541 | .readpage	= ext3_readpage, | 
|  | 1542 | .readpages	= ext3_readpages, | 
|  | 1543 | .writepage	= ext3_ordered_writepage, | 
|  | 1544 | .sync_page	= block_sync_page, | 
|  | 1545 | .prepare_write	= ext3_prepare_write, | 
|  | 1546 | .commit_write	= ext3_ordered_commit_write, | 
|  | 1547 | .bmap		= ext3_bmap, | 
|  | 1548 | .invalidatepage	= ext3_invalidatepage, | 
|  | 1549 | .releasepage	= ext3_releasepage, | 
|  | 1550 | .direct_IO	= ext3_direct_IO, | 
|  | 1551 | }; | 
|  | 1552 |  | 
|  | 1553 | static struct address_space_operations ext3_writeback_aops = { | 
|  | 1554 | .readpage	= ext3_readpage, | 
|  | 1555 | .readpages	= ext3_readpages, | 
|  | 1556 | .writepage	= ext3_writeback_writepage, | 
|  | 1557 | .sync_page	= block_sync_page, | 
|  | 1558 | .prepare_write	= ext3_prepare_write, | 
|  | 1559 | .commit_write	= ext3_writeback_commit_write, | 
|  | 1560 | .bmap		= ext3_bmap, | 
|  | 1561 | .invalidatepage	= ext3_invalidatepage, | 
|  | 1562 | .releasepage	= ext3_releasepage, | 
|  | 1563 | .direct_IO	= ext3_direct_IO, | 
|  | 1564 | }; | 
|  | 1565 |  | 
|  | 1566 | static struct address_space_operations ext3_journalled_aops = { | 
|  | 1567 | .readpage	= ext3_readpage, | 
|  | 1568 | .readpages	= ext3_readpages, | 
|  | 1569 | .writepage	= ext3_journalled_writepage, | 
|  | 1570 | .sync_page	= block_sync_page, | 
|  | 1571 | .prepare_write	= ext3_prepare_write, | 
|  | 1572 | .commit_write	= ext3_journalled_commit_write, | 
|  | 1573 | .set_page_dirty	= ext3_journalled_set_page_dirty, | 
|  | 1574 | .bmap		= ext3_bmap, | 
|  | 1575 | .invalidatepage	= ext3_invalidatepage, | 
|  | 1576 | .releasepage	= ext3_releasepage, | 
|  | 1577 | }; | 
|  | 1578 |  | 
|  | 1579 | void ext3_set_aops(struct inode *inode) | 
|  | 1580 | { | 
|  | 1581 | if (ext3_should_order_data(inode)) | 
|  | 1582 | inode->i_mapping->a_ops = &ext3_ordered_aops; | 
|  | 1583 | else if (ext3_should_writeback_data(inode)) | 
|  | 1584 | inode->i_mapping->a_ops = &ext3_writeback_aops; | 
|  | 1585 | else | 
|  | 1586 | inode->i_mapping->a_ops = &ext3_journalled_aops; | 
|  | 1587 | } | 
|  | 1588 |  | 
|  | 1589 | /* | 
|  | 1590 | * ext3_block_truncate_page() zeroes out a mapping from file offset `from' | 
|  | 1591 | * up to the end of the block which corresponds to `from'. | 
|  | 1592 | * This is required during truncate. We need to physically zero the tail end | 
|  | 1593 | * of that block so it doesn't yield old data if the file is later grown. | 
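|  |  | * | 
|  |  | * A worked example (assuming 4096-byte blocks and a 4096-byte | 
|  |  | * PAGE_CACHE_SIZE; nothing here is read from a real superblock): | 
|  |  | * from == 5000 gives offset == 5000 & 4095 == 904 and | 
|  |  | * length == 4096 - 904 == 3192, so bytes 904..4095 of that block are | 
|  |  | * zeroed. | 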
|  | 1594 | */ | 
|  | 1595 | static int ext3_block_truncate_page(handle_t *handle, struct page *page, | 
|  | 1596 | struct address_space *mapping, loff_t from) | 
|  | 1597 | { | 
|  | 1598 | unsigned long index = from >> PAGE_CACHE_SHIFT; | 
|  | 1599 | unsigned offset = from & (PAGE_CACHE_SIZE-1); | 
|  | 1600 | unsigned blocksize, iblock, length, pos; | 
|  | 1601 | struct inode *inode = mapping->host; | 
|  | 1602 | struct buffer_head *bh; | 
|  | 1603 | int err = 0; | 
|  | 1604 | void *kaddr; | 
|  | 1605 |  | 
|  | 1606 | blocksize = inode->i_sb->s_blocksize; | 
|  | 1607 | length = blocksize - (offset & (blocksize - 1)); | 
|  | 1608 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | 
|  | 1609 |  | 
|  | 1610 | /* | 
|  | 1611 | * For "nobh" option,  we can only work if we don't need to | 
|  | 1612 | * read-in the page - otherwise we create buffers to do the IO. | 
|  | 1613 | */ | 
|  | 1614 | if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH)) { | 
|  | 1615 | if (PageUptodate(page)) { | 
|  | 1616 | kaddr = kmap_atomic(page, KM_USER0); | 
|  | 1617 | memset(kaddr + offset, 0, length); | 
|  | 1618 | flush_dcache_page(page); | 
|  | 1619 | kunmap_atomic(kaddr, KM_USER0); | 
|  | 1620 | set_page_dirty(page); | 
|  | 1621 | goto unlock; | 
|  | 1622 | } | 
|  | 1623 | } | 
|  | 1624 |  | 
|  | 1625 | if (!page_has_buffers(page)) | 
|  | 1626 | create_empty_buffers(page, blocksize, 0); | 
|  | 1627 |  | 
|  | 1628 | /* Find the buffer that contains "offset" */ | 
|  | 1629 | bh = page_buffers(page); | 
|  | 1630 | pos = blocksize; | 
|  | 1631 | while (offset >= pos) { | 
|  | 1632 | bh = bh->b_this_page; | 
|  | 1633 | iblock++; | 
|  | 1634 | pos += blocksize; | 
|  | 1635 | } | 
|  | 1636 |  | 
|  | 1637 | err = 0; | 
|  | 1638 | if (buffer_freed(bh)) { | 
|  | 1639 | BUFFER_TRACE(bh, "freed: skip"); | 
|  | 1640 | goto unlock; | 
|  | 1641 | } | 
|  | 1642 |  | 
|  | 1643 | if (!buffer_mapped(bh)) { | 
|  | 1644 | BUFFER_TRACE(bh, "unmapped"); | 
|  | 1645 | ext3_get_block(inode, iblock, bh, 0); | 
|  | 1646 | /* unmapped? It's a hole - nothing to do */ | 
|  | 1647 | if (!buffer_mapped(bh)) { | 
|  | 1648 | BUFFER_TRACE(bh, "still unmapped"); | 
|  | 1649 | goto unlock; | 
|  | 1650 | } | 
|  | 1651 | } | 
|  | 1652 |  | 
|  | 1653 | /* Ok, it's mapped. Make sure it's up-to-date */ | 
|  | 1654 | if (PageUptodate(page)) | 
|  | 1655 | set_buffer_uptodate(bh); | 
|  | 1656 |  | 
|  | 1657 | if (!buffer_uptodate(bh)) { | 
|  | 1658 | err = -EIO; | 
|  | 1659 | ll_rw_block(READ, 1, &bh); | 
|  | 1660 | wait_on_buffer(bh); | 
|  | 1661 | /* Uhhuh. Read error. Complain and punt. */ | 
|  | 1662 | if (!buffer_uptodate(bh)) | 
|  | 1663 | goto unlock; | 
|  | 1664 | } | 
|  | 1665 |  | 
|  | 1666 | if (ext3_should_journal_data(inode)) { | 
|  | 1667 | BUFFER_TRACE(bh, "get write access"); | 
|  | 1668 | err = ext3_journal_get_write_access(handle, bh); | 
|  | 1669 | if (err) | 
|  | 1670 | goto unlock; | 
|  | 1671 | } | 
|  | 1672 |  | 
|  | 1673 | kaddr = kmap_atomic(page, KM_USER0); | 
|  | 1674 | memset(kaddr + offset, 0, length); | 
|  | 1675 | flush_dcache_page(page); | 
|  | 1676 | kunmap_atomic(kaddr, KM_USER0); | 
|  | 1677 |  | 
|  | 1678 | BUFFER_TRACE(bh, "zeroed end of block"); | 
|  | 1679 |  | 
|  | 1680 | err = 0; | 
|  | 1681 | if (ext3_should_journal_data(inode)) { | 
|  | 1682 | err = ext3_journal_dirty_metadata(handle, bh); | 
|  | 1683 | } else { | 
|  | 1684 | if (ext3_should_order_data(inode)) | 
|  | 1685 | err = ext3_journal_dirty_data(handle, bh); | 
|  | 1686 | mark_buffer_dirty(bh); | 
|  | 1687 | } | 
|  | 1688 |  | 
|  | 1689 | unlock: | 
|  | 1690 | unlock_page(page); | 
|  | 1691 | page_cache_release(page); | 
|  | 1692 | return err; | 
|  | 1693 | } | 
|  | 1694 |  | 
|  | 1695 | /* | 
|  | 1696 | * Probably it should be a library function... search for first non-zero word | 
|  | 1697 | * or memcmp with zero_page, whatever is better for a particular architecture. | 
|  | 1698 | * Linus? | 
|  | 1699 | */ | 
|  | 1700 | static inline int all_zeroes(__le32 *p, __le32 *q) | 
|  | 1701 | { | 
|  | 1702 | while (p < q) | 
|  | 1703 | if (*p++) | 
|  | 1704 | return 0; | 
|  | 1705 | return 1; | 
|  | 1706 | } | 
|  | 1707 |  | 
|  | 1708 | /** | 
|  | 1709 | *	ext3_find_shared - find the indirect blocks for partial truncation. | 
|  | 1710 | *	@inode:	  inode in question | 
|  | 1711 | *	@depth:	  depth of the affected branch | 
|  | 1712 | *	@offsets: offsets of pointers in that branch (see ext3_block_to_path) | 
|  | 1713 | *	@chain:	  place to store the pointers to partial indirect blocks | 
|  | 1714 | *	@top:	  place to the (detached) top of branch | 
|  | 1715 | * | 
|  | 1716 | *	This is a helper function used by ext3_truncate(). | 
|  | 1717 | * | 
|  | 1718 | *	When we do truncate() we may have to clean the ends of several | 
|  | 1719 | *	indirect blocks but leave the blocks themselves alive. A block is | 
|  | 1720 | *	partially truncated if some data below the new i_size is referred | 
|  | 1721 | *	to from it (and it is on the path to the first completely truncated | 
|  | 1722 | *	data block, indeed).  We have to free the top of that path along | 
|  | 1723 | *	with everything to the right of the path. Since no allocation | 
|  | 1724 | *	past the truncation point is possible until ext3_truncate() | 
|  | 1725 | *	finishes, we may safely do the latter, but top of branch may | 
|  | 1726 | *	require special attention - pageout below the truncation point | 
|  | 1727 | *	might try to populate it. | 
|  | 1728 | * | 
|  | 1729 | *	We atomically detach the top of branch from the tree, store the | 
|  | 1730 | *	block number of its root in *@top, pointers to buffer_heads of | 
|  | 1731 | *	partially truncated blocks - in @chain[].bh and pointers to | 
|  | 1732 | *	their last elements that should not be removed - in | 
|  | 1733 | *	@chain[].p. Return value is the pointer to last filled element | 
|  | 1734 | *	of @chain. | 
|  | 1735 | * | 
|  | 1736 | *	The work left to the caller is to do the actual freeing of subtrees: | 
|  | 1737 | *		a) free the subtree starting from *@top | 
|  | 1738 | *		b) free the subtrees whose roots are stored in | 
|  | 1739 | *			(@chain[i].p+1 .. end of @chain[i].bh->b_data) | 
|  | 1740 | *		c) free the subtrees growing from the inode past the @chain[0]. | 
|  | 1741 | *			(no partially truncated stuff there).  */ | 
|  | 1742 |  | 
|  | 1743 | static Indirect *ext3_find_shared(struct inode *inode, | 
|  | 1744 | int depth, | 
|  | 1745 | int offsets[4], | 
|  | 1746 | Indirect chain[4], | 
|  | 1747 | __le32 *top) | 
|  | 1748 | { | 
|  | 1749 | Indirect *partial, *p; | 
|  | 1750 | int k, err; | 
|  | 1751 |  | 
|  | 1752 | *top = 0; | 
|  | 1753 | /* Make k index the deepest non-null offset + 1 */ | 
|  | 1754 | for (k = depth; k > 1 && !offsets[k-1]; k--) | 
|  | 1755 | ; | 
|  | 1756 | partial = ext3_get_branch(inode, k, offsets, chain, &err); | 
|  | 1757 | /* Writer: pointers */ | 
|  | 1758 | if (!partial) | 
|  | 1759 | partial = chain + k-1; | 
|  | 1760 | /* | 
|  | 1761 | * If the branch acquired a continuation since we've looked at it - | 
|  | 1762 | * fine, it should all survive and (new) top doesn't belong to us. | 
|  | 1763 | */ | 
|  | 1764 | if (!partial->key && *partial->p) | 
|  | 1765 | /* Writer: end */ | 
|  | 1766 | goto no_top; | 
|  | 1767 | for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) | 
|  | 1768 | ; | 
|  | 1769 | /* | 
|  | 1770 | * OK, we've found the last block that must survive. The rest of our | 
|  | 1771 | * branch should be detached before unlocking. However, if that rest | 
|  | 1772 | * of branch is all ours and does not grow immediately from the inode | 
|  | 1773 | * it's easier to cheat and just decrement partial->p. | 
|  | 1774 | */ | 
|  | 1775 | if (p == chain + k - 1 && p > chain) { | 
|  | 1776 | p->p--; | 
|  | 1777 | } else { | 
|  | 1778 | *top = *p->p; | 
|  | 1779 | /* Nope, don't do this in ext3.  Must leave the tree intact */ | 
|  | 1780 | #if 0 | 
|  | 1781 | *p->p = 0; | 
|  | 1782 | #endif | 
|  | 1783 | } | 
|  | 1784 | /* Writer: end */ | 
|  | 1785 |  | 
|  | 1786 | while (partial > p) { | 
|  | 1788 | brelse(partial->bh); | 
|  | 1789 | partial--; | 
|  | 1790 | } | 
|  | 1791 | no_top: | 
|  | 1792 | return partial; | 
|  | 1793 | } | 
|  | 1794 |  | 
|  | 1795 | /* | 
|  | 1796 | * Zero a number of block pointers in either an inode or an indirect block. | 
|  | 1797 | * If we restart the transaction we must again get write access to the | 
|  | 1798 | * indirect block for further modification. | 
|  | 1799 | * | 
|  | 1800 | * We release `count' blocks on disk, but (last - first) may be greater | 
|  | 1801 | * than `count' because there can be holes in there. | 
|  | 1802 | */ | 
|  | 1803 | static void | 
|  | 1804 | ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, | 
|  | 1805 | unsigned long block_to_free, unsigned long count, | 
|  | 1806 | __le32 *first, __le32 *last) | 
|  | 1807 | { | 
|  | 1808 | __le32 *p; | 
|  | 1809 | if (try_to_extend_transaction(handle, inode)) { | 
|  | 1810 | if (bh) { | 
|  | 1811 | BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); | 
|  | 1812 | ext3_journal_dirty_metadata(handle, bh); | 
|  | 1813 | } | 
|  | 1814 | ext3_mark_inode_dirty(handle, inode); | 
|  | 1815 | ext3_journal_test_restart(handle, inode); | 
|  | 1816 | if (bh) { | 
|  | 1817 | BUFFER_TRACE(bh, "retaking write access"); | 
|  | 1818 | ext3_journal_get_write_access(handle, bh); | 
|  | 1819 | } | 
|  | 1820 | } | 
|  | 1821 |  | 
|  | 1822 | /* | 
|  | 1823 | * Any buffers which are on the journal will be in memory. We find | 
|  | 1824 | * them on the hash table so journal_revoke() will run journal_forget() | 
|  | 1825 | * on them.  We've already detached each block from the file, so | 
|  | 1826 | * bforget() in journal_forget() should be safe. | 
|  | 1827 | * | 
|  | 1828 | * AKPM: turn on bforget in journal_forget()!!! | 
|  | 1829 | */ | 
|  | 1830 | for (p = first; p < last; p++) { | 
|  | 1831 | u32 nr = le32_to_cpu(*p); | 
|  | 1832 | if (nr) { | 
|  | 1833 | struct buffer_head *bh; | 
|  | 1834 |  | 
|  | 1835 | *p = 0; | 
|  | 1836 | bh = sb_find_get_block(inode->i_sb, nr); | 
|  | 1837 | ext3_forget(handle, 0, inode, bh, nr); | 
|  | 1838 | } | 
|  | 1839 | } | 
|  | 1840 |  | 
|  | 1841 | ext3_free_blocks(handle, inode, block_to_free, count); | 
|  | 1842 | } | 
|  | 1843 |  | 
|  | 1844 | /** | 
|  | 1845 | * ext3_free_data - free a list of data blocks | 
|  | 1846 | * @handle:	handle for this transaction | 
|  | 1847 | * @inode:	inode we are dealing with | 
|  | 1848 | * @this_bh:	indirect buffer_head which contains *@first and *@last | 
|  | 1849 | * @first:	array of block numbers | 
|  | 1850 | * @last:	points immediately past the end of array | 
|  | 1851 | * | 
|  | 1852 | * We are freeing all blocks referred to from that array (numbers are stored as | 
|  | 1853 | * little-endian 32-bit) and updating @inode->i_blocks appropriately. | 
|  | 1854 | * | 
|  | 1855 | * We accumulate contiguous runs of blocks to free.  Conveniently, if these | 
|  | 1856 | * blocks are contiguous then releasing them at one time will only affect one | 
|  | 1857 | * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't | 
|  | 1858 | * actually use a lot of journal space. | 
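|  |  | * | 
|  |  | * As a hypothetical illustration: freeing the list {100, 101, 102, 200} | 
|  |  | * releases 100-102 as a single three-block run and then 200 as a run of | 
|  |  | * one, i.e. two ext3_clear_blocks() calls rather than four. | 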
|  | 1859 | * | 
|  | 1860 | * @this_bh will be %NULL if @first and @last point into the inode's direct | 
|  | 1861 | * block pointers. | 
|  | 1862 | */ | 
|  | 1863 | static void ext3_free_data(handle_t *handle, struct inode *inode, | 
|  | 1864 | struct buffer_head *this_bh, | 
|  | 1865 | __le32 *first, __le32 *last) | 
|  | 1866 | { | 
|  | 1867 | unsigned long block_to_free = 0;    /* Starting block # of a run */ | 
|  | 1868 | unsigned long count = 0;	    /* Number of blocks in the run */ | 
|  | 1869 | __le32 *block_to_free_p = NULL;	    /* Pointer into inode/ind | 
|  | 1870 | corresponding to | 
|  | 1871 | block_to_free */ | 
|  | 1872 | unsigned long nr;		    /* Current block # */ | 
|  | 1873 | __le32 *p;			    /* Pointer into inode/ind | 
|  | 1874 | for current block */ | 
|  | 1875 | int err; | 
|  | 1876 |  | 
|  | 1877 | if (this_bh) {				/* For indirect block */ | 
|  | 1878 | BUFFER_TRACE(this_bh, "get_write_access"); | 
|  | 1879 | err = ext3_journal_get_write_access(handle, this_bh); | 
|  | 1880 | /* Important: if we can't update the indirect pointers | 
|  | 1881 | * to the blocks, we can't free them. */ | 
|  | 1882 | if (err) | 
|  | 1883 | return; | 
|  | 1884 | } | 
|  | 1885 |  | 
|  | 1886 | for (p = first; p < last; p++) { | 
|  | 1887 | nr = le32_to_cpu(*p); | 
|  | 1888 | if (nr) { | 
|  | 1889 | /* accumulate blocks to free if they're contiguous */ | 
|  | 1890 | if (count == 0) { | 
|  | 1891 | block_to_free = nr; | 
|  | 1892 | block_to_free_p = p; | 
|  | 1893 | count = 1; | 
|  | 1894 | } else if (nr == block_to_free + count) { | 
|  | 1895 | count++; | 
|  | 1896 | } else { | 
|  | 1897 | ext3_clear_blocks(handle, inode, this_bh, | 
|  | 1898 | block_to_free, | 
|  | 1899 | count, block_to_free_p, p); | 
|  | 1900 | block_to_free = nr; | 
|  | 1901 | block_to_free_p = p; | 
|  | 1902 | count = 1; | 
|  | 1903 | } | 
|  | 1904 | } | 
|  | 1905 | } | 
|  | 1906 |  | 
|  | 1907 | if (count > 0) | 
|  | 1908 | ext3_clear_blocks(handle, inode, this_bh, block_to_free, | 
|  | 1909 | count, block_to_free_p, p); | 
|  | 1910 |  | 
|  | 1911 | if (this_bh) { | 
|  | 1912 | BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata"); | 
|  | 1913 | ext3_journal_dirty_metadata(handle, this_bh); | 
|  | 1914 | } | 
|  | 1915 | } | 
|  | 1916 |  | 
|  | 1917 | /** | 
|  | 1918 | *	ext3_free_branches - free an array of branches | 
|  | 1919 | *	@handle: JBD handle for this transaction | 
|  | 1920 | *	@inode:	inode we are dealing with | 
|  | 1921 | *	@parent_bh: the buffer_head which contains *@first and *@last | 
|  | 1922 | *	@first:	array of block numbers | 
|  | 1923 | *	@last:	pointer immediately past the end of array | 
|  | 1924 | *	@depth:	depth of the branches to free | 
|  | 1925 | * | 
|  | 1926 | *	We are freeing all blocks referred to from these branches (numbers are | 
|  | 1927 | *	stored as little-endian 32-bit) and updating @inode->i_blocks | 
|  | 1928 | *	appropriately. | 
|  | 1929 | */ | 
|  | 1930 | static void ext3_free_branches(handle_t *handle, struct inode *inode, | 
|  | 1931 | struct buffer_head *parent_bh, | 
|  | 1932 | __le32 *first, __le32 *last, int depth) | 
|  | 1933 | { | 
|  | 1934 | unsigned long nr; | 
|  | 1935 | __le32 *p; | 
|  | 1936 |  | 
|  | 1937 | if (is_handle_aborted(handle)) | 
|  | 1938 | return; | 
|  | 1939 |  | 
|  | 1940 | if (depth--) { | 
|  | 1941 | struct buffer_head *bh; | 
|  | 1942 | int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); | 
|  | 1943 | p = last; | 
|  | 1944 | while (--p >= first) { | 
|  | 1945 | nr = le32_to_cpu(*p); | 
|  | 1946 | if (!nr) | 
|  | 1947 | continue;		/* A hole */ | 
|  | 1948 |  | 
|  | 1949 | /* Go read the buffer for the next level down */ | 
|  | 1950 | bh = sb_bread(inode->i_sb, nr); | 
|  | 1951 |  | 
|  | 1952 | /* | 
|  | 1953 | * A read failure? Report error and clear slot | 
|  | 1954 | * (should be rare). | 
|  | 1955 | */ | 
|  | 1956 | if (!bh) { | 
|  | 1957 | ext3_error(inode->i_sb, "ext3_free_branches", | 
|  | 1958 | "Read failure, inode=%ld, block=%ld", | 
|  | 1959 | inode->i_ino, nr); | 
|  | 1960 | continue; | 
|  | 1961 | } | 
|  | 1962 |  | 
|  | 1963 | /* This zaps the entire block.  Bottom up. */ | 
|  | 1964 | BUFFER_TRACE(bh, "free child branches"); | 
|  | 1965 | ext3_free_branches(handle, inode, bh, | 
|  | 1966 | (__le32*)bh->b_data, | 
|  | 1967 | (__le32*)bh->b_data + addr_per_block, | 
|  | 1968 | depth); | 
|  | 1969 |  | 
|  | 1970 | /* | 
|  | 1971 | * We've probably journalled the indirect block several | 
|  | 1972 | * times during the truncate.  But it's no longer | 
|  | 1973 | * needed and we now drop it from the transaction via | 
|  | 1974 | * journal_revoke(). | 
|  | 1975 | * | 
|  | 1976 | * That's easy if it's exclusively part of this | 
|  | 1977 | * transaction.  But if it's part of the committing | 
|  | 1978 | * transaction then journal_forget() will simply | 
|  | 1979 | * brelse() it.  That means that if the underlying | 
|  | 1980 | * block is reallocated in ext3_get_block(), | 
|  | 1981 | * unmap_underlying_metadata() will find this block | 
|  | 1982 | * and will try to get rid of it.  damn, damn. | 
|  | 1983 | * | 
|  | 1984 | * If this block has already been committed to the | 
|  | 1985 | * journal, a revoke record will be written.  And | 
|  | 1986 | * revoke records must be emitted *before* clearing | 
|  | 1987 | * this block's bit in the bitmaps. | 
|  | 1988 | */ | 
|  | 1989 | ext3_forget(handle, 1, inode, bh, bh->b_blocknr); | 
|  | 1990 |  | 
|  | 1991 | /* | 
|  | 1992 | * Everything below this pointer has been | 
|  | 1993 | * released.  Now let this top-of-subtree go. | 
|  | 1994 | * | 
|  | 1995 | * We want the freeing of this indirect block to be | 
|  | 1996 | * atomic in the journal with the updating of the | 
|  | 1997 | * bitmap block which owns it.  So make some room in | 
|  | 1998 | * the journal. | 
|  | 1999 | * | 
|  | 2000 | * We zero the parent pointer *after* freeing its | 
|  | 2001 | * pointee in the bitmaps, so if extend_transaction() | 
|  | 2002 | * for some reason fails to put the bitmap changes and | 
|  | 2003 | * the release into the same transaction, recovery | 
|  | 2004 | * will merely complain about releasing a free block, | 
|  | 2005 | * rather than leaking blocks. | 
|  | 2006 | */ | 
|  | 2007 | if (is_handle_aborted(handle)) | 
|  | 2008 | return; | 
|  | 2009 | if (try_to_extend_transaction(handle, inode)) { | 
|  | 2010 | ext3_mark_inode_dirty(handle, inode); | 
|  | 2011 | ext3_journal_test_restart(handle, inode); | 
|  | 2012 | } | 
|  | 2013 |  | 
|  | 2014 | ext3_free_blocks(handle, inode, nr, 1); | 
|  | 2015 |  | 
|  | 2016 | if (parent_bh) { | 
|  | 2017 | /* | 
|  | 2018 | * The block which we have just freed is | 
|  | 2019 | * pointed to by an indirect block: journal it | 
|  | 2020 | */ | 
|  | 2021 | BUFFER_TRACE(parent_bh, "get_write_access"); | 
|  | 2022 | if (!ext3_journal_get_write_access(handle, | 
|  | 2023 | parent_bh)){ | 
|  | 2024 | *p = 0; | 
|  | 2025 | BUFFER_TRACE(parent_bh, | 
|  | 2026 | "call ext3_journal_dirty_metadata"); | 
|  | 2027 | ext3_journal_dirty_metadata(handle, | 
|  | 2028 | parent_bh); | 
|  | 2029 | } | 
|  | 2030 | } | 
|  | 2031 | } | 
|  | 2032 | } else { | 
|  | 2033 | /* We have reached the bottom of the tree. */ | 
|  | 2034 | BUFFER_TRACE(parent_bh, "free data blocks"); | 
|  | 2035 | ext3_free_data(handle, inode, parent_bh, first, last); | 
|  | 2036 | } | 
|  | 2037 | } | 
|  | 2038 |  | 
|  | 2039 | /* | 
|  | 2040 | * ext3_truncate() | 
|  | 2041 | * | 
|  | 2042 | * We block out ext3_get_block() block instantiations across the entire | 
|  | 2043 | * transaction, and VFS/VM ensures that ext3_truncate() cannot run | 
|  | 2044 | * simultaneously on behalf of the same inode. | 
|  | 2045 | * | 
|  | 2046 | * As we work through the truncate and commit bits of it to the journal there | 
|  | 2047 | * is one core, guiding principle: the file's tree must always be consistent on | 
|  | 2048 | * disk.  We must be able to restart the truncate after a crash. | 
|  | 2049 | * | 
|  | 2050 | * The file's tree may be transiently inconsistent in memory (although it | 
|  | 2051 | * probably isn't), but whenever we close off and commit a journal transaction, | 
|  | 2052 | * the contents of (the filesystem + the journal) must be consistent and | 
|  | 2053 | * restartable.  It's pretty simple, really: bottom up, right to left (although | 
|  | 2054 | * left-to-right works OK too). | 
|  | 2055 | * | 
|  | 2056 | * Note that at recovery time, journal replay occurs *before* the restart of | 
|  | 2057 | * truncate against the orphan inode list. | 
|  | 2058 | * | 
|  | 2059 | * The committed inode has the new, desired i_size (which is the same as | 
|  | 2060 | * i_disksize in this case).  After a crash, ext3_orphan_cleanup() will see | 
|  | 2061 | * that this inode's truncate did not complete and it will again call | 
|  | 2062 | * ext3_truncate() to have another go.  So there will be instantiated blocks | 
|  | 2063 | * to the right of the truncation point in a crashed ext3 filesystem.  But | 
|  | 2064 | * that's fine - as long as they are linked from the inode, the post-crash | 
|  | 2065 | * ext3_truncate() run will find them and release them. | 
|  | 2066 | */ | 
|  | 2067 |  | 
|  | 2068 | void ext3_truncate(struct inode * inode) | 
|  | 2069 | { | 
|  | 2070 | handle_t *handle; | 
|  | 2071 | struct ext3_inode_info *ei = EXT3_I(inode); | 
|  | 2072 | __le32 *i_data = ei->i_data; | 
|  | 2073 | int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb); | 
|  | 2074 | struct address_space *mapping = inode->i_mapping; | 
|  | 2075 | int offsets[4]; | 
|  | 2076 | Indirect chain[4]; | 
|  | 2077 | Indirect *partial; | 
|  | 2078 | __le32 nr = 0; | 
|  | 2079 | int n; | 
|  | 2080 | long last_block; | 
|  | 2081 | unsigned blocksize = inode->i_sb->s_blocksize; | 
|  | 2082 | struct page *page; | 
|  | 2083 |  | 
|  | 2084 | if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 
|  | 2085 | S_ISLNK(inode->i_mode))) | 
|  | 2086 | return; | 
|  | 2087 | if (ext3_inode_is_fast_symlink(inode)) | 
|  | 2088 | return; | 
|  | 2089 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | 
|  | 2090 | return; | 
|  | 2091 |  | 
|  | 2092 | /* | 
|  | 2093 | * We have to lock the EOF page here, because lock_page() nests | 
|  | 2094 | * outside journal_start(). | 
|  | 2095 | */ | 
|  | 2096 | if ((inode->i_size & (blocksize - 1)) == 0) { | 
|  | 2097 | /* Block boundary? Nothing to do */ | 
|  | 2098 | page = NULL; | 
|  | 2099 | } else { | 
|  | 2100 | page = grab_cache_page(mapping, | 
|  | 2101 | inode->i_size >> PAGE_CACHE_SHIFT); | 
|  | 2102 | if (!page) | 
|  | 2103 | return; | 
|  | 2104 | } | 
|  | 2105 |  | 
|  | 2106 | handle = start_transaction(inode); | 
|  | 2107 | if (IS_ERR(handle)) { | 
|  | 2108 | if (page) { | 
|  | 2109 | clear_highpage(page); | 
|  | 2110 | flush_dcache_page(page); | 
|  | 2111 | unlock_page(page); | 
|  | 2112 | page_cache_release(page); | 
|  | 2113 | } | 
|  | 2114 | return;		/* AKPM: return what? */ | 
|  | 2115 | } | 
|  | 2116 |  | 
|  | 2117 | last_block = (inode->i_size + blocksize-1) | 
|  | 2118 | >> EXT3_BLOCK_SIZE_BITS(inode->i_sb); | 
|  | 2119 |  | 
|  | 2120 | if (page) | 
|  | 2121 | ext3_block_truncate_page(handle, page, mapping, inode->i_size); | 
|  | 2122 |  | 
|  | 2123 | n = ext3_block_to_path(inode, last_block, offsets, NULL); | 
|  | 2124 | if (n == 0) | 
|  | 2125 | goto out_stop;	/* error */ | 
|  | 2126 |  | 
|  | 2127 | /* | 
|  | 2128 | * OK.  This truncate is going to happen.  We add the inode to the | 
|  | 2129 | * orphan list, so that if this truncate spans multiple transactions, | 
|  | 2130 | * and we crash, we will resume the truncate when the filesystem | 
|  | 2131 | * recovers.  It also marks the inode dirty, to catch the new size. | 
|  | 2132 | * | 
|  | 2133 | * Implication: the file must always be in a sane, consistent | 
|  | 2134 | * truncatable state while each transaction commits. | 
|  | 2135 | */ | 
|  | 2136 | if (ext3_orphan_add(handle, inode)) | 
|  | 2137 | goto out_stop; | 
|  | 2138 |  | 
|  | 2139 | /* | 
|  | 2140 | * The orphan list entry will now protect us from any crash which | 
|  | 2141 | * occurs before the truncate completes, so it is now safe to propagate | 
|  | 2142 | * the new, shorter inode size (held for now in i_size) into the | 
|  | 2143 | * on-disk inode. We do this via i_disksize, which is the value which | 
|  | 2144 | * ext3 *really* writes onto the disk inode. | 
|  | 2145 | */ | 
|  | 2146 | ei->i_disksize = inode->i_size; | 
|  | 2147 |  | 
|  | 2148 | /* | 
|  | 2149 | * From here we block out all ext3_get_block() callers who want to | 
|  | 2150 | * modify the block allocation tree. | 
|  | 2151 | */ | 
|  | 2152 | down(&ei->truncate_sem); | 
|  | 2153 |  | 
|  | 2154 | if (n == 1) {		/* direct blocks */ | 
|  | 2155 | ext3_free_data(handle, inode, NULL, i_data+offsets[0], | 
|  | 2156 | i_data + EXT3_NDIR_BLOCKS); | 
|  | 2157 | goto do_indirects; | 
|  | 2158 | } | 
|  | 2159 |  | 
|  | 2160 | partial = ext3_find_shared(inode, n, offsets, chain, &nr); | 
|  | 2161 | /* Kill the top of shared branch (not detached) */ | 
|  | 2162 | if (nr) { | 
|  | 2163 | if (partial == chain) { | 
|  | 2164 | /* Shared branch grows from the inode */ | 
|  | 2165 | ext3_free_branches(handle, inode, NULL, | 
|  | 2166 | &nr, &nr+1, (chain+n-1) - partial); | 
|  | 2167 | *partial->p = 0; | 
|  | 2168 | /* | 
|  | 2169 | * We mark the inode dirty prior to restart, | 
|  | 2170 | * and prior to stop.  No need for it here. | 
|  | 2171 | */ | 
|  | 2172 | } else { | 
|  | 2173 | /* Shared branch grows from an indirect block */ | 
|  | 2174 | BUFFER_TRACE(partial->bh, "get_write_access"); | 
|  | 2175 | ext3_free_branches(handle, inode, partial->bh, | 
|  | 2176 | partial->p, | 
|  | 2177 | partial->p+1, (chain+n-1) - partial); | 
|  | 2178 | } | 
|  | 2179 | } | 
|  | 2180 | /* Clear the ends of indirect blocks on the shared branch */ | 
|  | 2181 | while (partial > chain) { | 
|  | 2182 | ext3_free_branches(handle, inode, partial->bh, partial->p + 1, | 
|  | 2183 | (__le32*)partial->bh->b_data+addr_per_block, | 
|  | 2184 | (chain+n-1) - partial); | 
|  | 2185 | BUFFER_TRACE(partial->bh, "call brelse"); | 
|  | 2186 | brelse (partial->bh); | 
|  | 2187 | partial--; | 
|  | 2188 | } | 
|  | 2189 | do_indirects: | 
|  | 2190 | /* Kill the remaining (whole) subtrees */ | 
|  | 2191 | switch (offsets[0]) { | 
|  | 2192 | default: | 
|  | 2193 | nr = i_data[EXT3_IND_BLOCK]; | 
|  | 2194 | if (nr) { | 
|  | 2195 | ext3_free_branches(handle, inode, NULL, | 
|  | 2196 | &nr, &nr+1, 1); | 
|  | 2197 | i_data[EXT3_IND_BLOCK] = 0; | 
|  | 2198 | } | 
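|  |  | /* no break: deliberately fall through and free the double-indirect tree too */ | 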
|  | 2199 | case EXT3_IND_BLOCK: | 
|  | 2200 | nr = i_data[EXT3_DIND_BLOCK]; | 
|  | 2201 | if (nr) { | 
|  | 2202 | ext3_free_branches(handle, inode, NULL, | 
|  | 2203 | &nr, &nr+1, 2); | 
|  | 2204 | i_data[EXT3_DIND_BLOCK] = 0; | 
|  | 2205 | } | 
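|  |  | /* no break: deliberately fall through and free the triple-indirect tree too */ | 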
|  | 2206 | case EXT3_DIND_BLOCK: | 
|  | 2207 | nr = i_data[EXT3_TIND_BLOCK]; | 
|  | 2208 | if (nr) { | 
|  | 2209 | ext3_free_branches(handle, inode, NULL, | 
|  | 2210 | &nr, &nr+1, 3); | 
|  | 2211 | i_data[EXT3_TIND_BLOCK] = 0; | 
|  | 2212 | } | 
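|  |  | /* fall through */ | 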
|  | 2213 | case EXT3_TIND_BLOCK: | 
|  | 2214 | ; | 
|  | 2215 | } | 
|  | 2216 |  | 
|  | 2217 | ext3_discard_reservation(inode); | 
|  | 2218 |  | 
|  | 2219 | up(&ei->truncate_sem); | 
|  | 2220 | inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; | 
|  | 2221 | ext3_mark_inode_dirty(handle, inode); | 
|  | 2222 |  | 
|  | 2223 | /* In a multi-transaction truncate, we only make the final | 
|  | 2224 | * transaction synchronous */ | 
|  | 2225 | if (IS_SYNC(inode)) | 
|  | 2226 | handle->h_sync = 1; | 
|  | 2227 | out_stop: | 
|  | 2228 | /* | 
|  | 2229 | * If this was a simple ftruncate(), and the file will remain alive | 
|  | 2230 | * then we need to clear up the orphan record which we created above. | 
|  | 2231 | * However, if this was a real unlink then we were called by | 
|  | 2232 | * ext3_delete_inode(), and we allow that function to clean up the | 
|  | 2233 | * orphan info for us. | 
|  | 2234 | */ | 
|  | 2235 | if (inode->i_nlink) | 
|  | 2236 | ext3_orphan_del(handle, inode); | 
|  | 2237 |  | 
|  | 2238 | ext3_journal_stop(handle); | 
|  | 2239 | } | 
|  | 2240 |  | 
|  | 2241 | static unsigned long ext3_get_inode_block(struct super_block *sb, | 
|  | 2242 | unsigned long ino, struct ext3_iloc *iloc) | 
|  | 2243 | { | 
|  | 2244 | unsigned long desc, group_desc, block_group; | 
|  | 2245 | unsigned long offset, block; | 
|  | 2246 | struct buffer_head *bh; | 
|  | 2247 | struct ext3_group_desc * gdp; | 
|  | 2248 |  | 
|  | 2249 |  | 
|  | 2250 | if ((ino != EXT3_ROOT_INO && | 
|  | 2251 | ino != EXT3_JOURNAL_INO && | 
|  | 2252 | ino != EXT3_RESIZE_INO && | 
|  | 2253 | ino < EXT3_FIRST_INO(sb)) || | 
|  | 2254 | ino > le32_to_cpu( | 
|  | 2255 | EXT3_SB(sb)->s_es->s_inodes_count)) { | 
|  | 2256 | ext3_error (sb, "ext3_get_inode_block", | 
|  | 2257 | "bad inode number: %lu", ino); | 
|  | 2258 | return 0; | 
|  | 2259 | } | 
|  | 2260 | block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); | 
|  | 2261 | if (block_group >= EXT3_SB(sb)->s_groups_count) { | 
|  | 2262 | ext3_error (sb, "ext3_get_inode_block", | 
|  | 2263 | "group >= groups count"); | 
|  | 2264 | return 0; | 
|  | 2265 | } | 
|  | 2266 | smp_rmb(); | 
|  | 2267 | group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); | 
|  | 2268 | desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); | 
|  | 2269 | bh = EXT3_SB(sb)->s_group_desc[group_desc]; | 
|  | 2270 | if (!bh) { | 
|  | 2271 | ext3_error (sb, "ext3_get_inode_block", | 
|  | 2272 | "Descriptor not loaded"); | 
|  | 2273 | return 0; | 
|  | 2274 | } | 
|  | 2275 |  | 
|  | 2276 | gdp = (struct ext3_group_desc *) bh->b_data; | 
|  | 2277 | /* | 
|  | 2278 | * Figure out the offset within the block group inode table | 
|  | 2279 | */ | 
|  | 2280 | offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * | 
|  | 2281 | EXT3_INODE_SIZE(sb); | 
|  | 2282 | block = le32_to_cpu(gdp[desc].bg_inode_table) + | 
|  | 2283 | (offset >> EXT3_BLOCK_SIZE_BITS(sb)); | 
|  | 2284 |  | 
|  | 2285 | iloc->block_group = block_group; | 
|  | 2286 | iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1); | 
|  | 2287 | return block; | 
|  | 2288 | } | 
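|  |  |  | 
|  |  | /* | 
|  |  | * A worked example of the arithmetic above, using hypothetical geometry | 
|  |  | * (not taken from any real superblock): with 4096-byte blocks, 128-byte | 
|  |  | * inodes and 32768 inodes per group, ino 40000 gives | 
|  |  | * block_group = 39999 / 32768 = 1, | 
|  |  | * offset = (39999 % 32768) * 128 = 925568, | 
|  |  | * block = bg_inode_table + (925568 >> 12) = bg_inode_table + 225, and | 
|  |  | * iloc->offset = 925568 & 4095 = 3968. | 
|  |  | */ | 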
|  | 2289 |  | 
|  | 2290 | /* | 
|  | 2291 | * ext3_get_inode_loc returns with an extra refcount against the inode's | 
|  | 2292 | * underlying buffer_head on success. If 'in_mem' is true, we have all | 
|  | 2293 | * data in memory that is needed to recreate the on-disk version of this | 
|  | 2294 | * inode. | 
|  | 2295 | */ | 
|  | 2296 | static int __ext3_get_inode_loc(struct inode *inode, | 
|  | 2297 | struct ext3_iloc *iloc, int in_mem) | 
|  | 2298 | { | 
|  | 2299 | unsigned long block; | 
|  | 2300 | struct buffer_head *bh; | 
|  | 2301 |  | 
|  | 2302 | block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc); | 
|  | 2303 | if (!block) | 
|  | 2304 | return -EIO; | 
|  | 2305 |  | 
|  | 2306 | bh = sb_getblk(inode->i_sb, block); | 
|  | 2307 | if (!bh) { | 
|  | 2308 | ext3_error (inode->i_sb, "ext3_get_inode_loc", | 
|  | 2309 | "unable to read inode block - " | 
|  | 2310 | "inode=%lu, block=%lu", inode->i_ino, block); | 
|  | 2311 | return -EIO; | 
|  | 2312 | } | 
|  | 2313 | if (!buffer_uptodate(bh)) { | 
|  | 2314 | lock_buffer(bh); | 
|  | 2315 | if (buffer_uptodate(bh)) { | 
|  | 2316 | /* someone brought it uptodate while we waited */ | 
|  | 2317 | unlock_buffer(bh); | 
|  | 2318 | goto has_buffer; | 
|  | 2319 | } | 
|  | 2320 |  | 
|  | 2321 | /* | 
|  | 2322 | * If we have all information of the inode in memory and this | 
|  | 2323 | * is the only valid inode in the block, we need not read the | 
|  | 2324 | * block. | 
|  | 2325 | */ | 
|  | 2326 | if (in_mem) { | 
|  | 2327 | struct buffer_head *bitmap_bh; | 
|  | 2328 | struct ext3_group_desc *desc; | 
|  | 2329 | int inodes_per_buffer; | 
|  | 2330 | int inode_offset, i; | 
|  | 2331 | int block_group; | 
|  | 2332 | int start; | 
|  | 2333 |  | 
|  | 2334 | block_group = (inode->i_ino - 1) / | 
|  | 2335 | EXT3_INODES_PER_GROUP(inode->i_sb); | 
|  | 2336 | inodes_per_buffer = bh->b_size / | 
|  | 2337 | EXT3_INODE_SIZE(inode->i_sb); | 
|  | 2338 | inode_offset = ((inode->i_ino - 1) % | 
|  | 2339 | EXT3_INODES_PER_GROUP(inode->i_sb)); | 
|  | 2340 | start = inode_offset & ~(inodes_per_buffer - 1); | 
|  | 2341 |  | 
|  | 2342 | /* Is the inode bitmap in cache? */ | 
|  | 2343 | desc = ext3_get_group_desc(inode->i_sb, | 
|  | 2344 | block_group, NULL); | 
|  | 2345 | if (!desc) | 
|  | 2346 | goto make_io; | 
|  | 2347 |  | 
|  | 2348 | bitmap_bh = sb_getblk(inode->i_sb, | 
|  | 2349 | le32_to_cpu(desc->bg_inode_bitmap)); | 
|  | 2350 | if (!bitmap_bh) | 
|  | 2351 | goto make_io; | 
|  | 2352 |  | 
|  | 2353 | /* | 
|  | 2354 | * If the inode bitmap isn't in cache then the | 
|  | 2355 | * optimisation may end up performing two reads instead | 
|  | 2356 | * of one, so skip it. | 
|  | 2357 | */ | 
|  | 2358 | if (!buffer_uptodate(bitmap_bh)) { | 
|  | 2359 | brelse(bitmap_bh); | 
|  | 2360 | goto make_io; | 
|  | 2361 | } | 
|  | 2362 | for (i = start; i < start + inodes_per_buffer; i++) { | 
|  | 2363 | if (i == inode_offset) | 
|  | 2364 | continue; | 
|  | 2365 | if (ext3_test_bit(i, bitmap_bh->b_data)) | 
|  | 2366 | break; | 
|  | 2367 | } | 
|  | 2368 | brelse(bitmap_bh); | 
|  | 2369 | if (i == start + inodes_per_buffer) { | 
|  | 2370 | /* all other inodes are free, so skip I/O */ | 
|  | 2371 | memset(bh->b_data, 0, bh->b_size); | 
|  | 2372 | set_buffer_uptodate(bh); | 
|  | 2373 | unlock_buffer(bh); | 
|  | 2374 | goto has_buffer; | 
|  | 2375 | } | 
|  | 2376 | } | 
|  | 2377 |  | 
|  | 2378 | make_io: | 
|  | 2379 | /* | 
|  | 2380 | * There are other valid inodes in the buffer, this inode | 
|  | 2381 | * has in-inode xattrs, or we don't have this inode in memory. | 
|  | 2382 | * Read the block from disk. | 
|  | 2383 | */ | 
|  | 2384 | get_bh(bh); | 
|  | 2385 | bh->b_end_io = end_buffer_read_sync; | 
|  | 2386 | submit_bh(READ, bh); | 
|  | 2387 | wait_on_buffer(bh); | 
|  | 2388 | if (!buffer_uptodate(bh)) { | 
|  | 2389 | ext3_error(inode->i_sb, "ext3_get_inode_loc", | 
|  | 2390 | "unable to read inode block - " | 
|  | 2391 | "inode=%lu, block=%lu", | 
|  | 2392 | inode->i_ino, block); | 
|  | 2393 | brelse(bh); | 
|  | 2394 | return -EIO; | 
|  | 2395 | } | 
|  | 2396 | } | 
|  | 2397 | has_buffer: | 
|  | 2398 | iloc->bh = bh; | 
|  | 2399 | return 0; | 
|  | 2400 | } | 
|  | 2401 |  | 
|  | 2402 | int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc) | 
|  | 2403 | { | 
|  | 2404 | /* We have all inode data except xattrs in memory here. */ | 
|  | 2405 | return __ext3_get_inode_loc(inode, iloc, | 
|  | 2406 | !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR)); | 
|  | 2407 | } | 
|  | 2408 |  | 
|  | 2409 | void ext3_set_inode_flags(struct inode *inode) | 
|  | 2410 | { | 
|  | 2411 | unsigned int flags = EXT3_I(inode)->i_flags; | 
|  | 2412 |  | 
|  | 2413 | inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); | 
|  | 2414 | if (flags & EXT3_SYNC_FL) | 
|  | 2415 | inode->i_flags |= S_SYNC; | 
|  | 2416 | if (flags & EXT3_APPEND_FL) | 
|  | 2417 | inode->i_flags |= S_APPEND; | 
|  | 2418 | if (flags & EXT3_IMMUTABLE_FL) | 
|  | 2419 | inode->i_flags |= S_IMMUTABLE; | 
|  | 2420 | if (flags & EXT3_NOATIME_FL) | 
|  | 2421 | inode->i_flags |= S_NOATIME; | 
|  | 2422 | if (flags & EXT3_DIRSYNC_FL) | 
|  | 2423 | inode->i_flags |= S_DIRSYNC; | 
|  | 2424 | } | 
|  | 2425 |  | 
|  | 2426 | void ext3_read_inode(struct inode * inode) | 
|  | 2427 | { | 
|  | 2428 | struct ext3_iloc iloc; | 
|  | 2429 | struct ext3_inode *raw_inode; | 
|  | 2430 | struct ext3_inode_info *ei = EXT3_I(inode); | 
|  | 2431 | struct buffer_head *bh; | 
|  | 2432 | int block; | 
|  | 2433 |  | 
|  | 2434 | #ifdef CONFIG_EXT3_FS_POSIX_ACL | 
|  | 2435 | ei->i_acl = EXT3_ACL_NOT_CACHED; | 
|  | 2436 | ei->i_default_acl = EXT3_ACL_NOT_CACHED; | 
|  | 2437 | #endif | 
|  | 2438 | ei->i_block_alloc_info = NULL; | 
|  | 2439 |  | 
|  | 2440 | if (__ext3_get_inode_loc(inode, &iloc, 0)) | 
|  | 2441 | goto bad_inode; | 
|  | 2442 | bh = iloc.bh; | 
|  | 2443 | raw_inode = ext3_raw_inode(&iloc); | 
|  | 2444 | inode->i_mode = le16_to_cpu(raw_inode->i_mode); | 
|  | 2445 | inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); | 
|  | 2446 | inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); | 
|  | 2447 | if(!(test_opt (inode->i_sb, NO_UID32))) { | 
|  | 2448 | inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; | 
|  | 2449 | inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; | 
|  | 2450 | } | 
|  | 2451 | inode->i_nlink = le16_to_cpu(raw_inode->i_links_count); | 
|  | 2452 | inode->i_size = le32_to_cpu(raw_inode->i_size); | 
|  | 2453 | inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime); | 
|  | 2454 | inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime); | 
|  | 2455 | inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime); | 
|  | 2456 | inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0; | 
|  | 2457 |  | 
|  | 2458 | ei->i_state = 0; | 
|  | 2459 | ei->i_dir_start_lookup = 0; | 
|  | 2460 | ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); | 
|  | 2461 | /* We now have enough fields to check if the inode was active or not. | 
|  | 2462 | * This is needed because nfsd might try to access dead inodes; | 
|  | 2463 | * the test is the same one that e2fsck uses. | 
|  | 2464 | * NeilBrown 1999oct15 | 
|  | 2465 | */ | 
|  | 2466 | if (inode->i_nlink == 0) { | 
|  | 2467 | if (inode->i_mode == 0 || | 
|  | 2468 | !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) { | 
|  | 2469 | /* this inode is deleted */ | 
|  | 2470 | brelse (bh); | 
|  | 2471 | goto bad_inode; | 
|  | 2472 | } | 
|  | 2473 | /* The only unlinked inodes we let through here have | 
|  | 2474 | * valid i_mode and are being read by the orphan | 
|  | 2475 | * recovery code: that's fine, we're about to complete | 
|  | 2476 | * the process of deleting those. */ | 
|  | 2477 | } | 
|  | 2478 | inode->i_blksize = PAGE_SIZE;	/* This is the optimal IO size | 
|  | 2479 | * (for stat), not the fs block | 
|  | 2480 | * size */ | 
|  | 2481 | inode->i_blocks = le32_to_cpu(raw_inode->i_blocks); | 
|  | 2482 | ei->i_flags = le32_to_cpu(raw_inode->i_flags); | 
|  | 2483 | #ifdef EXT3_FRAGMENTS | 
|  | 2484 | ei->i_faddr = le32_to_cpu(raw_inode->i_faddr); | 
|  | 2485 | ei->i_frag_no = raw_inode->i_frag; | 
|  | 2486 | ei->i_frag_size = raw_inode->i_fsize; | 
|  | 2487 | #endif | 
|  | 2488 | ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl); | 
|  | 2489 | if (!S_ISREG(inode->i_mode)) { | 
|  | 2490 | ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); | 
|  | 2491 | } else { | 
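|  |  | /* high 32 bits of i_size for regular files; e.g. i_size_high == 1 adds 4 GiB */ | 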
|  | 2492 | inode->i_size |= | 
|  | 2493 | ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; | 
|  | 2494 | } | 
|  | 2495 | ei->i_disksize = inode->i_size; | 
|  | 2496 | inode->i_generation = le32_to_cpu(raw_inode->i_generation); | 
|  | 2497 | ei->i_block_group = iloc.block_group; | 
|  | 2498 | /* | 
|  | 2499 | * NOTE! The in-memory inode i_data array is in little-endian order | 
|  | 2500 | * even on big-endian machines: we do NOT byteswap the block numbers! | 
|  | 2501 | */ | 
|  | 2502 | for (block = 0; block < EXT3_N_BLOCKS; block++) | 
|  | 2503 | ei->i_data[block] = raw_inode->i_block[block]; | 
|  | 2504 | INIT_LIST_HEAD(&ei->i_orphan); | 
|  | 2505 |  | 
|  | 2506 | if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 && | 
|  | 2507 | EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) { | 
|  | 2508 | /* | 
|  | 2509 | * When mke2fs creates big inodes it does not zero out | 
|  | 2510 | * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE, | 
|  | 2511 | * so ignore those first few inodes. | 
|  | 2512 | */ | 
|  | 2513 | ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); | 
|  | 2514 | if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > | 
|  | 2515 | EXT3_INODE_SIZE(inode->i_sb)) | 
|  | 2516 | goto bad_inode; | 
|  | 2517 | if (ei->i_extra_isize == 0) { | 
|  | 2518 | /* The extra space is currently unused. Use it. */ | 
|  | 2519 | ei->i_extra_isize = sizeof(struct ext3_inode) - | 
|  | 2520 | EXT3_GOOD_OLD_INODE_SIZE; | 
|  | 2521 | } else { | 
|  | 2522 | __le32 *magic = (void *)raw_inode + | 
|  | 2523 | EXT3_GOOD_OLD_INODE_SIZE + | 
|  | 2524 | ei->i_extra_isize; | 
|  | 2525 | if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC)) | 
|  | 2526 | ei->i_state |= EXT3_STATE_XATTR; | 
|  | 2527 | } | 
|  | 2528 | } else | 
|  | 2529 | ei->i_extra_isize = 0; | 
|  | 2530 |  | 
|  | 2531 | if (S_ISREG(inode->i_mode)) { | 
|  | 2532 | inode->i_op = &ext3_file_inode_operations; | 
|  | 2533 | inode->i_fop = &ext3_file_operations; | 
|  | 2534 | ext3_set_aops(inode); | 
|  | 2535 | } else if (S_ISDIR(inode->i_mode)) { | 
|  | 2536 | inode->i_op = &ext3_dir_inode_operations; | 
|  | 2537 | inode->i_fop = &ext3_dir_operations; | 
|  | 2538 | } else if (S_ISLNK(inode->i_mode)) { | 
|  | 2539 | if (ext3_inode_is_fast_symlink(inode)) | 
|  | 2540 | inode->i_op = &ext3_fast_symlink_inode_operations; | 
|  | 2541 | else { | 
|  | 2542 | inode->i_op = &ext3_symlink_inode_operations; | 
|  | 2543 | ext3_set_aops(inode); | 
|  | 2544 | } | 
|  | 2545 | } else { | 
|  | 2546 | inode->i_op = &ext3_special_inode_operations; | 
|  | 2547 | if (raw_inode->i_block[0]) | 
|  | 2548 | init_special_inode(inode, inode->i_mode, | 
|  | 2549 | old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); | 
|  | 2550 | else | 
|  | 2551 | init_special_inode(inode, inode->i_mode, | 
|  | 2552 | new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); | 
|  | 2553 | } | 
|  | 2554 | brelse (iloc.bh); | 
|  | 2555 | ext3_set_inode_flags(inode); | 
|  | 2556 | return; | 
|  | 2557 |  | 
|  | 2558 | bad_inode: | 
|  | 2559 | make_bad_inode(inode); | 
|  | 2560 | return; | 
|  | 2561 | } | 
|  | 2562 |  | 
|  | 2563 | /* | 
|  | 2564 | * Post the struct inode info into an on-disk inode location in the | 
|  | 2565 | * buffer-cache.  This gobbles the caller's reference to the | 
|  | 2566 | * buffer_head in the inode location struct. | 
|  | 2567 | * | 
|  | 2568 | * The caller must have write access to iloc->bh. | 
|  | 2569 | */ | 
|  | 2570 | static int ext3_do_update_inode(handle_t *handle, | 
|  | 2571 | struct inode *inode, | 
|  | 2572 | struct ext3_iloc *iloc) | 
|  | 2573 | { | 
|  | 2574 | struct ext3_inode *raw_inode = ext3_raw_inode(iloc); | 
|  | 2575 | struct ext3_inode_info *ei = EXT3_I(inode); | 
|  | 2576 | struct buffer_head *bh = iloc->bh; | 
|  | 2577 | int err = 0, rc, block; | 
|  | 2578 |  | 
|  | 2579 | /* For fields not tracked in the in-memory inode, | 
|  | 2580 | * initialise them to zero for new inodes. */ | 
|  | 2581 | if (ei->i_state & EXT3_STATE_NEW) | 
|  | 2582 | memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size); | 
|  | 2583 |  | 
|  | 2584 | raw_inode->i_mode = cpu_to_le16(inode->i_mode); | 
|  | 2585 | if(!(test_opt(inode->i_sb, NO_UID32))) { | 
|  | 2586 | raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); | 
|  | 2587 | raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); | 
|  | 2588 | /* | 
|  | 2589 | * Fix up interoperability with old kernels. Otherwise, old inodes get | 
|  | 2590 | * re-used with the upper 16 bits of the uid/gid intact | 
|  | 2591 | */ | 
|  | 2592 | if (!ei->i_dtime) { | 
|  | 2593 | raw_inode->i_uid_high = | 
|  | 2594 | cpu_to_le16(high_16_bits(inode->i_uid)); | 
|  | 2595 | raw_inode->i_gid_high = | 
|  | 2596 | cpu_to_le16(high_16_bits(inode->i_gid)); | 
|  | 2597 | } else { | 
|  | 2598 | raw_inode->i_uid_high = 0; | 
|  | 2599 | raw_inode->i_gid_high = 0; | 
|  | 2600 | } | 
|  | 2601 | } else { | 
|  | 2602 | raw_inode->i_uid_low = | 
|  | 2603 | cpu_to_le16(fs_high2lowuid(inode->i_uid)); | 
|  | 2604 | raw_inode->i_gid_low = | 
|  | 2605 | cpu_to_le16(fs_high2lowgid(inode->i_gid)); | 
|  | 2606 | raw_inode->i_uid_high = 0; | 
|  | 2607 | raw_inode->i_gid_high = 0; | 
|  | 2608 | } | 
|  | 2609 | raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); | 
|  | 2610 | raw_inode->i_size = cpu_to_le32(ei->i_disksize); | 
|  | 2611 | raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); | 
|  | 2612 | raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); | 
|  | 2613 | raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); | 
|  | 2614 | raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); | 
|  | 2615 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); | 
|  | 2616 | raw_inode->i_flags = cpu_to_le32(ei->i_flags); | 
|  | 2617 | #ifdef EXT3_FRAGMENTS | 
|  | 2618 | raw_inode->i_faddr = cpu_to_le32(ei->i_faddr); | 
|  | 2619 | raw_inode->i_frag = ei->i_frag_no; | 
|  | 2620 | raw_inode->i_fsize = ei->i_frag_size; | 
|  | 2621 | #endif | 
|  | 2622 | raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl); | 
|  | 2623 | if (!S_ISREG(inode->i_mode)) { | 
|  | 2624 | raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl); | 
|  | 2625 | } else { | 
|  | 2626 | raw_inode->i_size_high = | 
|  | 2627 | cpu_to_le32(ei->i_disksize >> 32); | 
|  | 2628 | if (ei->i_disksize > 0x7fffffffULL) { | 
|  | 2629 | struct super_block *sb = inode->i_sb; | 
|  | 2630 | if (!EXT3_HAS_RO_COMPAT_FEATURE(sb, | 
|  | 2631 | EXT3_FEATURE_RO_COMPAT_LARGE_FILE) || | 
|  | 2632 | EXT3_SB(sb)->s_es->s_rev_level == | 
|  | 2633 | cpu_to_le32(EXT3_GOOD_OLD_REV)) { | 
|  | 2634 | /* If this is the first large file | 
|  | 2635 | * created, add a flag to the superblock. | 
|  | 2636 | */ | 
|  | 2637 | err = ext3_journal_get_write_access(handle, | 
|  | 2638 | EXT3_SB(sb)->s_sbh); | 
|  | 2639 | if (err) | 
|  | 2640 | goto out_brelse; | 
|  | 2641 | ext3_update_dynamic_rev(sb); | 
|  | 2642 | EXT3_SET_RO_COMPAT_FEATURE(sb, | 
|  | 2643 | EXT3_FEATURE_RO_COMPAT_LARGE_FILE); | 
|  | 2644 | sb->s_dirt = 1; | 
|  | 2645 | handle->h_sync = 1; | 
|  | 2646 | err = ext3_journal_dirty_metadata(handle, | 
|  | 2647 | EXT3_SB(sb)->s_sbh); | 
|  | 2648 | } | 
|  | 2649 | } | 
|  | 2650 | } | 
|  | 2651 | raw_inode->i_generation = cpu_to_le32(inode->i_generation); | 
|  | 2652 | if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { | 
|  | 2653 | if (old_valid_dev(inode->i_rdev)) { | 
|  | 2654 | raw_inode->i_block[0] = | 
|  | 2655 | cpu_to_le32(old_encode_dev(inode->i_rdev)); | 
|  | 2656 | raw_inode->i_block[1] = 0; | 
|  | 2657 | } else { | 
|  | 2658 | raw_inode->i_block[0] = 0; | 
|  | 2659 | raw_inode->i_block[1] = | 
|  | 2660 | cpu_to_le32(new_encode_dev(inode->i_rdev)); | 
|  | 2661 | raw_inode->i_block[2] = 0; | 
|  | 2662 | } | 
|  | 2663 | } else for (block = 0; block < EXT3_N_BLOCKS; block++) | 
|  | 2664 | raw_inode->i_block[block] = ei->i_data[block]; | 
|  | 2665 |  | 
|  | 2666 | if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) | 
|  | 2667 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); | 
|  | 2668 |  | 
|  | 2669 | BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); | 
|  | 2670 | rc = ext3_journal_dirty_metadata(handle, bh); | 
|  | 2671 | if (!err) | 
|  | 2672 | err = rc; | 
|  | 2673 | ei->i_state &= ~EXT3_STATE_NEW; | 
|  | 2674 |  | 
|  | 2675 | out_brelse: | 
|  | 2676 | brelse(bh); | 
|  | 2677 | ext3_std_error(inode->i_sb, err); | 
|  | 2678 | return err; | 
|  | 2679 | } | 
|  | 2680 |  | 
|  | 2681 | /* | 
|  | 2682 | * ext3_write_inode() | 
|  | 2683 | * | 
|  | 2684 | * We are called from a few places: | 
|  | 2685 | * | 
|  | 2686 | * - Within generic_file_write() for O_SYNC files. | 
|  | 2687 | *   Here, there will be no transaction running. We wait for any running | 
|  | 2688 | *   transaction to commit. | 
|  | 2689 | * | 
|  | 2690 | * - Within sys_sync(), kupdate and such. | 
|  | 2691 | *   We wait on commit, if told to. | 
|  | 2692 | * | 
|  | 2693 | * - Within prune_icache() (PF_MEMALLOC == true) | 
|  | 2694 | *   Here we simply return.  We can't afford to block kswapd on the | 
|  | 2695 | *   journal commit. | 
|  | 2696 | * | 
|  | 2697 | * In all cases it is actually safe for us to return without doing anything, | 
|  | 2698 | * because the inode has been copied into a raw inode buffer in | 
|  | 2699 | * ext3_mark_inode_dirty().  This is a correctness thing for O_SYNC and for | 
|  | 2700 | * knfsd. | 
|  | 2701 | * | 
|  | 2702 | * Note that we are absolutely dependent upon all inode dirtiers doing the | 
|  | 2703 | * right thing: they *must* call mark_inode_dirty() after dirtying info in | 
|  | 2704 | * which we are interested. | 
|  | 2705 | * | 
|  | 2706 | * It would be a bug for them to not do this.  The code: | 
|  | 2707 | * | 
|  | 2708 | *	mark_inode_dirty(inode) | 
|  | 2709 | *	stuff(); | 
|  | 2710 | *	inode->i_size = expr; | 
|  | 2711 | * | 
|  | 2712 | * is in error because a kswapd-driven write_inode() could occur while | 
|  | 2713 | * `stuff()' is running, and the new i_size will be lost.  Plus the inode | 
|  | 2714 | * will no longer be on the superblock's dirty inode list. | 
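|  |  | * | 
|  |  | * A sketch of the correct ordering (illustrative): | 
|  |  | * | 
|  |  | *	inode->i_size = expr; | 
|  |  | *	stuff(); | 
|  |  | *	mark_inode_dirty(inode); | 
|  |  | * | 
|  |  | * i.e. mark the inode dirty only after every field of interest has | 
|  |  | * been updated. | 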
|  | 2715 | */ | 
|  | 2716 | int ext3_write_inode(struct inode *inode, int wait) | 
|  | 2717 | { | 
|  | 2718 | if (current->flags & PF_MEMALLOC) | 
|  | 2719 | return 0; | 
|  | 2720 |  | 
|  | 2721 | if (ext3_journal_current_handle()) { | 
|  | 2722 | jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n"); | 
|  | 2723 | dump_stack(); | 
|  | 2724 | return -EIO; | 
|  | 2725 | } | 
|  | 2726 |  | 
|  | 2727 | if (!wait) | 
|  | 2728 | return 0; | 
|  | 2729 |  | 
|  | 2730 | return ext3_force_commit(inode->i_sb); | 
|  | 2731 | } | 
|  | 2732 |  | 
|  | 2733 | /* | 
|  | 2734 | * ext3_setattr() | 
|  | 2735 | * | 
|  | 2736 | * Called from notify_change. | 
|  | 2737 | * | 
|  | 2738 | * We want to trap VFS attempts to truncate the file as soon as | 
|  | 2739 | * possible.  In particular, we want to make sure that when the VFS | 
|  | 2740 | * shrinks i_size, we put the inode on the orphan list and modify | 
|  | 2741 | * i_disksize immediately, so that during the subsequent flushing of | 
|  | 2742 | * dirty pages and freeing of disk blocks, we can guarantee that any | 
|  | 2743 | * commit will leave the blocks being flushed in an unused state on | 
|  | 2744 | * disk.  (On recovery, the inode will get truncated and the blocks will | 
|  | 2745 | * be freed, so we have a strong guarantee that no future commit will | 
|  | 2746 | * leave these blocks visible to the user.) | 
|  | 2747 | * | 
|  | 2748 | * Called with inode->sem down. | 
|  | 2749 | */ | 
|  | 2750 | int ext3_setattr(struct dentry *dentry, struct iattr *attr) | 
|  | 2751 | { | 
|  | 2752 | struct inode *inode = dentry->d_inode; | 
|  | 2753 | int error, rc = 0; | 
|  | 2754 | const unsigned int ia_valid = attr->ia_valid; | 
|  | 2755 |  | 
|  | 2756 | error = inode_change_ok(inode, attr); | 
|  | 2757 | if (error) | 
|  | 2758 | return error; | 
|  | 2759 |  | 
|  | 2760 | if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || | 
|  | 2761 | (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { | 
|  | 2762 | handle_t *handle; | 
|  | 2763 |  | 
|  | 2764 | /* (user+group)*(old+new) structure, inode write (sb, | 
|  | 2765 | * inode block, ? - but truncate inode update has it) */ | 
|  | 2766 | handle = ext3_journal_start(inode, 4*EXT3_QUOTA_INIT_BLOCKS+3); | 
|  | 2767 | if (IS_ERR(handle)) { | 
|  | 2768 | error = PTR_ERR(handle); | 
|  | 2769 | goto err_out; | 
|  | 2770 | } | 
|  | 2771 | error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0; | 
|  | 2772 | if (error) { | 
|  | 2773 | ext3_journal_stop(handle); | 
|  | 2774 | return error; | 
|  | 2775 | } | 
|  | 2776 | /* Update corresponding info in inode so that everything is in | 
|  | 2777 | * one transaction */ | 
|  | 2778 | if (attr->ia_valid & ATTR_UID) | 
|  | 2779 | inode->i_uid = attr->ia_uid; | 
|  | 2780 | if (attr->ia_valid & ATTR_GID) | 
|  | 2781 | inode->i_gid = attr->ia_gid; | 
|  | 2782 | error = ext3_mark_inode_dirty(handle, inode); | 
|  | 2783 | ext3_journal_stop(handle); | 
|  | 2784 | } | 
|  | 2785 |  | 
|  | 2786 | if (S_ISREG(inode->i_mode) && | 
|  | 2787 | attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) { | 
|  | 2788 | handle_t *handle; | 
|  | 2789 |  | 
|  | 2790 | handle = ext3_journal_start(inode, 3); | 
|  | 2791 | if (IS_ERR(handle)) { | 
|  | 2792 | error = PTR_ERR(handle); | 
|  | 2793 | goto err_out; | 
|  | 2794 | } | 
|  | 2795 |  | 
|  | 2796 | error = ext3_orphan_add(handle, inode); | 
|  | 2797 | EXT3_I(inode)->i_disksize = attr->ia_size; | 
|  | 2798 | rc = ext3_mark_inode_dirty(handle, inode); | 
|  | 2799 | if (!error) | 
|  | 2800 | error = rc; | 
|  | 2801 | ext3_journal_stop(handle); | 
|  | 2802 | } | 
|  | 2803 |  | 
|  | 2804 | rc = inode_setattr(inode, attr); | 
|  | 2805 |  | 
|  | 2806 | /* If inode_setattr's call to ext3_truncate failed to get a | 
|  | 2807 | * transaction handle at all, we need to clean up the in-core | 
|  | 2808 | * orphan list manually. */ | 
|  | 2809 | if (inode->i_nlink) | 
|  | 2810 | ext3_orphan_del(NULL, inode); | 
|  | 2811 |  | 
|  | 2812 | if (!rc && (ia_valid & ATTR_MODE)) | 
|  | 2813 | rc = ext3_acl_chmod(inode); | 
|  | 2814 |  | 
|  | 2815 | err_out: | 
|  | 2816 | ext3_std_error(inode->i_sb, error); | 
|  | 2817 | if (!error) | 
|  | 2818 | error = rc; | 
|  | 2819 | return error; | 
|  | 2820 | } | 
|  | 2821 |  | 
|  | 2822 |  | 
|  | 2823 | /* | 
|  | 2824 | * akpm: how many blocks doth make a writepage()? | 
|  | 2825 | * | 
|  | 2826 | * With N blocks per page, it may be: | 
|  | 2827 | * N data blocks | 
|  | 2828 | * 2 indirect blocks | 
|  | 2829 | * 2 dindirect blocks | 
|  | 2830 | * 1 tindirect block | 
|  | 2831 | * N+5 bitmap blocks (from the above) | 
|  | 2832 | * N+5 group descriptor summary blocks | 
|  | 2833 | * 1 inode block | 
|  | 2834 | * 1 superblock. | 
|  | 2835 | * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files | 
|  | 2836 | * | 
|  | 2837 | * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS | 
|  | 2838 | * | 
|  | 2839 | * With ordered or writeback data it's the same, less the N data blocks. | 
|  | 2840 | * | 
|  | 2841 | * If the inode's direct blocks can hold an integral number of pages then a | 
|  | 2842 | * page cannot straddle two indirect blocks, and we can only touch one indirect | 
|  | 2843 | * and dindirect block, and the "5" above becomes "3". | 
|  | 2844 | * | 
|  | 2845 | * This still overestimates under most circumstances.  If we were to pass the | 
|  | 2846 | * start and end offsets in here as well we could do block_to_path() on each | 
|  | 2847 | * block and work out the exact number of indirects which are touched.  Pah. | 
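|  |  | * | 
|  |  | * Worked example (illustrative): with 4K pages and 1K blocks, N = 4. | 
|  |  | * Since the 12 direct blocks then hold an integral number of pages, | 
|  |  | * the "5" becomes "3", and full data journalling reserves | 
|  |  | * 3 * (4 + 3) + 2 = 23 blocks, plus the quota blocks added below. | 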
|  | 2848 | */ | 
|  | 2849 |  | 
|  | 2850 | static int ext3_writepage_trans_blocks(struct inode *inode) | 
|  | 2851 | { | 
|  | 2852 | int bpp = ext3_journal_blocks_per_page(inode); | 
|  | 2853 | int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3; | 
|  | 2854 | int ret; | 
|  | 2855 |  | 
|  | 2856 | if (ext3_should_journal_data(inode)) | 
|  | 2857 | ret = 3 * (bpp + indirects) + 2; | 
|  | 2858 | else | 
|  | 2859 | ret = 2 * (bpp + indirects) + 2; | 
|  | 2860 |  | 
|  | 2861 | #ifdef CONFIG_QUOTA | 
|  | 2862 | /* We know that structure was already allocated during DQUOT_INIT so | 
|  | 2863 | * we will be updating only the data blocks + inodes */ | 
|  | 2864 | ret += 2*EXT3_QUOTA_TRANS_BLOCKS; | 
|  | 2865 | #endif | 
|  | 2866 |  | 
|  | 2867 | return ret; | 
|  | 2868 | } | 
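|  |  |  | 
|  |  | /* | 
|  |  | * Callers typically feed the result straight into ext3_journal_start(); | 
|  |  | * a sketch of the usual writepage-side pattern (illustrative): | 
|  |  | * | 
|  |  | *	handle = ext3_journal_start(inode, | 
|  |  | *			ext3_writepage_trans_blocks(inode)); | 
|  |  | *	if (IS_ERR(handle)) | 
|  |  | *		return PTR_ERR(handle); | 
|  |  | */ | 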
|  | 2869 |  | 
|  | 2870 | /* | 
|  | 2871 | * The caller must have previously called ext3_reserve_inode_write(). | 
|  | 2872 | * Given this, we know that the caller already has write access to iloc->bh. | 
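|  |  | * | 
|  |  | * Typical usage (cf. ext3_mark_inode_dirty() below): | 
|  |  | * | 
|  |  | *	struct ext3_iloc iloc; | 
|  |  | * | 
|  |  | *	err = ext3_reserve_inode_write(handle, inode, &iloc); | 
|  |  | *	if (!err) | 
|  |  | *		err = ext3_mark_iloc_dirty(handle, inode, &iloc); | 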
|  | 2873 | */ | 
|  | 2874 | int ext3_mark_iloc_dirty(handle_t *handle, | 
|  | 2875 | struct inode *inode, struct ext3_iloc *iloc) | 
|  | 2876 | { | 
|  | 2877 | int err = 0; | 
|  | 2878 |  | 
|  | 2879 | /* the do_update_inode consumes one bh->b_count */ | 
|  | 2880 | get_bh(iloc->bh); | 
|  | 2881 |  | 
|  | 2882 | /* ext3_do_update_inode() does journal_dirty_metadata */ | 
|  | 2883 | err = ext3_do_update_inode(handle, inode, iloc); | 
|  | 2884 | put_bh(iloc->bh); | 
|  | 2885 | return err; | 
|  | 2886 | } | 
|  | 2887 |  | 
|  | 2888 | /* | 
|  | 2889 | * On success, we end up with an outstanding reference count against | 
|  | 2890 | * iloc->bh.  This _must_ be cleaned up later. | 
|  | 2891 | */ | 
|  | 2892 |  | 
|  | 2893 | int | 
|  | 2894 | ext3_reserve_inode_write(handle_t *handle, struct inode *inode, | 
|  | 2895 | struct ext3_iloc *iloc) | 
|  | 2896 | { | 
|  | 2897 | int err = 0; | 
|  | 2898 | if (handle) { | 
|  | 2899 | err = ext3_get_inode_loc(inode, iloc); | 
|  | 2900 | if (!err) { | 
|  | 2901 | BUFFER_TRACE(iloc->bh, "get_write_access"); | 
|  | 2902 | err = ext3_journal_get_write_access(handle, iloc->bh); | 
|  | 2903 | if (err) { | 
|  | 2904 | brelse(iloc->bh); | 
|  | 2905 | iloc->bh = NULL; | 
|  | 2906 | } | 
|  | 2907 | } | 
|  | 2908 | } | 
|  | 2909 | ext3_std_error(inode->i_sb, err); | 
|  | 2910 | return err; | 
|  | 2911 | } | 
|  | 2912 |  | 
|  | 2913 | /* | 
|  | 2914 | * akpm: What we do here is to mark the in-core inode as clean | 
|  | 2915 | * with respect to inode dirtiness (it may still be data-dirty). | 
|  | 2916 | * This means that the in-core inode may be reaped by prune_icache | 
|  | 2917 | * without having to perform any I/O.  This is a very good thing, | 
|  | 2918 | * because *any* task may call prune_icache - even ones which | 
|  | 2919 | * have a transaction open against a different journal. | 
|  | 2920 | * | 
|  | 2921 | * Is this cheating?  Not really.  Sure, we haven't written the | 
|  | 2922 | * inode out, but prune_icache isn't a user-visible syncing function. | 
|  | 2923 | * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) | 
|  | 2924 | * we start and wait on commits. | 
|  | 2925 | * | 
|  | 2926 | * Is this efficient/effective?  Well, we're being nice to the system | 
|  | 2927 | * by cleaning up our inodes proactively so they can be reaped | 
|  | 2928 | * without I/O.  But we are potentially leaving up to five seconds' | 
|  | 2929 | * worth of inodes floating about which prune_icache wants us to | 
|  | 2930 | * write out.  One way to fix that would be to get prune_icache() | 
|  | 2931 | * to do a write_super() to free up some memory.  It has the desired | 
|  | 2932 | * effect. | 
|  | 2933 | */ | 
|  | 2934 | int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode) | 
|  | 2935 | { | 
|  | 2936 | struct ext3_iloc iloc; | 
|  | 2937 | int err; | 
|  | 2938 |  | 
|  | 2939 | might_sleep(); | 
|  | 2940 | err = ext3_reserve_inode_write(handle, inode, &iloc); | 
|  | 2941 | if (!err) | 
|  | 2942 | err = ext3_mark_iloc_dirty(handle, inode, &iloc); | 
|  | 2943 | return err; | 
|  | 2944 | } | 
|  | 2945 |  | 
|  | 2946 | /* | 
|  | 2947 | * akpm: ext3_dirty_inode() is called from __mark_inode_dirty() | 
|  | 2948 | * | 
|  | 2949 | * We're really interested in the case where a file is being extended. | 
|  | 2950 | * i_size has been changed by generic_commit_write() and we thus need | 
|  | 2951 | * to include the updated inode in the current transaction. | 
|  | 2952 | * | 
|  | 2953 | * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks | 
|  | 2954 | * are allocated to the file. | 
|  | 2955 | * | 
|  | 2956 | * If the inode is marked synchronous, we don't honour that here - doing | 
|  | 2957 | * so would cause a commit on atime updates, which we don't bother doing. | 
|  | 2958 | * We handle synchronous inodes at the highest possible level. | 
|  | 2959 | */ | 
|  | 2960 | void ext3_dirty_inode(struct inode *inode) | 
|  | 2961 | { | 
|  | 2962 | handle_t *current_handle = ext3_journal_current_handle(); | 
|  | 2963 | handle_t *handle; | 
|  | 2964 |  | 
|  | 2965 | handle = ext3_journal_start(inode, 2); | 
|  | 2966 | if (IS_ERR(handle)) | 
|  | 2967 | goto out; | 
|  | 2968 | if (current_handle && | 
|  | 2969 | current_handle->h_transaction != handle->h_transaction) { | 
|  | 2970 | /* This task has a transaction open against a different fs */ | 
|  | 2971 | printk(KERN_EMERG "%s: transactions do not match!\n", | 
|  | 2972 | __FUNCTION__); | 
|  | 2973 | } else { | 
|  | 2974 | jbd_debug(5, "marking dirty.  outer handle=%p\n", | 
|  | 2975 | current_handle); | 
|  | 2976 | ext3_mark_inode_dirty(handle, inode); | 
|  | 2977 | } | 
|  | 2978 | ext3_journal_stop(handle); | 
|  | 2979 | out: | 
|  | 2980 | return; | 
|  | 2981 | } | 
|  | 2982 |  | 
|  | 2983 | #ifdef AKPM | 
|  | 2984 | /* | 
|  | 2985 | * Bind an inode's backing buffer_head into this transaction, to prevent | 
|  | 2986 | * it from being flushed to disk early.  Unlike | 
|  | 2987 | * ext3_reserve_inode_write, this leaves behind no bh reference and | 
|  | 2988 | * returns no iloc structure, so the caller needs to repeat the iloc | 
|  | 2989 | * lookup to mark the inode dirty later. | 
|  | 2990 | */ | 
|  | 2991 | static inline int | 
|  | 2992 | ext3_pin_inode(handle_t *handle, struct inode *inode) | 
|  | 2993 | { | 
|  | 2994 | struct ext3_iloc iloc; | 
|  | 2995 |  | 
|  | 2996 | int err = 0; | 
|  | 2997 | if (handle) { | 
|  | 2998 | err = ext3_get_inode_loc(inode, &iloc); | 
|  | 2999 | if (!err) { | 
|  | 3000 | BUFFER_TRACE(iloc.bh, "get_write_access"); | 
|  | 3001 | err = journal_get_write_access(handle, iloc.bh); | 
|  | 3002 | if (!err) | 
|  | 3003 | err = ext3_journal_dirty_metadata(handle, | 
|  | 3004 | iloc.bh); | 
|  | 3005 | brelse(iloc.bh); | 
|  | 3006 | } | 
|  | 3007 | } | 
|  | 3008 | ext3_std_error(inode->i_sb, err); | 
|  | 3009 | return err; | 
|  | 3010 | } | 
|  | 3011 | #endif | 
|  | 3012 |  | 
|  | 3013 | int ext3_change_inode_journal_flag(struct inode *inode, int val) | 
|  | 3014 | { | 
|  | 3015 | journal_t *journal; | 
|  | 3016 | handle_t *handle; | 
|  | 3017 | int err; | 
|  | 3018 |  | 
|  | 3019 | /* | 
|  | 3020 | * We have to be very careful here: changing a data block's | 
|  | 3021 | * journaling status dynamically is dangerous.  If we write a | 
|  | 3022 | * data block to the journal, change the status and then delete | 
|  | 3023 | * that block, we risk forgetting to revoke the old log record | 
|  | 3024 | * from the journal and so a subsequent replay can corrupt data. | 
|  | 3025 | * So, first we make sure that the journal is empty and that | 
|  | 3026 | * nobody is changing anything. | 
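|  |  | * | 
|  |  | * Hence the sequence below: quiesce and flush the journal, flip | 
|  |  | * EXT3_JOURNAL_DATA_FL and swap the address_space operations while | 
|  |  | * updates are locked out, then commit the flag change synchronously. | 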
|  | 3027 | */ | 
|  | 3028 |  | 
|  | 3029 | journal = EXT3_JOURNAL(inode); | 
|  | 3030 | if (is_journal_aborted(journal) || IS_RDONLY(inode)) | 
|  | 3031 | return -EROFS; | 
|  | 3032 |  | 
|  | 3033 | journal_lock_updates(journal); | 
|  | 3034 | journal_flush(journal); | 
|  | 3035 |  | 
|  | 3036 | /* | 
|  | 3037 | * OK, there are no updates running now, and all cached data is | 
|  | 3038 | * synced to disk.  We are now in a completely consistent state | 
|  | 3039 | * which doesn't have anything in the journal, and we know that | 
|  | 3040 | * no filesystem updates are running, so it is safe to modify | 
|  | 3041 | * the inode's in-core data-journaling state flag now. | 
|  | 3042 | */ | 
|  | 3043 |  | 
|  | 3044 | if (val) | 
|  | 3045 | EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL; | 
|  | 3046 | else | 
|  | 3047 | EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL; | 
|  | 3048 | ext3_set_aops(inode); | 
|  | 3049 |  | 
|  | 3050 | journal_unlock_updates(journal); | 
|  | 3051 |  | 
|  | 3052 | /* Finally we can mark the inode as dirty. */ | 
|  | 3053 |  | 
|  | 3054 | handle = ext3_journal_start(inode, 1); | 
|  | 3055 | if (IS_ERR(handle)) | 
|  | 3056 | return PTR_ERR(handle); | 
|  | 3057 |  | 
|  | 3058 | err = ext3_mark_inode_dirty(handle, inode); | 
|  | 3059 | handle->h_sync = 1; | 
|  | 3060 | ext3_journal_stop(handle); | 
|  | 3061 | ext3_std_error(inode->i_sb, err); | 
|  | 3062 |  | 
|  | 3063 | return err; | 
|  | 3064 | } |