Blame - fs/btrfs/tree-log.c - android_kernel_oneplus_sm8150

blob: 33eee256ee818ab52fe7189ffa2d03f20e79cc63 [file] [log] [blame]

Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include "ctree.h"
				21	#include "transaction.h"
				22	#include "disk-io.h"
				23	#include "locking.h"
				24	#include "print-tree.h"
				25	#include "compat.h"
Christoph Hellwig	b295086	2008-12-02 09:54:17 -0500	[diff] [blame]	26	#include "tree-log.h"
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	27
				28	/* magic values for the inode_only field in btrfs_log_inode:
				29	*
				30	* LOG_INODE_ALL means to log everything
				31	* LOG_INODE_EXISTS means to log just enough to recreate the inode
				32	* during log replay
				33	*/
				34	#define LOG_INODE_ALL 0
				35	#define LOG_INODE_EXISTS 1
				36
				37	/*
				38	* stages for the tree walking. The first
				39	* stage (0) is to only pin down the blocks we find
				40	* the second stage (1) is to make sure that all the inodes
				41	* we find in the log are created in the subvolume.
				42	*
				43	* The last stage is to deal with directories and links and extents
				44	* and all the other fun semantics
				45	*/
				46	#define LOG_WALK_PIN_ONLY 0
				47	#define LOG_WALK_REPLAY_INODES 1
				48	#define LOG_WALK_REPLAY_ALL 2
				49
				50	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				51	struct btrfs_root root, struct inode inode,
				52	int inode_only);
				53
				54	/*
				55	* tree logging is a special write ahead log used to make sure that
				56	* fsyncs and O_SYNCs can happen without doing full tree commits.
				57	*
				58	* Full tree commits are expensive because they require commonly
				59	* modified blocks to be recowed, creating many dirty pages in the
				60	* extent tree and a 4x-6x higher write load than ext3.
				61	*
				62	* Instead of doing a tree commit on every fsync, we use the
				63	* key ranges and transaction ids to find items for a given file or directory
				64	* that have changed in this transaction. Those items are copied into
				65	* a special tree (one per subvolume root), that tree is written to disk
				66	* and then the fsync is considered complete.
				67	*
				68	* After a crash, items are copied out of the log-tree back into the
				69	* subvolume tree. Any file data extents found are recorded in the extent
				70	* allocation tree, and the log-tree freed.
				71	*
				72	* The log tree is read three times: once to pin down all the extents it
				73	* is using in ram, once to create all the inodes logged in the tree,
				74	* and once to do all the other items.
				75	*/
				76
				77	/*
				78	* btrfs_add_log_tree adds a new per-subvolume log tree into the
				79	* tree of log tree roots. This must be called with a tree log transaction
				80	* running (see start_log_trans).
				81	*/
Christoph Hellwig	b295086	2008-12-02 09:54:17 -0500	[diff] [blame]	82	static int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	83	struct btrfs_root *root)
				84	{
				85	struct btrfs_key key;
				86	struct btrfs_root_item root_item;
				87	struct btrfs_inode_item *inode_item;
				88	struct extent_buffer *leaf;
				89	struct btrfs_root *new_root = root;
				90	int ret;
				91	u64 objectid = root->root_key.objectid;
				92
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	93	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	94	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	95	trans->transid, 0, 0, 0);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	96	if (IS_ERR(leaf)) {
				97	ret = PTR_ERR(leaf);
				98	return ret;
				99	}
				100
				101	btrfs_set_header_nritems(leaf, 0);
				102	btrfs_set_header_level(leaf, 0);
				103	btrfs_set_header_bytenr(leaf, leaf->start);
				104	btrfs_set_header_generation(leaf, trans->transid);
				105	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
				106
				107	write_extent_buffer(leaf, root->fs_info->fsid,
				108	(unsigned long)btrfs_header_fsid(leaf),
				109	BTRFS_FSID_SIZE);
				110	btrfs_mark_buffer_dirty(leaf);
				111
				112	inode_item = &root_item.inode;
				113	memset(inode_item, 0, sizeof(*inode_item));
				114	inode_item->generation = cpu_to_le64(1);
				115	inode_item->size = cpu_to_le64(3);
				116	inode_item->nlink = cpu_to_le32(1);
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	117	inode_item->nbytes = cpu_to_le64(root->leafsize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	118	inode_item->mode = cpu_to_le32(S_IFDIR \| 0755);
				119
				120	btrfs_set_root_bytenr(&root_item, leaf->start);
Yan Zheng	84234f3	2008-10-29 14:49:05 -0400	[diff] [blame]	121	btrfs_set_root_generation(&root_item, trans->transid);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	122	btrfs_set_root_level(&root_item, 0);
				123	btrfs_set_root_refs(&root_item, 0);
				124	btrfs_set_root_used(&root_item, 0);
				125
				126	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
				127	root_item.drop_level = 0;
				128
				129	btrfs_tree_unlock(leaf);
				130	free_extent_buffer(leaf);
				131	leaf = NULL;
				132
				133	btrfs_set_root_dirid(&root_item, 0);
				134
				135	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				136	key.offset = objectid;
				137	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				138	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
				139	&root_item);
				140	if (ret)
				141	goto fail;
				142
				143	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
				144	&key);
				145	BUG_ON(!new_root);
				146
				147	WARN_ON(root->log_root);
				148	root->log_root = new_root;
				149
				150	/*
				151	* log trees do not get reference counted because they go away
				152	* before a real commit is actually done. They do store pointers
				153	* to file data extents, and those reference counts still get
				154	* updated (along with back refs to the log tree).
				155	*/
				156	new_root->ref_cows = 0;
				157	new_root->last_trans = trans->transid;
				158	fail:
				159	return ret;
				160	}
				161
				162	/*
				163	* start a sub transaction and setup the log tree
				164	* this increments the log tree writer count to make the people
				165	* syncing the tree wait for us to finish
				166	*/
				167	static int start_log_trans(struct btrfs_trans_handle *trans,
				168	struct btrfs_root *root)
				169	{
				170	int ret;
				171	mutex_lock(&root->fs_info->tree_log_mutex);
				172	if (!root->fs_info->log_root_tree) {
				173	ret = btrfs_init_log_root_tree(trans, root->fs_info);
				174	BUG_ON(ret);
				175	}
				176	if (!root->log_root) {
				177	ret = btrfs_add_log_tree(trans, root);
				178	BUG_ON(ret);
				179	}
				180	atomic_inc(&root->fs_info->tree_log_writers);
				181	root->fs_info->tree_log_batch++;
				182	mutex_unlock(&root->fs_info->tree_log_mutex);
				183	return 0;
				184	}
				185
				186	/*
				187	* returns 0 if there was a log transaction running and we were able
				188	* to join, or returns -ENOENT if there were not transactions
				189	* in progress
				190	*/
				191	static int join_running_log_trans(struct btrfs_root *root)
				192	{
				193	int ret = -ENOENT;
				194
				195	smp_mb();
				196	if (!root->log_root)
				197	return -ENOENT;
				198
				199	mutex_lock(&root->fs_info->tree_log_mutex);
				200	if (root->log_root) {
				201	ret = 0;
				202	atomic_inc(&root->fs_info->tree_log_writers);
				203	root->fs_info->tree_log_batch++;
				204	}
				205	mutex_unlock(&root->fs_info->tree_log_mutex);
				206	return ret;
				207	}
				208
				209	/*
				210	* indicate we're done making changes to the log tree
				211	* and wake up anyone waiting to do a sync
				212	*/
				213	static int end_log_trans(struct btrfs_root *root)
				214	{
				215	atomic_dec(&root->fs_info->tree_log_writers);
				216	smp_mb();
				217	if (waitqueue_active(&root->fs_info->tree_log_wait))
				218	wake_up(&root->fs_info->tree_log_wait);
				219	return 0;
				220	}
				221
				222
				223	/*
				224	* the walk control struct is used to pass state down the chain when
				225	* processing the log tree. The stage field tells us which part
				226	* of the log tree processing we are currently doing. The others
				227	* are state fields used for that specific part
				228	*/
				229	struct walk_control {
				230	/* should we free the extent on disk when done? This is used
				231	* at transaction commit time while freeing a log tree
				232	*/
				233	int free;
				234
				235	/* should we write out the extent buffer? This is used
				236	* while flushing the log tree to disk during a sync
				237	*/
				238	int write;
				239
				240	/* should we wait for the extent buffer io to finish? Also used
				241	* while flushing the log tree to disk for a sync
				242	*/
				243	int wait;
				244
				245	/* pin only walk, we record which extents on disk belong to the
				246	* log trees
				247	*/
				248	int pin;
				249
				250	/* what stage of the replay code we're currently in */
				251	int stage;
				252
				253	/* the root we are currently replaying */
				254	struct btrfs_root *replay_dest;
				255
				256	/* the trans handle for the current replay */
				257	struct btrfs_trans_handle *trans;
				258
				259	/* the function that gets used to process blocks we find in the
				260	* tree. Note the extent_buffer might not be up to date when it is
				261	* passed in, and it must be checked or read if you need the data
				262	* inside it
				263	*/
				264	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				265	struct walk_control *wc, u64 gen);
				266	};
				267
				268	/*
				269	* process_func used to pin down extents, write them or wait on them
				270	*/
				271	static int process_one_buffer(struct btrfs_root *log,
				272	struct extent_buffer *eb,
				273	struct walk_control *wc, u64 gen)
				274	{
				275	if (wc->pin) {
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	276	mutex_lock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	277	btrfs_update_pinned_extents(log->fs_info->extent_root,
				278	eb->start, eb->len, 1);
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	279	mutex_unlock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	280	}
				281
				282	if (btrfs_buffer_uptodate(eb, gen)) {
				283	if (wc->write)
				284	btrfs_write_tree_block(eb);
				285	if (wc->wait)
				286	btrfs_wait_tree_block_writeback(eb);
				287	}
				288	return 0;
				289	}
				290
				291	/*
				292	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				293	* to the src data we are copying out.
				294	*
				295	* root is the tree we are copying into, and path is a scratch
				296	* path for use in this function (it should be released on entry and
				297	* will be released on exit).
				298	*
				299	* If the key is already in the destination tree the existing item is
				300	* overwritten. If the existing item isn't big enough, it is extended.
				301	* If it is too large, it is truncated.
				302	*
				303	* If the key isn't in the destination yet, a new item is inserted.
				304	*/
				305	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				306	struct btrfs_root *root,
				307	struct btrfs_path *path,
				308	struct extent_buffer *eb, int slot,
				309	struct btrfs_key *key)
				310	{
				311	int ret;
				312	u32 item_size;
				313	u64 saved_i_size = 0;
				314	int save_old_i_size = 0;
				315	unsigned long src_ptr;
				316	unsigned long dst_ptr;
				317	int overwrite_root = 0;
				318
				319	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				320	overwrite_root = 1;
				321
				322	item_size = btrfs_item_size_nr(eb, slot);
				323	src_ptr = btrfs_item_ptr_offset(eb, slot);
				324
				325	/* look for the key in the destination tree */
				326	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				327	if (ret == 0) {
				328	char *src_copy;
				329	char *dst_copy;
				330	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				331	path->slots[0]);
				332	if (dst_size != item_size)
				333	goto insert;
				334
				335	if (item_size == 0) {
				336	btrfs_release_path(root, path);
				337	return 0;
				338	}
				339	dst_copy = kmalloc(item_size, GFP_NOFS);
				340	src_copy = kmalloc(item_size, GFP_NOFS);
				341
				342	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				343
				344	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				345	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				346	item_size);
				347	ret = memcmp(dst_copy, src_copy, item_size);
				348
				349	kfree(dst_copy);
				350	kfree(src_copy);
				351	/*
				352	* they have the same contents, just return, this saves
				353	* us from cowing blocks in the destination tree and doing
				354	* extra writes that may not have been done by a previous
				355	* sync
				356	*/
				357	if (ret == 0) {
				358	btrfs_release_path(root, path);
				359	return 0;
				360	}
				361
				362	}
				363	insert:
				364	btrfs_release_path(root, path);
				365	/* try to insert the key into the destination tree */
				366	ret = btrfs_insert_empty_item(trans, root, path,
				367	key, item_size);
				368
				369	/* make sure any existing item is the correct size */
				370	if (ret == -EEXIST) {
				371	u32 found_size;
				372	found_size = btrfs_item_size_nr(path->nodes[0],
				373	path->slots[0]);
				374	if (found_size > item_size) {
				375	btrfs_truncate_item(trans, root, path, item_size, 1);
				376	} else if (found_size < item_size) {
Yan Zheng	87b29b2	2008-12-17 10:21:48 -0500	[diff] [blame^]	377	ret = btrfs_extend_item(trans, root, path,
				378	item_size - found_size);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	379	BUG_ON(ret);
				380	}
				381	} else if (ret) {
				382	BUG();
				383	}
				384	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				385	path->slots[0]);
				386
				387	/* don't overwrite an existing inode if the generation number
				388	* was logged as zero. This is done when the tree logging code
				389	* is just logging an inode to make sure it exists after recovery.
				390	*
				391	* Also, don't overwrite i_size on directories during replay.
				392	* log replay inserts and removes directory items based on the
				393	* state of the tree found in the subvolume, and i_size is modified
				394	* as it goes
				395	*/
				396	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				397	struct btrfs_inode_item *src_item;
				398	struct btrfs_inode_item *dst_item;
				399
				400	src_item = (struct btrfs_inode_item *)src_ptr;
				401	dst_item = (struct btrfs_inode_item *)dst_ptr;
				402
				403	if (btrfs_inode_generation(eb, src_item) == 0)
				404	goto no_copy;
				405
				406	if (overwrite_root &&
				407	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				408	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				409	save_old_i_size = 1;
				410	saved_i_size = btrfs_inode_size(path->nodes[0],
				411	dst_item);
				412	}
				413	}
				414
				415	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				416	src_ptr, item_size);
				417
				418	if (save_old_i_size) {
				419	struct btrfs_inode_item *dst_item;
				420	dst_item = (struct btrfs_inode_item *)dst_ptr;
				421	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				422	}
				423
				424	/* make sure the generation is filled in */
				425	if (key->type == BTRFS_INODE_ITEM_KEY) {
				426	struct btrfs_inode_item *dst_item;
				427	dst_item = (struct btrfs_inode_item *)dst_ptr;
				428	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				429	btrfs_set_inode_generation(path->nodes[0], dst_item,
				430	trans->transid);
				431	}
				432	}
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	433
				434	if (overwrite_root &&
				435	key->type == BTRFS_EXTENT_DATA_KEY) {
				436	int extent_type;
				437	struct btrfs_file_extent_item *fi;
				438
				439	fi = (struct btrfs_file_extent_item *)dst_ptr;
				440	extent_type = btrfs_file_extent_type(path->nodes[0], fi);
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	441	if (extent_type == BTRFS_FILE_EXTENT_REG \|\|
				442	extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	443	struct btrfs_key ins;
				444	ins.objectid = btrfs_file_extent_disk_bytenr(
				445	path->nodes[0], fi);
				446	ins.offset = btrfs_file_extent_disk_num_bytes(
				447	path->nodes[0], fi);
				448	ins.type = BTRFS_EXTENT_ITEM_KEY;
				449
				450	/*
				451	* is this extent already allocated in the extent
				452	* allocation tree? If so, just add a reference
				453	*/
				454	ret = btrfs_lookup_extent(root, ins.objectid,
				455	ins.offset);
				456	if (ret == 0) {
				457	ret = btrfs_inc_extent_ref(trans, root,
				458	ins.objectid, ins.offset,
				459	path->nodes[0]->start,
				460	root->root_key.objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	461	trans->transid, key->objectid);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	462	} else {
				463	/*
				464	* insert the extent pointer in the extent
				465	* allocation tree
				466	*/
				467	ret = btrfs_alloc_logged_extent(trans, root,
				468	path->nodes[0]->start,
				469	root->root_key.objectid,
				470	trans->transid, key->objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	471	&ins);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	472	BUG_ON(ret);
				473	}
				474	}
				475	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	476	no_copy:
				477	btrfs_mark_buffer_dirty(path->nodes[0]);
				478	btrfs_release_path(root, path);
				479	return 0;
				480	}
				481
				482	/*
				483	* simple helper to read an inode off the disk from a given root
				484	* This can only be called for subvolume roots and not for the log
				485	*/
				486	static noinline struct inode read_one_inode(struct btrfs_root root,
				487	u64 objectid)
				488	{
				489	struct inode *inode;
				490	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
				491	if (inode->i_state & I_NEW) {
				492	BTRFS_I(inode)->root = root;
				493	BTRFS_I(inode)->location.objectid = objectid;
				494	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
				495	BTRFS_I(inode)->location.offset = 0;
				496	btrfs_read_locked_inode(inode);
				497	unlock_new_inode(inode);
				498
				499	}
				500	if (is_bad_inode(inode)) {
				501	iput(inode);
				502	inode = NULL;
				503	}
				504	return inode;
				505	}
				506
				507	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				508	* subvolume 'root'. path is released on entry and should be released
				509	* on exit.
				510	*
				511	* extents in the log tree have not been allocated out of the extent
				512	* tree yet. So, this completes the allocation, taking a reference
				513	* as required if the extent already exists or creating a new extent
				514	* if it isn't in the extent allocation tree yet.
				515	*
				516	* The extent is inserted into the file, dropping any existing extents
				517	* from the file that overlap the new one.
				518	*/
				519	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				520	struct btrfs_root *root,
				521	struct btrfs_path *path,
				522	struct extent_buffer *eb, int slot,
				523	struct btrfs_key *key)
				524	{
				525	int found_type;
				526	u64 mask = root->sectorsize - 1;
				527	u64 extent_end;
				528	u64 alloc_hint;
				529	u64 start = key->offset;
				530	struct btrfs_file_extent_item *item;
				531	struct inode *inode = NULL;
				532	unsigned long size;
				533	int ret = 0;
				534
				535	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				536	found_type = btrfs_file_extent_type(eb, item);
				537
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	538	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				539	found_type == BTRFS_FILE_EXTENT_PREALLOC)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	540	extent_end = start + btrfs_file_extent_num_bytes(eb, item);
				541	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
Chris Mason	c8b9781	2008-10-29 14:49:59 -0400	[diff] [blame]	542	size = btrfs_file_extent_inline_len(eb, item);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	543	extent_end = (start + size + mask) & ~mask;
				544	} else {
				545	ret = 0;
				546	goto out;
				547	}
				548
				549	inode = read_one_inode(root, key->objectid);
				550	if (!inode) {
				551	ret = -EIO;
				552	goto out;
				553	}
				554
				555	/*
				556	* first check to see if we already have this extent in the
				557	* file. This must be done before the btrfs_drop_extents run
				558	* so we don't try to drop this extent.
				559	*/
				560	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				561	start, 0);
				562
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	563	if (ret == 0 &&
				564	(found_type == BTRFS_FILE_EXTENT_REG \|\|
				565	found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	566	struct btrfs_file_extent_item cmp1;
				567	struct btrfs_file_extent_item cmp2;
				568	struct btrfs_file_extent_item *existing;
				569	struct extent_buffer *leaf;
				570
				571	leaf = path->nodes[0];
				572	existing = btrfs_item_ptr(leaf, path->slots[0],
				573	struct btrfs_file_extent_item);
				574
				575	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				576	sizeof(cmp1));
				577	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				578	sizeof(cmp2));
				579
				580	/*
				581	* we already have a pointer to this exact extent,
				582	* we don't have to do anything
				583	*/
				584	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				585	btrfs_release_path(root, path);
				586	goto out;
				587	}
				588	}
				589	btrfs_release_path(root, path);
				590
				591	/* drop any overlapping extents */
				592	ret = btrfs_drop_extents(trans, root, inode,
				593	start, extent_end, start, &alloc_hint);
				594	BUG_ON(ret);
				595
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	596	/* insert the extent */
				597	ret = overwrite_item(trans, root, path, eb, slot, key);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	598	BUG_ON(ret);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	599
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	600	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
				601	inode_add_bytes(inode, extent_end - start);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	602	btrfs_update_inode(trans, root, inode);
				603	out:
				604	if (inode)
				605	iput(inode);
				606	return ret;
				607	}
				608
				609	/*
				610	* when cleaning up conflicts between the directory names in the
				611	* subvolume, directory names in the log and directory names in the
				612	* inode back references, we may have to unlink inodes from directories.
				613	*
				614	* This is a helper function to do the unlink of a specific directory
				615	* item
				616	*/
				617	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				618	struct btrfs_root *root,
				619	struct btrfs_path *path,
				620	struct inode *dir,
				621	struct btrfs_dir_item *di)
				622	{
				623	struct inode *inode;
				624	char *name;
				625	int name_len;
				626	struct extent_buffer *leaf;
				627	struct btrfs_key location;
				628	int ret;
				629
				630	leaf = path->nodes[0];
				631
				632	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				633	name_len = btrfs_dir_name_len(leaf, di);
				634	name = kmalloc(name_len, GFP_NOFS);
				635	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				636	btrfs_release_path(root, path);
				637
				638	inode = read_one_inode(root, location.objectid);
				639	BUG_ON(!inode);
				640
				641	btrfs_inc_nlink(inode);
				642	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
				643	kfree(name);
				644
				645	iput(inode);
				646	return ret;
				647	}
				648
				649	/*
				650	* helper function to see if a given name and sequence number found
				651	* in an inode back reference are already in a directory and correctly
				652	* point to this inode
				653	*/
				654	static noinline int inode_in_dir(struct btrfs_root *root,
				655	struct btrfs_path *path,
				656	u64 dirid, u64 objectid, u64 index,
				657	const char *name, int name_len)
				658	{
				659	struct btrfs_dir_item *di;
				660	struct btrfs_key location;
				661	int match = 0;
				662
				663	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				664	index, name, name_len, 0);
				665	if (di && !IS_ERR(di)) {
				666	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				667	if (location.objectid != objectid)
				668	goto out;
				669	} else
				670	goto out;
				671	btrfs_release_path(root, path);
				672
				673	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				674	if (di && !IS_ERR(di)) {
				675	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				676	if (location.objectid != objectid)
				677	goto out;
				678	} else
				679	goto out;
				680	match = 1;
				681	out:
				682	btrfs_release_path(root, path);
				683	return match;
				684	}
				685
				686	/*
				687	* helper function to check a log tree for a named back reference in
				688	* an inode. This is used to decide if a back reference that is
				689	* found in the subvolume conflicts with what we find in the log.
				690	*
				691	* inode backreferences may have multiple refs in a single item,
				692	* during replay we process one reference at a time, and we don't
				693	* want to delete valid links to a file from the subvolume if that
				694	* link is also in the log.
				695	*/
				696	static noinline int backref_in_log(struct btrfs_root *log,
				697	struct btrfs_key *key,
				698	char *name, int namelen)
				699	{
				700	struct btrfs_path *path;
				701	struct btrfs_inode_ref *ref;
				702	unsigned long ptr;
				703	unsigned long ptr_end;
				704	unsigned long name_ptr;
				705	int found_name_len;
				706	int item_size;
				707	int ret;
				708	int match = 0;
				709
				710	path = btrfs_alloc_path();
				711	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				712	if (ret != 0)
				713	goto out;
				714
				715	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				716	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				717	ptr_end = ptr + item_size;
				718	while (ptr < ptr_end) {
				719	ref = (struct btrfs_inode_ref *)ptr;
				720	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				721	if (found_name_len == namelen) {
				722	name_ptr = (unsigned long)(ref + 1);
				723	ret = memcmp_extent_buffer(path->nodes[0], name,
				724	name_ptr, namelen);
				725	if (ret == 0) {
				726	match = 1;
				727	goto out;
				728	}
				729	}
				730	ptr = (unsigned long)(ref + 1) + found_name_len;
				731	}
				732	out:
				733	btrfs_free_path(path);
				734	return match;
				735	}
				736
				737
				738	/*
				739	* replay one inode back reference item found in the log tree.
				740	* eb, slot and key refer to the buffer and key found in the log tree.
				741	* root is the destination we are replaying into, and path is for temp
				742	* use by this function. (it should be released on return).
				743	*/
				744	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				745	struct btrfs_root *root,
				746	struct btrfs_root *log,
				747	struct btrfs_path *path,
				748	struct extent_buffer *eb, int slot,
				749	struct btrfs_key *key)
				750	{
				751	struct inode *dir;
				752	int ret;
				753	struct btrfs_key location;
				754	struct btrfs_inode_ref *ref;
				755	struct btrfs_dir_item *di;
				756	struct inode *inode;
				757	char *name;
				758	int namelen;
				759	unsigned long ref_ptr;
				760	unsigned long ref_end;
				761
				762	location.objectid = key->objectid;
				763	location.type = BTRFS_INODE_ITEM_KEY;
				764	location.offset = 0;
				765
				766	/*
				767	* it is possible that we didn't log all the parent directories
				768	* for a given inode. If we don't find the dir, just don't
				769	* copy the back ref in. The link count fixup code will take
				770	* care of the rest
				771	*/
				772	dir = read_one_inode(root, key->offset);
				773	if (!dir)
				774	return -ENOENT;
				775
				776	inode = read_one_inode(root, key->objectid);
				777	BUG_ON(!dir);
				778
				779	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				780	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				781
				782	again:
				783	ref = (struct btrfs_inode_ref *)ref_ptr;
				784
				785	namelen = btrfs_inode_ref_name_len(eb, ref);
				786	name = kmalloc(namelen, GFP_NOFS);
				787	BUG_ON(!name);
				788
				789	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				790
				791	/* if we already have a perfect match, we're done */
				792	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
				793	btrfs_inode_ref_index(eb, ref),
				794	name, namelen)) {
				795	goto out;
				796	}
				797
				798	/*
				799	* look for a conflicting back reference in the metadata.
				800	* if we find one we have to unlink that name of the file
				801	* before we add our new link. Later on, we overwrite any
				802	* existing back reference, and we don't want to create
				803	* dangling pointers in the directory.
				804	*/
				805	conflict_again:
				806	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				807	if (ret == 0) {
				808	char *victim_name;
				809	int victim_name_len;
				810	struct btrfs_inode_ref *victim_ref;
				811	unsigned long ptr;
				812	unsigned long ptr_end;
				813	struct extent_buffer *leaf = path->nodes[0];
				814
				815	/* are we trying to overwrite a back ref for the root directory
				816	* if so, just jump out, we're done
				817	*/
				818	if (key->objectid == key->offset)
				819	goto out_nowrite;
				820
				821	/* check all the names in this back reference to see
				822	* if they are in the log. if so, we allow them to stay
				823	* otherwise they must be unlinked as a conflict
				824	*/
				825	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				826	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
				827	while(ptr < ptr_end) {
				828	victim_ref = (struct btrfs_inode_ref *)ptr;
				829	victim_name_len = btrfs_inode_ref_name_len(leaf,
				830	victim_ref);
				831	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				832	BUG_ON(!victim_name);
				833
				834	read_extent_buffer(leaf, victim_name,
				835	(unsigned long)(victim_ref + 1),
				836	victim_name_len);
				837
				838	if (!backref_in_log(log, key, victim_name,
				839	victim_name_len)) {
				840	btrfs_inc_nlink(inode);
				841	btrfs_release_path(root, path);
				842	ret = btrfs_unlink_inode(trans, root, dir,
				843	inode, victim_name,
				844	victim_name_len);
				845	kfree(victim_name);
				846	btrfs_release_path(root, path);
				847	goto conflict_again;
				848	}
				849	kfree(victim_name);
				850	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				851	}
				852	BUG_ON(ret);
				853	}
				854	btrfs_release_path(root, path);
				855
				856	/* look for a conflicting sequence number */
				857	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
				858	btrfs_inode_ref_index(eb, ref),
				859	name, namelen, 0);
				860	if (di && !IS_ERR(di)) {
				861	ret = drop_one_dir_item(trans, root, path, dir, di);
				862	BUG_ON(ret);
				863	}
				864	btrfs_release_path(root, path);
				865
				866
				867	/* look for a conflicting name */
				868	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				869	name, namelen, 0);
				870	if (di && !IS_ERR(di)) {
				871	ret = drop_one_dir_item(trans, root, path, dir, di);
				872	BUG_ON(ret);
				873	}
				874	btrfs_release_path(root, path);
				875
				876	/* insert our name */
				877	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
				878	btrfs_inode_ref_index(eb, ref));
				879	BUG_ON(ret);
				880
				881	btrfs_update_inode(trans, root, inode);
				882
				883	out:
				884	ref_ptr = (unsigned long)(ref + 1) + namelen;
				885	kfree(name);
				886	if (ref_ptr < ref_end)
				887	goto again;
				888
				889	/* finally write the back reference in the inode */
				890	ret = overwrite_item(trans, root, path, eb, slot, key);
				891	BUG_ON(ret);
				892
				893	out_nowrite:
				894	btrfs_release_path(root, path);
				895	iput(dir);
				896	iput(inode);
				897	return 0;
				898	}
				899
				900	/*
				901	* replay one csum item from the log tree into the subvolume 'root'
				902	* eb, slot and key all refer to the log tree
				903	* path is for temp use by this function and should be released on return
				904	*
				905	* This copies the checksums out of the log tree and inserts them into
				906	* the subvolume. Any existing checksums for this range in the file
				907	* are overwritten, and new items are added where required.
				908	*
				909	* We keep this simple by reusing the btrfs_ordered_sum code from
				910	* the data=ordered mode. This basically means making a copy
				911	* of all the checksums in ram, which we have to do anyway for kmap
				912	* rules.
				913	*
				914	* The copy is then sent down to btrfs_csum_file_blocks, which
				915	* does all the hard work of finding existing items in the file
				916	* or adding new ones.
				917	*/
				918	static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
				919	struct btrfs_root *root,
				920	struct btrfs_path *path,
				921	struct extent_buffer *eb, int slot,
				922	struct btrfs_key *key)
				923	{
				924	int ret;
				925	u32 item_size = btrfs_item_size_nr(eb, slot);
				926	u64 cur_offset;
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	927	u16 csum_size =
				928	btrfs_super_csum_size(&root->fs_info->super_copy);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	929	unsigned long file_bytes;
				930	struct btrfs_ordered_sum *sums;
				931	struct btrfs_sector_sum *sector_sum;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	932	unsigned long ptr;
				933
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	934	file_bytes = (item_size / csum_size) * root->sectorsize;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	935	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
				936	if (!sums) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	937	return -ENOMEM;
				938	}
				939
				940	INIT_LIST_HEAD(&sums->list);
				941	sums->len = file_bytes;
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	942	sums->bytenr = key->offset;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	943
				944	/*
				945	* copy all the sums into the ordered sum struct
				946	*/
				947	sector_sum = sums->sums;
				948	cur_offset = key->offset;
				949	ptr = btrfs_item_ptr_offset(eb, slot);
				950	while(item_size > 0) {
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	951	sector_sum->bytenr = cur_offset;
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	952	read_extent_buffer(eb, &sector_sum->sum, ptr, csum_size);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	953	sector_sum++;
Josef Bacik	607d432	2008-12-02 07:17:45 -0500	[diff] [blame]	954	item_size -= csum_size;
				955	ptr += csum_size;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	956	cur_offset += root->sectorsize;
				957	}
				958
				959	/* let btrfs_csum_file_blocks add them into the file */
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	960	ret = btrfs_csum_file_blocks(trans, root->fs_info->csum_root, sums);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	961	BUG_ON(ret);
				962	kfree(sums);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	963	return 0;
				964	}
				965	/*
				966	* There are a few corners where the link count of the file can't
				967	* be properly maintained during replay. So, instead of adding
				968	* lots of complexity to the log code, we just scan the backrefs
				969	* for any file that has been through replay.
				970	*
				971	* The scan will update the link count on the inode to reflect the
				972	* number of back refs found. If it goes down to zero, the iput
				973	* will free the inode.
				974	*/
				975	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				976	struct btrfs_root *root,
				977	struct inode *inode)
				978	{
				979	struct btrfs_path *path;
				980	int ret;
				981	struct btrfs_key key;
				982	u64 nlink = 0;
				983	unsigned long ptr;
				984	unsigned long ptr_end;
				985	int name_len;
				986
				987	key.objectid = inode->i_ino;
				988	key.type = BTRFS_INODE_REF_KEY;
				989	key.offset = (u64)-1;
				990
				991	path = btrfs_alloc_path();
				992
				993	while(1) {
				994	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				995	if (ret < 0)
				996	break;
				997	if (ret > 0) {
				998	if (path->slots[0] == 0)
				999	break;
				1000	path->slots[0]--;
				1001	}
				1002	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1003	path->slots[0]);
				1004	if (key.objectid != inode->i_ino \|\|
				1005	key.type != BTRFS_INODE_REF_KEY)
				1006	break;
				1007	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				1008	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				1009	path->slots[0]);
				1010	while(ptr < ptr_end) {
				1011	struct btrfs_inode_ref *ref;
				1012
				1013	ref = (struct btrfs_inode_ref *)ptr;
				1014	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				1015	ref);
				1016	ptr = (unsigned long)(ref + 1) + name_len;
				1017	nlink++;
				1018	}
				1019
				1020	if (key.offset == 0)
				1021	break;
				1022	key.offset--;
				1023	btrfs_release_path(root, path);
				1024	}
				1025	btrfs_free_path(path);
				1026	if (nlink != inode->i_nlink) {
				1027	inode->i_nlink = nlink;
				1028	btrfs_update_inode(trans, root, inode);
				1029	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	1030	BTRFS_I(inode)->index_cnt = (u64)-1;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1031
				1032	return 0;
				1033	}
				1034
				1035	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				1036	struct btrfs_root *root,
				1037	struct btrfs_path *path)
				1038	{
				1039	int ret;
				1040	struct btrfs_key key;
				1041	struct inode *inode;
				1042
				1043	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1044	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1045	key.offset = (u64)-1;
				1046	while(1) {
				1047	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1048	if (ret < 0)
				1049	break;
				1050
				1051	if (ret == 1) {
				1052	if (path->slots[0] == 0)
				1053	break;
				1054	path->slots[0]--;
				1055	}
				1056
				1057	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1058	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				1059	key.type != BTRFS_ORPHAN_ITEM_KEY)
				1060	break;
				1061
				1062	ret = btrfs_del_item(trans, root, path);
				1063	BUG_ON(ret);
				1064
				1065	btrfs_release_path(root, path);
				1066	inode = read_one_inode(root, key.offset);
				1067	BUG_ON(!inode);
				1068
				1069	ret = fixup_inode_link_count(trans, root, inode);
				1070	BUG_ON(ret);
				1071
				1072	iput(inode);
				1073
				1074	if (key.offset == 0)
				1075	break;
				1076	key.offset--;
				1077	}
				1078	btrfs_release_path(root, path);
				1079	return 0;
				1080	}
				1081
				1082
				1083	/*
				1084	* record a given inode in the fixup dir so we can check its link
				1085	* count when replay is done. The link count is incremented here
				1086	* so the inode won't go away until we check it
				1087	*/
				1088	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				1089	struct btrfs_root *root,
				1090	struct btrfs_path *path,
				1091	u64 objectid)
				1092	{
				1093	struct btrfs_key key;
				1094	int ret = 0;
				1095	struct inode *inode;
				1096
				1097	inode = read_one_inode(root, objectid);
				1098	BUG_ON(!inode);
				1099
				1100	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1101	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
				1102	key.offset = objectid;
				1103
				1104	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1105
				1106	btrfs_release_path(root, path);
				1107	if (ret == 0) {
				1108	btrfs_inc_nlink(inode);
				1109	btrfs_update_inode(trans, root, inode);
				1110	} else if (ret == -EEXIST) {
				1111	ret = 0;
				1112	} else {
				1113	BUG();
				1114	}
				1115	iput(inode);
				1116
				1117	return ret;
				1118	}
				1119
				1120	/*
				1121	* when replaying the log for a directory, we only insert names
				1122	* for inodes that actually exist. This means an fsync on a directory
				1123	* does not implicitly fsync all the new files in it
				1124	*/
				1125	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1126	struct btrfs_root *root,
				1127	struct btrfs_path *path,
				1128	u64 dirid, u64 index,
				1129	char *name, int name_len, u8 type,
				1130	struct btrfs_key *location)
				1131	{
				1132	struct inode *inode;
				1133	struct inode *dir;
				1134	int ret;
				1135
				1136	inode = read_one_inode(root, location->objectid);
				1137	if (!inode)
				1138	return -ENOENT;
				1139
				1140	dir = read_one_inode(root, dirid);
				1141	if (!dir) {
				1142	iput(inode);
				1143	return -EIO;
				1144	}
				1145	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
				1146
				1147	/* FIXME, put inode into FIXUP list */
				1148
				1149	iput(inode);
				1150	iput(dir);
				1151	return ret;
				1152	}
				1153
				1154	/*
				1155	* take a single entry in a log directory item and replay it into
				1156	* the subvolume.
				1157	*
				1158	* if a conflicting item exists in the subdirectory already,
				1159	* the inode it points to is unlinked and put into the link count
				1160	* fix up tree.
				1161	*
				1162	* If a name from the log points to a file or directory that does
				1163	* not exist in the FS, it is skipped. fsyncs on directories
				1164	* do not force down inodes inside that directory, just changes to the
				1165	* names or unlinks in a directory.
				1166	*/
				1167	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1168	struct btrfs_root *root,
				1169	struct btrfs_path *path,
				1170	struct extent_buffer *eb,
				1171	struct btrfs_dir_item *di,
				1172	struct btrfs_key *key)
				1173	{
				1174	char *name;
				1175	int name_len;
				1176	struct btrfs_dir_item *dst_di;
				1177	struct btrfs_key found_key;
				1178	struct btrfs_key log_key;
				1179	struct inode *dir;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1180	u8 log_type;
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1181	int exists;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1182	int ret;
				1183
				1184	dir = read_one_inode(root, key->objectid);
				1185	BUG_ON(!dir);
				1186
				1187	name_len = btrfs_dir_name_len(eb, di);
				1188	name = kmalloc(name_len, GFP_NOFS);
				1189	log_type = btrfs_dir_type(eb, di);
				1190	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1191	name_len);
				1192
				1193	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1194	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1195	if (exists == 0)
				1196	exists = 1;
				1197	else
				1198	exists = 0;
				1199	btrfs_release_path(root, path);
				1200
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1201	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1202	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1203	name, name_len, 1);
				1204	}
				1205	else if (key->type == BTRFS_DIR_INDEX_KEY) {
				1206	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1207	key->objectid,
				1208	key->offset, name,
				1209	name_len, 1);
				1210	} else {
				1211	BUG();
				1212	}
				1213	if (!dst_di \|\| IS_ERR(dst_di)) {
				1214	/* we need a sequence number to insert, so we only
				1215	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1216	*/
				1217	if (key->type != BTRFS_DIR_INDEX_KEY)
				1218	goto out;
				1219	goto insert;
				1220	}
				1221
				1222	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1223	/* the existing item matches the logged item */
				1224	if (found_key.objectid == log_key.objectid &&
				1225	found_key.type == log_key.type &&
				1226	found_key.offset == log_key.offset &&
				1227	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				1228	goto out;
				1229	}
				1230
				1231	/*
				1232	* don't drop the conflicting directory entry if the inode
				1233	* for the new entry doesn't exist
				1234	*/
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1235	if (!exists)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1236	goto out;
				1237
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1238	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
				1239	BUG_ON(ret);
				1240
				1241	if (key->type == BTRFS_DIR_INDEX_KEY)
				1242	goto insert;
				1243	out:
				1244	btrfs_release_path(root, path);
				1245	kfree(name);
				1246	iput(dir);
				1247	return 0;
				1248
				1249	insert:
				1250	btrfs_release_path(root, path);
				1251	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
				1252	name, name_len, log_type, &log_key);
				1253
				1254	if (ret && ret != -ENOENT)
				1255	BUG();
				1256	goto out;
				1257	}
				1258
				1259	/*
				1260	* find all the names in a directory item and reconcile them into
				1261	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				1262	* one name in a directory item, but the same code gets used for
				1263	* both directory index types
				1264	*/
				1265	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				1266	struct btrfs_root *root,
				1267	struct btrfs_path *path,
				1268	struct extent_buffer *eb, int slot,
				1269	struct btrfs_key *key)
				1270	{
				1271	int ret;
				1272	u32 item_size = btrfs_item_size_nr(eb, slot);
				1273	struct btrfs_dir_item *di;
				1274	int name_len;
				1275	unsigned long ptr;
				1276	unsigned long ptr_end;
				1277
				1278	ptr = btrfs_item_ptr_offset(eb, slot);
				1279	ptr_end = ptr + item_size;
				1280	while(ptr < ptr_end) {
				1281	di = (struct btrfs_dir_item *)ptr;
				1282	name_len = btrfs_dir_name_len(eb, di);
				1283	ret = replay_one_name(trans, root, path, eb, di, key);
				1284	BUG_ON(ret);
				1285	ptr = (unsigned long)(di + 1);
				1286	ptr += name_len;
				1287	}
				1288	return 0;
				1289	}
				1290
				1291	/*
				1292	* directory replay has two parts. There are the standard directory
				1293	* items in the log copied from the subvolume, and range items
				1294	* created in the log while the subvolume was logged.
				1295	*
				1296	* The range items tell us which parts of the key space the log
				1297	* is authoritative for. During replay, if a key in the subvolume
				1298	* directory is in a logged range item, but not actually in the log
				1299	* that means it was deleted from the directory before the fsync
				1300	* and should be removed.
				1301	*/
				1302	static noinline int find_dir_range(struct btrfs_root *root,
				1303	struct btrfs_path *path,
				1304	u64 dirid, int key_type,
				1305	u64 start_ret, u64 end_ret)
				1306	{
				1307	struct btrfs_key key;
				1308	u64 found_end;
				1309	struct btrfs_dir_log_item *item;
				1310	int ret;
				1311	int nritems;
				1312
				1313	if (*start_ret == (u64)-1)
				1314	return 1;
				1315
				1316	key.objectid = dirid;
				1317	key.type = key_type;
				1318	key.offset = *start_ret;
				1319
				1320	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1321	if (ret < 0)
				1322	goto out;
				1323	if (ret > 0) {
				1324	if (path->slots[0] == 0)
				1325	goto out;
				1326	path->slots[0]--;
				1327	}
				1328	if (ret != 0)
				1329	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1330
				1331	if (key.type != key_type \|\| key.objectid != dirid) {
				1332	ret = 1;
				1333	goto next;
				1334	}
				1335	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1336	struct btrfs_dir_log_item);
				1337	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1338
				1339	if (start_ret >= key.offset && start_ret <= found_end) {
				1340	ret = 0;
				1341	*start_ret = key.offset;
				1342	*end_ret = found_end;
				1343	goto out;
				1344	}
				1345	ret = 1;
				1346	next:
				1347	/* check the next slot in the tree to see if it is a valid item */
				1348	nritems = btrfs_header_nritems(path->nodes[0]);
				1349	if (path->slots[0] >= nritems) {
				1350	ret = btrfs_next_leaf(root, path);
				1351	if (ret)
				1352	goto out;
				1353	} else {
				1354	path->slots[0]++;
				1355	}
				1356
				1357	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1358
				1359	if (key.type != key_type \|\| key.objectid != dirid) {
				1360	ret = 1;
				1361	goto out;
				1362	}
				1363	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1364	struct btrfs_dir_log_item);
				1365	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1366	*start_ret = key.offset;
				1367	*end_ret = found_end;
				1368	ret = 0;
				1369	out:
				1370	btrfs_release_path(root, path);
				1371	return ret;
				1372	}
				1373
				1374	/*
				1375	* this looks for a given directory item in the log. If the directory
				1376	* item is not in the log, the item is removed and the inode it points
				1377	* to is unlinked
				1378	*/
				1379	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				1380	struct btrfs_root *root,
				1381	struct btrfs_root *log,
				1382	struct btrfs_path *path,
				1383	struct btrfs_path *log_path,
				1384	struct inode *dir,
				1385	struct btrfs_key *dir_key)
				1386	{
				1387	int ret;
				1388	struct extent_buffer *eb;
				1389	int slot;
				1390	u32 item_size;
				1391	struct btrfs_dir_item *di;
				1392	struct btrfs_dir_item *log_di;
				1393	int name_len;
				1394	unsigned long ptr;
				1395	unsigned long ptr_end;
				1396	char *name;
				1397	struct inode *inode;
				1398	struct btrfs_key location;
				1399
				1400	again:
				1401	eb = path->nodes[0];
				1402	slot = path->slots[0];
				1403	item_size = btrfs_item_size_nr(eb, slot);
				1404	ptr = btrfs_item_ptr_offset(eb, slot);
				1405	ptr_end = ptr + item_size;
				1406	while(ptr < ptr_end) {
				1407	di = (struct btrfs_dir_item *)ptr;
				1408	name_len = btrfs_dir_name_len(eb, di);
				1409	name = kmalloc(name_len, GFP_NOFS);
				1410	if (!name) {
				1411	ret = -ENOMEM;
				1412	goto out;
				1413	}
				1414	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1415	name_len);
				1416	log_di = NULL;
				1417	if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
				1418	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				1419	dir_key->objectid,
				1420	name, name_len, 0);
				1421	} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
				1422	log_di = btrfs_lookup_dir_index_item(trans, log,
				1423	log_path,
				1424	dir_key->objectid,
				1425	dir_key->offset,
				1426	name, name_len, 0);
				1427	}
				1428	if (!log_di \|\| IS_ERR(log_di)) {
				1429	btrfs_dir_item_key_to_cpu(eb, di, &location);
				1430	btrfs_release_path(root, path);
				1431	btrfs_release_path(log, log_path);
				1432	inode = read_one_inode(root, location.objectid);
				1433	BUG_ON(!inode);
				1434
				1435	ret = link_to_fixup_dir(trans, root,
				1436	path, location.objectid);
				1437	BUG_ON(ret);
				1438	btrfs_inc_nlink(inode);
				1439	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1440	name, name_len);
				1441	BUG_ON(ret);
				1442	kfree(name);
				1443	iput(inode);
				1444
				1445	/* there might still be more names under this key
				1446	* check and repeat if required
				1447	*/
				1448	ret = btrfs_search_slot(NULL, root, dir_key, path,
				1449	0, 0);
				1450	if (ret == 0)
				1451	goto again;
				1452	ret = 0;
				1453	goto out;
				1454	}
				1455	btrfs_release_path(log, log_path);
				1456	kfree(name);
				1457
				1458	ptr = (unsigned long)(di + 1);
				1459	ptr += name_len;
				1460	}
				1461	ret = 0;
				1462	out:
				1463	btrfs_release_path(root, path);
				1464	btrfs_release_path(log, log_path);
				1465	return ret;
				1466	}
				1467
				1468	/*
				1469	* deletion replay happens before we copy any new directory items
				1470	* out of the log or out of backreferences from inodes. It
				1471	* scans the log to find ranges of keys that log is authoritative for,
				1472	* and then scans the directory to find items in those ranges that are
				1473	* not present in the log.
				1474	*
				1475	* Anything we don't find in the log is unlinked and removed from the
				1476	* directory.
				1477	*/
				1478	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				1479	struct btrfs_root *root,
				1480	struct btrfs_root *log,
				1481	struct btrfs_path *path,
				1482	u64 dirid)
				1483	{
				1484	u64 range_start;
				1485	u64 range_end;
				1486	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				1487	int ret = 0;
				1488	struct btrfs_key dir_key;
				1489	struct btrfs_key found_key;
				1490	struct btrfs_path *log_path;
				1491	struct inode *dir;
				1492
				1493	dir_key.objectid = dirid;
				1494	dir_key.type = BTRFS_DIR_ITEM_KEY;
				1495	log_path = btrfs_alloc_path();
				1496	if (!log_path)
				1497	return -ENOMEM;
				1498
				1499	dir = read_one_inode(root, dirid);
				1500	/* it isn't an error if the inode isn't there, that can happen
				1501	* because we replay the deletes before we copy in the inode item
				1502	* from the log
				1503	*/
				1504	if (!dir) {
				1505	btrfs_free_path(log_path);
				1506	return 0;
				1507	}
				1508	again:
				1509	range_start = 0;
				1510	range_end = 0;
				1511	while(1) {
				1512	ret = find_dir_range(log, path, dirid, key_type,
				1513	&range_start, &range_end);
				1514	if (ret != 0)
				1515	break;
				1516
				1517	dir_key.offset = range_start;
				1518	while(1) {
				1519	int nritems;
				1520	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				1521	0, 0);
				1522	if (ret < 0)
				1523	goto out;
				1524
				1525	nritems = btrfs_header_nritems(path->nodes[0]);
				1526	if (path->slots[0] >= nritems) {
				1527	ret = btrfs_next_leaf(root, path);
				1528	if (ret)
				1529	break;
				1530	}
				1531	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1532	path->slots[0]);
				1533	if (found_key.objectid != dirid \|\|
				1534	found_key.type != dir_key.type)
				1535	goto next_type;
				1536
				1537	if (found_key.offset > range_end)
				1538	break;
				1539
				1540	ret = check_item_in_log(trans, root, log, path,
				1541	log_path, dir, &found_key);
				1542	BUG_ON(ret);
				1543	if (found_key.offset == (u64)-1)
				1544	break;
				1545	dir_key.offset = found_key.offset + 1;
				1546	}
				1547	btrfs_release_path(root, path);
				1548	if (range_end == (u64)-1)
				1549	break;
				1550	range_start = range_end + 1;
				1551	}
				1552
				1553	next_type:
				1554	ret = 0;
				1555	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				1556	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				1557	dir_key.type = BTRFS_DIR_INDEX_KEY;
				1558	btrfs_release_path(root, path);
				1559	goto again;
				1560	}
				1561	out:
				1562	btrfs_release_path(root, path);
				1563	btrfs_free_path(log_path);
				1564	iput(dir);
				1565	return ret;
				1566	}
				1567
				1568	/*
				1569	* the process_func used to replay items from the log tree. This
				1570	* gets called in two different stages. The first stage just looks
				1571	* for inodes and makes sure they are all copied into the subvolume.
				1572	*
				1573	* The second stage copies all the other item types from the log into
				1574	* the subvolume. The two stage approach is slower, but gets rid of
				1575	* lots of complexity around inodes referencing other inodes that exist
				1576	* only in the log (references come from either directory items or inode
				1577	* back refs).
				1578	*/
				1579	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				1580	struct walk_control *wc, u64 gen)
				1581	{
				1582	int nritems;
				1583	struct btrfs_path *path;
				1584	struct btrfs_root *root = wc->replay_dest;
				1585	struct btrfs_key key;
				1586	u32 item_size;
				1587	int level;
				1588	int i;
				1589	int ret;
				1590
				1591	btrfs_read_buffer(eb, gen);
				1592
				1593	level = btrfs_header_level(eb);
				1594
				1595	if (level != 0)
				1596	return 0;
				1597
				1598	path = btrfs_alloc_path();
				1599	BUG_ON(!path);
				1600
				1601	nritems = btrfs_header_nritems(eb);
				1602	for (i = 0; i < nritems; i++) {
				1603	btrfs_item_key_to_cpu(eb, &key, i);
				1604	item_size = btrfs_item_size_nr(eb, i);
				1605
				1606	/* inode keys are done during the first stage */
				1607	if (key.type == BTRFS_INODE_ITEM_KEY &&
				1608	wc->stage == LOG_WALK_REPLAY_INODES) {
				1609	struct inode *inode;
				1610	struct btrfs_inode_item *inode_item;
				1611	u32 mode;
				1612
				1613	inode_item = btrfs_item_ptr(eb, i,
				1614	struct btrfs_inode_item);
				1615	mode = btrfs_inode_mode(eb, inode_item);
				1616	if (S_ISDIR(mode)) {
				1617	ret = replay_dir_deletes(wc->trans,
				1618	root, log, path, key.objectid);
				1619	BUG_ON(ret);
				1620	}
				1621	ret = overwrite_item(wc->trans, root, path,
				1622	eb, i, &key);
				1623	BUG_ON(ret);
				1624
				1625	/* for regular files, truncate away
				1626	* extents past the new EOF
				1627	*/
				1628	if (S_ISREG(mode)) {
				1629	inode = read_one_inode(root,
				1630	key.objectid);
				1631	BUG_ON(!inode);
				1632
				1633	ret = btrfs_truncate_inode_items(wc->trans,
				1634	root, inode, inode->i_size,
				1635	BTRFS_EXTENT_DATA_KEY);
				1636	BUG_ON(ret);
				1637	iput(inode);
				1638	}
				1639	ret = link_to_fixup_dir(wc->trans, root,
				1640	path, key.objectid);
				1641	BUG_ON(ret);
				1642	}
				1643	if (wc->stage < LOG_WALK_REPLAY_ALL)
				1644	continue;
				1645
				1646	/* these keys are simply copied */
				1647	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				1648	ret = overwrite_item(wc->trans, root, path,
				1649	eb, i, &key);
				1650	BUG_ON(ret);
				1651	} else if (key.type == BTRFS_INODE_REF_KEY) {
				1652	ret = add_inode_ref(wc->trans, root, log, path,
				1653	eb, i, &key);
				1654	BUG_ON(ret && ret != -ENOENT);
				1655	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				1656	ret = replay_one_extent(wc->trans, root, path,
				1657	eb, i, &key);
				1658	BUG_ON(ret);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	1659	} else if (key.type == BTRFS_EXTENT_CSUM_KEY) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1660	ret = replay_one_csum(wc->trans, root, path,
				1661	eb, i, &key);
				1662	BUG_ON(ret);
				1663	} else if (key.type == BTRFS_DIR_ITEM_KEY \|\|
				1664	key.type == BTRFS_DIR_INDEX_KEY) {
				1665	ret = replay_one_dir_item(wc->trans, root, path,
				1666	eb, i, &key);
				1667	BUG_ON(ret);
				1668	}
				1669	}
				1670	btrfs_free_path(path);
				1671	return 0;
				1672	}
				1673
				1674	static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
				1675	struct btrfs_root *root,
				1676	struct btrfs_path path, int level,
				1677	struct walk_control *wc)
				1678	{
				1679	u64 root_owner;
				1680	u64 root_gen;
				1681	u64 bytenr;
				1682	u64 ptr_gen;
				1683	struct extent_buffer *next;
				1684	struct extent_buffer *cur;
				1685	struct extent_buffer *parent;
				1686	u32 blocksize;
				1687	int ret = 0;
				1688
				1689	WARN_ON(*level < 0);
				1690	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1691
				1692	while(*level > 0) {
				1693	WARN_ON(*level < 0);
				1694	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1695	cur = path->nodes[*level];
				1696
				1697	if (btrfs_header_level(cur) != *level)
				1698	WARN_ON(1);
				1699
				1700	if (path->slots[*level] >=
				1701	btrfs_header_nritems(cur))
				1702	break;
				1703
				1704	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				1705	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				1706	blocksize = btrfs_level_size(root, *level - 1);
				1707
				1708	parent = path->nodes[*level];
				1709	root_owner = btrfs_header_owner(parent);
				1710	root_gen = btrfs_header_generation(parent);
				1711
				1712	next = btrfs_find_create_tree_block(root, bytenr, blocksize);
				1713
				1714	wc->process_func(root, next, wc, ptr_gen);
				1715
				1716	if (*level == 1) {
				1717	path->slots[*level]++;
				1718	if (wc->free) {
				1719	btrfs_read_buffer(next, ptr_gen);
				1720
				1721	btrfs_tree_lock(next);
				1722	clean_tree_block(trans, root, next);
				1723	btrfs_wait_tree_block_writeback(next);
				1724	btrfs_tree_unlock(next);
				1725
				1726	ret = btrfs_drop_leaf_ref(trans, root, next);
				1727	BUG_ON(ret);
				1728
				1729	WARN_ON(root_owner !=
				1730	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1731	ret = btrfs_free_reserved_extent(root,
				1732	bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1733	BUG_ON(ret);
				1734	}
				1735	free_extent_buffer(next);
				1736	continue;
				1737	}
				1738	btrfs_read_buffer(next, ptr_gen);
				1739
				1740	WARN_ON(*level <= 0);
				1741	if (path->nodes[*level-1])
				1742	free_extent_buffer(path->nodes[*level-1]);
				1743	path->nodes[*level-1] = next;
				1744	*level = btrfs_header_level(next);
				1745	path->slots[*level] = 0;
				1746	cond_resched();
				1747	}
				1748	WARN_ON(*level < 0);
				1749	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1750
				1751	if (path->nodes[*level] == root->node) {
				1752	parent = path->nodes[*level];
				1753	} else {
				1754	parent = path->nodes[*level + 1];
				1755	}
				1756	bytenr = path->nodes[*level]->start;
				1757
				1758	blocksize = btrfs_level_size(root, *level);
				1759	root_owner = btrfs_header_owner(parent);
				1760	root_gen = btrfs_header_generation(parent);
				1761
				1762	wc->process_func(root, path->nodes[*level], wc,
				1763	btrfs_header_generation(path->nodes[*level]));
				1764
				1765	if (wc->free) {
				1766	next = path->nodes[*level];
				1767	btrfs_tree_lock(next);
				1768	clean_tree_block(trans, root, next);
				1769	btrfs_wait_tree_block_writeback(next);
				1770	btrfs_tree_unlock(next);
				1771
				1772	if (*level == 0) {
				1773	ret = btrfs_drop_leaf_ref(trans, root, next);
				1774	BUG_ON(ret);
				1775	}
				1776	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1777	ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1778	BUG_ON(ret);
				1779	}
				1780	free_extent_buffer(path->nodes[*level]);
				1781	path->nodes[*level] = NULL;
				1782	*level += 1;
				1783
				1784	cond_resched();
				1785	return 0;
				1786	}
				1787
				1788	static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
				1789	struct btrfs_root *root,
				1790	struct btrfs_path path, int level,
				1791	struct walk_control *wc)
				1792	{
				1793	u64 root_owner;
				1794	u64 root_gen;
				1795	int i;
				1796	int slot;
				1797	int ret;
				1798
				1799	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
				1800	slot = path->slots[i];
				1801	if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
				1802	struct extent_buffer *node;
				1803	node = path->nodes[i];
				1804	path->slots[i]++;
				1805	*level = i;
				1806	WARN_ON(*level == 0);
				1807	return 0;
				1808	} else {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	1809	struct extent_buffer *parent;
				1810	if (path->nodes[*level] == root->node)
				1811	parent = path->nodes[*level];
				1812	else
				1813	parent = path->nodes[*level + 1];
				1814
				1815	root_owner = btrfs_header_owner(parent);
				1816	root_gen = btrfs_header_generation(parent);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1817	wc->process_func(root, path->nodes[*level], wc,
				1818	btrfs_header_generation(path->nodes[*level]));
				1819	if (wc->free) {
				1820	struct extent_buffer *next;
				1821
				1822	next = path->nodes[*level];
				1823
				1824	btrfs_tree_lock(next);
				1825	clean_tree_block(trans, root, next);
				1826	btrfs_wait_tree_block_writeback(next);
				1827	btrfs_tree_unlock(next);
				1828
				1829	if (*level == 0) {
				1830	ret = btrfs_drop_leaf_ref(trans, root,
				1831	next);
				1832	BUG_ON(ret);
				1833	}
				1834
				1835	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1836	ret = btrfs_free_reserved_extent(root,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1837	path->nodes[*level]->start,
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1838	path->nodes[*level]->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1839	BUG_ON(ret);
				1840	}
				1841	free_extent_buffer(path->nodes[*level]);
				1842	path->nodes[*level] = NULL;
				1843	*level = i + 1;
				1844	}
				1845	}
				1846	return 1;
				1847	}
				1848
				1849	/*
				1850	* drop the reference count on the tree rooted at 'snap'. This traverses
				1851	* the tree freeing any blocks that have a ref count of zero after being
				1852	* decremented.
				1853	*/
				1854	static int walk_log_tree(struct btrfs_trans_handle *trans,
				1855	struct btrfs_root log, struct walk_control wc)
				1856	{
				1857	int ret = 0;
				1858	int wret;
				1859	int level;
				1860	struct btrfs_path *path;
				1861	int i;
				1862	int orig_level;
				1863
				1864	path = btrfs_alloc_path();
				1865	BUG_ON(!path);
				1866
				1867	level = btrfs_header_level(log->node);
				1868	orig_level = level;
				1869	path->nodes[level] = log->node;
				1870	extent_buffer_get(log->node);
				1871	path->slots[level] = 0;
				1872
				1873	while(1) {
				1874	wret = walk_down_log_tree(trans, log, path, &level, wc);
				1875	if (wret > 0)
				1876	break;
				1877	if (wret < 0)
				1878	ret = wret;
				1879
				1880	wret = walk_up_log_tree(trans, log, path, &level, wc);
				1881	if (wret > 0)
				1882	break;
				1883	if (wret < 0)
				1884	ret = wret;
				1885	}
				1886
				1887	/* was the root node processed? if not, catch it here */
				1888	if (path->nodes[orig_level]) {
				1889	wc->process_func(log, path->nodes[orig_level], wc,
				1890	btrfs_header_generation(path->nodes[orig_level]));
				1891	if (wc->free) {
				1892	struct extent_buffer *next;
				1893
				1894	next = path->nodes[orig_level];
				1895
				1896	btrfs_tree_lock(next);
				1897	clean_tree_block(trans, log, next);
				1898	btrfs_wait_tree_block_writeback(next);
				1899	btrfs_tree_unlock(next);
				1900
				1901	if (orig_level == 0) {
				1902	ret = btrfs_drop_leaf_ref(trans, log,
				1903	next);
				1904	BUG_ON(ret);
				1905	}
				1906	WARN_ON(log->root_key.objectid !=
				1907	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1908	ret = btrfs_free_reserved_extent(log, next->start,
				1909	next->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1910	BUG_ON(ret);
				1911	}
				1912	}
				1913
				1914	for (i = 0; i <= orig_level; i++) {
				1915	if (path->nodes[i]) {
				1916	free_extent_buffer(path->nodes[i]);
				1917	path->nodes[i] = NULL;
				1918	}
				1919	}
				1920	btrfs_free_path(path);
				1921	if (wc->free)
				1922	free_extent_buffer(log->node);
				1923	return ret;
				1924	}
				1925
Christoph Hellwig	b295086	2008-12-02 09:54:17 -0500	[diff] [blame]	1926	static int wait_log_commit(struct btrfs_root *log)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1927	{
				1928	DEFINE_WAIT(wait);
				1929	u64 transid = log->fs_info->tree_log_transid;
				1930
				1931	do {
				1932	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1933	TASK_UNINTERRUPTIBLE);
				1934	mutex_unlock(&log->fs_info->tree_log_mutex);
				1935	if (atomic_read(&log->fs_info->tree_log_commit))
				1936	schedule();
				1937	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1938	mutex_lock(&log->fs_info->tree_log_mutex);
				1939	} while(transid == log->fs_info->tree_log_transid &&
				1940	atomic_read(&log->fs_info->tree_log_commit));
				1941	return 0;
				1942	}
				1943
				1944	/*
				1945	* btrfs_sync_log does sends a given tree log down to the disk and
				1946	* updates the super blocks to record it. When this call is done,
				1947	* you know that any inodes previously logged are safely on disk
				1948	*/
				1949	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				1950	struct btrfs_root *root)
				1951	{
				1952	int ret;
				1953	unsigned long batch;
				1954	struct btrfs_root *log = root->log_root;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1955
				1956	mutex_lock(&log->fs_info->tree_log_mutex);
				1957	if (atomic_read(&log->fs_info->tree_log_commit)) {
				1958	wait_log_commit(log);
				1959	goto out;
				1960	}
				1961	atomic_set(&log->fs_info->tree_log_commit, 1);
				1962
				1963	while(1) {
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	1964	batch = log->fs_info->tree_log_batch;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1965	mutex_unlock(&log->fs_info->tree_log_mutex);
				1966	schedule_timeout_uninterruptible(1);
				1967	mutex_lock(&log->fs_info->tree_log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1968
				1969	while(atomic_read(&log->fs_info->tree_log_writers)) {
				1970	DEFINE_WAIT(wait);
				1971	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1972	TASK_UNINTERRUPTIBLE);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1973	mutex_unlock(&log->fs_info->tree_log_mutex);
				1974	if (atomic_read(&log->fs_info->tree_log_writers))
				1975	schedule();
				1976	mutex_lock(&log->fs_info->tree_log_mutex);
				1977	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1978	}
				1979	if (batch == log->fs_info->tree_log_batch)
				1980	break;
				1981	}
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1982
				1983	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1984	BUG_ON(ret);
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1985	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
				1986	&root->fs_info->log_root_tree->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1987	BUG_ON(ret);
				1988
				1989	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
				1990	log->fs_info->log_root_tree->node->start);
				1991	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
				1992	btrfs_header_level(log->fs_info->log_root_tree->node));
				1993
Yan Zheng	a512bbf	2008-12-08 16:46:26 -0500	[diff] [blame]	1994	write_ctree_super(trans, log->fs_info->tree_root, 2);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1995	log->fs_info->tree_log_transid++;
				1996	log->fs_info->tree_log_batch = 0;
				1997	atomic_set(&log->fs_info->tree_log_commit, 0);
				1998	smp_mb();
				1999	if (waitqueue_active(&log->fs_info->tree_log_wait))
				2000	wake_up(&log->fs_info->tree_log_wait);
				2001	out:
				2002	mutex_unlock(&log->fs_info->tree_log_mutex);
				2003	return 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2004	}
				2005
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2006	/* * free all the extents used by the tree log. This should be called
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2007	* at commit time of the full transaction
				2008	*/
				2009	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				2010	{
				2011	int ret;
				2012	struct btrfs_root *log;
				2013	struct key;
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2014	u64 start;
				2015	u64 end;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2016	struct walk_control wc = {
				2017	.free = 1,
				2018	.process_func = process_one_buffer
				2019	};
				2020
				2021	if (!root->log_root)
				2022	return 0;
				2023
				2024	log = root->log_root;
				2025	ret = walk_log_tree(trans, log, &wc);
				2026	BUG_ON(ret);
				2027
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2028	while(1) {
				2029	ret = find_first_extent_bit(&log->dirty_log_pages,
				2030	0, &start, &end, EXTENT_DIRTY);
				2031	if (ret)
				2032	break;
				2033
				2034	clear_extent_dirty(&log->dirty_log_pages,
				2035	start, end, GFP_NOFS);
				2036	}
				2037
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2038	log = root->log_root;
				2039	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
				2040	&log->root_key);
				2041	BUG_ON(ret);
				2042	root->log_root = NULL;
				2043	kfree(root->log_root);
				2044	return 0;
				2045	}
				2046
				2047	/*
				2048	* helper function to update the item for a given subvolumes log root
				2049	* in the tree of log roots
				2050	*/
				2051	static int update_log_root(struct btrfs_trans_handle *trans,
				2052	struct btrfs_root *log)
				2053	{
				2054	u64 bytenr = btrfs_root_bytenr(&log->root_item);
				2055	int ret;
				2056
				2057	if (log->node->start == bytenr)
				2058	return 0;
				2059
				2060	btrfs_set_root_bytenr(&log->root_item, log->node->start);
Yan Zheng	84234f3	2008-10-29 14:49:05 -0400	[diff] [blame]	2061	btrfs_set_root_generation(&log->root_item, trans->transid);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2062	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
				2063	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				2064	&log->root_key, &log->root_item);
				2065	BUG_ON(ret);
				2066	return ret;
				2067	}
				2068
				2069	/*
				2070	* If both a file and directory are logged, and unlinks or renames are
				2071	* mixed in, we have a few interesting corners:
				2072	*
				2073	* create file X in dir Y
				2074	* link file X to X.link in dir Y
				2075	* fsync file X
				2076	* unlink file X but leave X.link
				2077	* fsync dir Y
				2078	*
				2079	* After a crash we would expect only X.link to exist. But file X
				2080	* didn't get fsync'd again so the log has back refs for X and X.link.
				2081	*
				2082	* We solve this by removing directory entries and inode backrefs from the
				2083	* log when a file that was logged in the current transaction is
				2084	* unlinked. Any later fsync will include the updated log entries, and
				2085	* we'll be able to reconstruct the proper directory items from backrefs.
				2086	*
				2087	* This optimizations allows us to avoid relogging the entire inode
				2088	* or the entire directory.
				2089	*/
				2090	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				2091	struct btrfs_root *root,
				2092	const char *name, int name_len,
				2093	struct inode *dir, u64 index)
				2094	{
				2095	struct btrfs_root *log;
				2096	struct btrfs_dir_item *di;
				2097	struct btrfs_path *path;
				2098	int ret;
				2099	int bytes_del = 0;
				2100
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2101	if (BTRFS_I(dir)->logged_trans < trans->transid)
				2102	return 0;
				2103
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2104	ret = join_running_log_trans(root);
				2105	if (ret)
				2106	return 0;
				2107
				2108	mutex_lock(&BTRFS_I(dir)->log_mutex);
				2109
				2110	log = root->log_root;
				2111	path = btrfs_alloc_path();
				2112	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
				2113	name, name_len, -1);
				2114	if (di && !IS_ERR(di)) {
				2115	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2116	bytes_del += name_len;
				2117	BUG_ON(ret);
				2118	}
				2119	btrfs_release_path(log, path);
				2120	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
				2121	index, name, name_len, -1);
				2122	if (di && !IS_ERR(di)) {
				2123	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2124	bytes_del += name_len;
				2125	BUG_ON(ret);
				2126	}
				2127
				2128	/* update the directory size in the log to reflect the names
				2129	* we have removed
				2130	*/
				2131	if (bytes_del) {
				2132	struct btrfs_key key;
				2133
				2134	key.objectid = dir->i_ino;
				2135	key.offset = 0;
				2136	key.type = BTRFS_INODE_ITEM_KEY;
				2137	btrfs_release_path(log, path);
				2138
				2139	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				2140	if (ret == 0) {
				2141	struct btrfs_inode_item *item;
				2142	u64 i_size;
				2143
				2144	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2145	struct btrfs_inode_item);
				2146	i_size = btrfs_inode_size(path->nodes[0], item);
				2147	if (i_size > bytes_del)
				2148	i_size -= bytes_del;
				2149	else
				2150	i_size = 0;
				2151	btrfs_set_inode_size(path->nodes[0], item, i_size);
				2152	btrfs_mark_buffer_dirty(path->nodes[0]);
				2153	} else
				2154	ret = 0;
				2155	btrfs_release_path(log, path);
				2156	}
				2157
				2158	btrfs_free_path(path);
				2159	mutex_unlock(&BTRFS_I(dir)->log_mutex);
				2160	end_log_trans(root);
				2161
				2162	return 0;
				2163	}
				2164
				2165	/* see comments for btrfs_del_dir_entries_in_log */
				2166	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				2167	struct btrfs_root *root,
				2168	const char *name, int name_len,
				2169	struct inode *inode, u64 dirid)
				2170	{
				2171	struct btrfs_root *log;
				2172	u64 index;
				2173	int ret;
				2174
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2175	if (BTRFS_I(inode)->logged_trans < trans->transid)
				2176	return 0;
				2177
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2178	ret = join_running_log_trans(root);
				2179	if (ret)
				2180	return 0;
				2181	log = root->log_root;
				2182	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2183
				2184	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
				2185	dirid, &index);
				2186	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2187	end_log_trans(root);
				2188
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2189	return ret;
				2190	}
				2191
				2192	/*
				2193	* creates a range item in the log for 'dirid'. first_offset and
				2194	* last_offset tell us which parts of the key space the log should
				2195	* be considered authoritative for.
				2196	*/
				2197	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				2198	struct btrfs_root *log,
				2199	struct btrfs_path *path,
				2200	int key_type, u64 dirid,
				2201	u64 first_offset, u64 last_offset)
				2202	{
				2203	int ret;
				2204	struct btrfs_key key;
				2205	struct btrfs_dir_log_item *item;
				2206
				2207	key.objectid = dirid;
				2208	key.offset = first_offset;
				2209	if (key_type == BTRFS_DIR_ITEM_KEY)
				2210	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				2211	else
				2212	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				2213	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				2214	BUG_ON(ret);
				2215
				2216	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2217	struct btrfs_dir_log_item);
				2218	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				2219	btrfs_mark_buffer_dirty(path->nodes[0]);
				2220	btrfs_release_path(log, path);
				2221	return 0;
				2222	}
				2223
				2224	/*
				2225	* log all the items included in the current transaction for a given
				2226	* directory. This also creates the range items in the log tree required
				2227	* to replay anything deleted before the fsync
				2228	*/
				2229	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				2230	struct btrfs_root root, struct inode inode,
				2231	struct btrfs_path *path,
				2232	struct btrfs_path *dst_path, int key_type,
				2233	u64 min_offset, u64 *last_offset_ret)
				2234	{
				2235	struct btrfs_key min_key;
				2236	struct btrfs_key max_key;
				2237	struct btrfs_root *log = root->log_root;
				2238	struct extent_buffer *src;
				2239	int ret;
				2240	int i;
				2241	int nritems;
				2242	u64 first_offset = min_offset;
				2243	u64 last_offset = (u64)-1;
				2244
				2245	log = root->log_root;
				2246	max_key.objectid = inode->i_ino;
				2247	max_key.offset = (u64)-1;
				2248	max_key.type = key_type;
				2249
				2250	min_key.objectid = inode->i_ino;
				2251	min_key.type = key_type;
				2252	min_key.offset = min_offset;
				2253
				2254	path->keep_locks = 1;
				2255
				2256	ret = btrfs_search_forward(root, &min_key, &max_key,
				2257	path, 0, trans->transid);
				2258
				2259	/*
				2260	* we didn't find anything from this transaction, see if there
				2261	* is anything at all
				2262	*/
				2263	if (ret != 0 \|\| min_key.objectid != inode->i_ino \|\|
				2264	min_key.type != key_type) {
				2265	min_key.objectid = inode->i_ino;
				2266	min_key.type = key_type;
				2267	min_key.offset = (u64)-1;
				2268	btrfs_release_path(root, path);
				2269	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2270	if (ret < 0) {
				2271	btrfs_release_path(root, path);
				2272	return ret;
				2273	}
				2274	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2275
				2276	/* if ret == 0 there are items for this type,
				2277	* create a range to tell us the last key of this type.
				2278	* otherwise, there are no items in this directory after
				2279	* *min_offset, and we create a range to indicate that.
				2280	*/
				2281	if (ret == 0) {
				2282	struct btrfs_key tmp;
				2283	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				2284	path->slots[0]);
				2285	if (key_type == tmp.type) {
				2286	first_offset = max(min_offset, tmp.offset) + 1;
				2287	}
				2288	}
				2289	goto done;
				2290	}
				2291
				2292	/* go backward to find any previous key */
				2293	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2294	if (ret == 0) {
				2295	struct btrfs_key tmp;
				2296	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2297	if (key_type == tmp.type) {
				2298	first_offset = tmp.offset;
				2299	ret = overwrite_item(trans, log, dst_path,
				2300	path->nodes[0], path->slots[0],
				2301	&tmp);
				2302	}
				2303	}
				2304	btrfs_release_path(root, path);
				2305
				2306	/* find the first key from this transaction again */
				2307	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2308	if (ret != 0) {
				2309	WARN_ON(1);
				2310	goto done;
				2311	}
				2312
				2313	/*
				2314	* we have a block from this transaction, log every item in it
				2315	* from our directory
				2316	*/
				2317	while(1) {
				2318	struct btrfs_key tmp;
				2319	src = path->nodes[0];
				2320	nritems = btrfs_header_nritems(src);
				2321	for (i = path->slots[0]; i < nritems; i++) {
				2322	btrfs_item_key_to_cpu(src, &min_key, i);
				2323
				2324	if (min_key.objectid != inode->i_ino \|\|
				2325	min_key.type != key_type)
				2326	goto done;
				2327	ret = overwrite_item(trans, log, dst_path, src, i,
				2328	&min_key);
				2329	BUG_ON(ret);
				2330	}
				2331	path->slots[0] = nritems;
				2332
				2333	/*
				2334	* look ahead to the next item and see if it is also
				2335	* from this directory and from this transaction
				2336	*/
				2337	ret = btrfs_next_leaf(root, path);
				2338	if (ret == 1) {
				2339	last_offset = (u64)-1;
				2340	goto done;
				2341	}
				2342	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2343	if (tmp.objectid != inode->i_ino \|\| tmp.type != key_type) {
				2344	last_offset = (u64)-1;
				2345	goto done;
				2346	}
				2347	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				2348	ret = overwrite_item(trans, log, dst_path,
				2349	path->nodes[0], path->slots[0],
				2350	&tmp);
				2351
				2352	BUG_ON(ret);
				2353	last_offset = tmp.offset;
				2354	goto done;
				2355	}
				2356	}
				2357	done:
				2358	*last_offset_ret = last_offset;
				2359	btrfs_release_path(root, path);
				2360	btrfs_release_path(log, dst_path);
				2361
				2362	/* insert the log range keys to indicate where the log is valid */
				2363	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				2364	first_offset, last_offset);
				2365	BUG_ON(ret);
				2366	return 0;
				2367	}
				2368
				2369	/*
				2370	* logging directories is very similar to logging inodes, We find all the items
				2371	* from the current transaction and write them to the log.
				2372	*
				2373	* The recovery code scans the directory in the subvolume, and if it finds a
				2374	* key in the range logged that is not present in the log tree, then it means
				2375	* that dir entry was unlinked during the transaction.
				2376	*
				2377	* In order for that scan to work, we must include one key smaller than
				2378	* the smallest logged by this transaction and one key larger than the largest
				2379	* key logged by this transaction.
				2380	*/
				2381	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				2382	struct btrfs_root root, struct inode inode,
				2383	struct btrfs_path *path,
				2384	struct btrfs_path *dst_path)
				2385	{
				2386	u64 min_key;
				2387	u64 max_key;
				2388	int ret;
				2389	int key_type = BTRFS_DIR_ITEM_KEY;
				2390
				2391	again:
				2392	min_key = 0;
				2393	max_key = 0;
				2394	while(1) {
				2395	ret = log_dir_items(trans, root, inode, path,
				2396	dst_path, key_type, min_key,
				2397	&max_key);
				2398	BUG_ON(ret);
				2399	if (max_key == (u64)-1)
				2400	break;
				2401	min_key = max_key + 1;
				2402	}
				2403
				2404	if (key_type == BTRFS_DIR_ITEM_KEY) {
				2405	key_type = BTRFS_DIR_INDEX_KEY;
				2406	goto again;
				2407	}
				2408	return 0;
				2409	}
				2410
				2411	/*
				2412	* a helper function to drop items from the log before we relog an
				2413	* inode. max_key_type indicates the highest item type to remove.
				2414	* This cannot be run for file data extents because it does not
				2415	* free the extents they point to.
				2416	*/
				2417	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				2418	struct btrfs_root *log,
				2419	struct btrfs_path *path,
				2420	u64 objectid, int max_key_type)
				2421	{
				2422	int ret;
				2423	struct btrfs_key key;
				2424	struct btrfs_key found_key;
				2425
				2426	key.objectid = objectid;
				2427	key.type = max_key_type;
				2428	key.offset = (u64)-1;
				2429
				2430	while(1) {
				2431	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				2432
				2433	if (ret != 1)
				2434	break;
				2435
				2436	if (path->slots[0] == 0)
				2437	break;
				2438
				2439	path->slots[0]--;
				2440	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2441	path->slots[0]);
				2442
				2443	if (found_key.objectid != objectid)
				2444	break;
				2445
				2446	ret = btrfs_del_item(trans, log, path);
				2447	BUG_ON(ret);
				2448	btrfs_release_path(log, path);
				2449	}
				2450	btrfs_release_path(log, path);
				2451	return 0;
				2452	}
				2453
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2454	static noinline int copy_extent_csums(struct btrfs_trans_handle *trans,
				2455	struct list_head *list,
				2456	struct btrfs_root *root,
				2457	u64 disk_bytenr, u64 len)
				2458	{
				2459	struct btrfs_ordered_sum *sums;
				2460	struct btrfs_sector_sum *sector_sum;
				2461	int ret;
				2462	struct btrfs_path *path;
				2463	struct btrfs_csum_item *item = NULL;
				2464	u64 end = disk_bytenr + len;
				2465	u64 item_start_offset = 0;
				2466	u64 item_last_offset = 0;
				2467	u32 diff;
				2468	u32 sum;
				2469	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
				2470
				2471	sums = kzalloc(btrfs_ordered_sum_size(root, len), GFP_NOFS);
				2472
				2473	sector_sum = sums->sums;
				2474	sums->bytenr = disk_bytenr;
				2475	sums->len = len;
				2476	list_add_tail(&sums->list, list);
				2477
				2478	path = btrfs_alloc_path();
				2479	while(disk_bytenr < end) {
				2480	if (!item \|\| disk_bytenr < item_start_offset \|\|
				2481	disk_bytenr >= item_last_offset) {
				2482	struct btrfs_key found_key;
				2483	u32 item_size;
				2484
				2485	if (item)
				2486	btrfs_release_path(root, path);
				2487	item = btrfs_lookup_csum(NULL, root, path,
				2488	disk_bytenr, 0);
				2489	if (IS_ERR(item)) {
				2490	ret = PTR_ERR(item);
				2491	if (ret == -ENOENT \|\| ret == -EFBIG)
				2492	ret = 0;
				2493	sum = 0;
				2494	printk("log no csum found for byte %llu\n",
				2495	(unsigned long long)disk_bytenr);
				2496	item = NULL;
				2497	btrfs_release_path(root, path);
				2498	goto found;
				2499	}
				2500	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2501	path->slots[0]);
				2502
				2503	item_start_offset = found_key.offset;
				2504	item_size = btrfs_item_size_nr(path->nodes[0],
				2505	path->slots[0]);
				2506	item_last_offset = item_start_offset +
				2507	(item_size / csum_size) *
				2508	root->sectorsize;
				2509	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2510	struct btrfs_csum_item);
				2511	}
				2512	/*
				2513	* this byte range must be able to fit inside
				2514	* a single leaf so it will also fit inside a u32
				2515	*/
				2516	diff = disk_bytenr - item_start_offset;
				2517	diff = diff / root->sectorsize;
				2518	diff = diff * csum_size;
				2519
				2520	read_extent_buffer(path->nodes[0], &sum,
				2521	((unsigned long)item) + diff,
				2522	csum_size);
				2523	found:
				2524	sector_sum->bytenr = disk_bytenr;
				2525	sector_sum->sum = sum;
				2526	disk_bytenr += root->sectorsize;
				2527	sector_sum++;
				2528	}
				2529	btrfs_free_path(path);
				2530	return 0;
				2531	}
				2532
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2533	static noinline int copy_items(struct btrfs_trans_handle *trans,
				2534	struct btrfs_root *log,
				2535	struct btrfs_path *dst_path,
				2536	struct extent_buffer *src,
				2537	int start_slot, int nr, int inode_only)
				2538	{
				2539	unsigned long src_offset;
				2540	unsigned long dst_offset;
				2541	struct btrfs_file_extent_item *extent;
				2542	struct btrfs_inode_item *inode_item;
				2543	int ret;
				2544	struct btrfs_key *ins_keys;
				2545	u32 *ins_sizes;
				2546	char *ins_data;
				2547	int i;
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2548	struct list_head ordered_sums;
				2549
				2550	INIT_LIST_HEAD(&ordered_sums);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2551
				2552	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				2553	nr * sizeof(u32), GFP_NOFS);
				2554	ins_sizes = (u32 *)ins_data;
				2555	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				2556
				2557	for (i = 0; i < nr; i++) {
				2558	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				2559	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				2560	}
				2561	ret = btrfs_insert_empty_items(trans, log, dst_path,
				2562	ins_keys, ins_sizes, nr);
				2563	BUG_ON(ret);
				2564
				2565	for (i = 0; i < nr; i++) {
				2566	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				2567	dst_path->slots[0]);
				2568
				2569	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				2570
				2571	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				2572	src_offset, ins_sizes[i]);
				2573
				2574	if (inode_only == LOG_INODE_EXISTS &&
				2575	ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				2576	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				2577	dst_path->slots[0],
				2578	struct btrfs_inode_item);
				2579	btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
				2580
				2581	/* set the generation to zero so the recover code
				2582	* can tell the difference between an logging
				2583	* just to say 'this inode exists' and a logging
				2584	* to say 'update this inode with these values'
				2585	*/
				2586	btrfs_set_inode_generation(dst_path->nodes[0],
				2587	inode_item, 0);
				2588	}
				2589	/* take a reference on file data extents so that truncates
				2590	* or deletes of this inode don't have to relog the inode
				2591	* again
				2592	*/
				2593	if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
				2594	int found_type;
				2595	extent = btrfs_item_ptr(src, start_slot + i,
				2596	struct btrfs_file_extent_item);
				2597
				2598	found_type = btrfs_file_extent_type(src, extent);
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	2599	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				2600	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2601	u64 ds = btrfs_file_extent_disk_bytenr(src,
				2602	extent);
				2603	u64 dl = btrfs_file_extent_disk_num_bytes(src,
				2604	extent);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2605	u64 cs = btrfs_file_extent_offset(src, extent);
				2606	u64 cl = btrfs_file_extent_num_bytes(src,
				2607	extent);;
Chris Mason	580afd7	2008-12-08 19:15:39 -0500	[diff] [blame]	2608	if (btrfs_file_extent_compression(src,
				2609	extent)) {
				2610	cs = 0;
				2611	cl = dl;
				2612	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2613	/* ds == 0 is a hole */
				2614	if (ds != 0) {
				2615	ret = btrfs_inc_extent_ref(trans, log,
				2616	ds, dl,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2617	dst_path->nodes[0]->start,
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2618	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2619	trans->transid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	2620	ins_keys[i].objectid);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2621	BUG_ON(ret);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2622	ret = copy_extent_csums(trans,
				2623	&ordered_sums,
				2624	log->fs_info->csum_root,
				2625	ds + cs, cl);
				2626	BUG_ON(ret);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2627	}
				2628	}
				2629	}
				2630	dst_path->slots[0]++;
				2631	}
				2632
				2633	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				2634	btrfs_release_path(log, dst_path);
				2635	kfree(ins_data);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2636
				2637	/*
				2638	* we have to do this after the loop above to avoid changing the
				2639	* log tree while trying to change the log tree.
				2640	*/
				2641	while(!list_empty(&ordered_sums)) {
				2642	struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
				2643	struct btrfs_ordered_sum,
				2644	list);
				2645	ret = btrfs_csum_file_blocks(trans, log, sums);
				2646	BUG_ON(ret);
				2647	list_del(&sums->list);
				2648	kfree(sums);
				2649	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2650	return 0;
				2651	}
				2652
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2653	/* log a single inode in the tree log.
				2654	* At least one parent directory for this inode must exist in the tree
				2655	* or be logged already.
				2656	*
				2657	* Any items from this inode changed by the current transaction are copied
				2658	* to the log tree. An extra reference is taken on any extents in this
				2659	* file, allowing us to avoid a whole pile of corner cases around logging
				2660	* blocks that have been removed from the tree.
				2661	*
				2662	* See LOG_INODE_ALL and related defines for a description of what inode_only
				2663	* does.
				2664	*
				2665	* This handles both files and directories.
				2666	*/
				2667	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				2668	struct btrfs_root root, struct inode inode,
				2669	int inode_only)
				2670	{
				2671	struct btrfs_path *path;
				2672	struct btrfs_path *dst_path;
				2673	struct btrfs_key min_key;
				2674	struct btrfs_key max_key;
				2675	struct btrfs_root *log = root->log_root;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2676	struct extent_buffer *src = NULL;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2677	u32 size;
				2678	int ret;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2679	int nritems;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2680	int ins_start_slot = 0;
				2681	int ins_nr;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2682
				2683	log = root->log_root;
				2684
				2685	path = btrfs_alloc_path();
				2686	dst_path = btrfs_alloc_path();
				2687
				2688	min_key.objectid = inode->i_ino;
				2689	min_key.type = BTRFS_INODE_ITEM_KEY;
				2690	min_key.offset = 0;
				2691
				2692	max_key.objectid = inode->i_ino;
				2693	if (inode_only == LOG_INODE_EXISTS \|\| S_ISDIR(inode->i_mode))
				2694	max_key.type = BTRFS_XATTR_ITEM_KEY;
				2695	else
				2696	max_key.type = (u8)-1;
				2697	max_key.offset = (u64)-1;
				2698
				2699	/*
				2700	* if this inode has already been logged and we're in inode_only
				2701	* mode, we don't want to delete the things that have already
				2702	* been written to the log.
				2703	*
				2704	* But, if the inode has been through an inode_only log,
				2705	* the logged_trans field is not set. This allows us to catch
				2706	* any new names for this inode in the backrefs by logging it
				2707	* again
				2708	*/
				2709	if (inode_only == LOG_INODE_EXISTS &&
				2710	BTRFS_I(inode)->logged_trans == trans->transid) {
				2711	btrfs_free_path(path);
				2712	btrfs_free_path(dst_path);
				2713	goto out;
				2714	}
				2715	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2716
				2717	/*
				2718	* a brute force approach to making sure we get the most uptodate
				2719	* copies of everything.
				2720	*/
				2721	if (S_ISDIR(inode->i_mode)) {
				2722	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2723
				2724	if (inode_only == LOG_INODE_EXISTS)
				2725	max_key_type = BTRFS_XATTR_ITEM_KEY;
				2726	ret = drop_objectid_items(trans, log, path,
				2727	inode->i_ino, max_key_type);
				2728	} else {
				2729	ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
				2730	}
				2731	BUG_ON(ret);
				2732	path->keep_locks = 1;
				2733
				2734	while(1) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2735	ins_nr = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2736	ret = btrfs_search_forward(root, &min_key, &max_key,
				2737	path, 0, trans->transid);
				2738	if (ret != 0)
				2739	break;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2740	again:
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2741	/* note, ins_nr might be > 0 here, cleanup outside the loop */
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2742	if (min_key.objectid != inode->i_ino)
				2743	break;
				2744	if (min_key.type > max_key.type)
				2745	break;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2746
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2747	src = path->nodes[0];
				2748	size = btrfs_item_size_nr(src, path->slots[0]);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2749	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				2750	ins_nr++;
				2751	goto next_slot;
				2752	} else if (!ins_nr) {
				2753	ins_start_slot = path->slots[0];
				2754	ins_nr = 1;
				2755	goto next_slot;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2756	}
				2757
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2758	ret = copy_items(trans, log, dst_path, src, ins_start_slot,
				2759	ins_nr, inode_only);
				2760	BUG_ON(ret);
				2761	ins_nr = 1;
				2762	ins_start_slot = path->slots[0];
				2763	next_slot:
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2764
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2765	nritems = btrfs_header_nritems(path->nodes[0]);
				2766	path->slots[0]++;
				2767	if (path->slots[0] < nritems) {
				2768	btrfs_item_key_to_cpu(path->nodes[0], &min_key,
				2769	path->slots[0]);
				2770	goto again;
				2771	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2772	if (ins_nr) {
				2773	ret = copy_items(trans, log, dst_path, src,
				2774	ins_start_slot,
				2775	ins_nr, inode_only);
				2776	BUG_ON(ret);
				2777	ins_nr = 0;
				2778	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2779	btrfs_release_path(root, path);
				2780
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2781	if (min_key.offset < (u64)-1)
				2782	min_key.offset++;
				2783	else if (min_key.type < (u8)-1)
				2784	min_key.type++;
				2785	else if (min_key.objectid < (u64)-1)
				2786	min_key.objectid++;
				2787	else
				2788	break;
				2789	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2790	if (ins_nr) {
				2791	ret = copy_items(trans, log, dst_path, src,
				2792	ins_start_slot,
				2793	ins_nr, inode_only);
				2794	BUG_ON(ret);
				2795	ins_nr = 0;
				2796	}
				2797	WARN_ON(ins_nr);
Chris Mason	9623f9a	2008-09-11 17:42:42 -0400	[diff] [blame]	2798	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2799	btrfs_release_path(root, path);
				2800	btrfs_release_path(log, dst_path);
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	2801	BTRFS_I(inode)->log_dirty_trans = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2802	ret = log_directory_changes(trans, root, inode, path, dst_path);
				2803	BUG_ON(ret);
				2804	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2805	BTRFS_I(inode)->logged_trans = trans->transid;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2806	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2807
				2808	btrfs_free_path(path);
				2809	btrfs_free_path(dst_path);
				2810
				2811	mutex_lock(&root->fs_info->tree_log_mutex);
				2812	ret = update_log_root(trans, log);
				2813	BUG_ON(ret);
				2814	mutex_unlock(&root->fs_info->tree_log_mutex);
				2815	out:
				2816	return 0;
				2817	}
				2818
				2819	int btrfs_log_inode(struct btrfs_trans_handle *trans,
				2820	struct btrfs_root root, struct inode inode,
				2821	int inode_only)
				2822	{
				2823	int ret;
				2824
				2825	start_log_trans(trans, root);
				2826	ret = __btrfs_log_inode(trans, root, inode, inode_only);
				2827	end_log_trans(root);
				2828	return ret;
				2829	}
				2830
				2831	/*
				2832	* helper function around btrfs_log_inode to make sure newly created
				2833	* parent directories also end up in the log. A minimal inode and backref
				2834	* only logging is done of any parent directories that are older than
				2835	* the last committed transaction
				2836	*/
				2837	int btrfs_log_dentry(struct btrfs_trans_handle *trans,
				2838	struct btrfs_root root, struct dentry dentry)
				2839	{
				2840	int inode_only = LOG_INODE_ALL;
				2841	struct super_block *sb;
				2842	int ret;
				2843
				2844	start_log_trans(trans, root);
				2845	sb = dentry->d_inode->i_sb;
				2846	while(1) {
				2847	ret = __btrfs_log_inode(trans, root, dentry->d_inode,
				2848	inode_only);
				2849	BUG_ON(ret);
				2850	inode_only = LOG_INODE_EXISTS;
				2851
				2852	dentry = dentry->d_parent;
				2853	if (!dentry \|\| !dentry->d_inode \|\| sb != dentry->d_inode->i_sb)
				2854	break;
				2855
				2856	if (BTRFS_I(dentry->d_inode)->generation <=
				2857	root->fs_info->last_trans_committed)
				2858	break;
				2859	}
				2860	end_log_trans(root);
				2861	return 0;
				2862	}
				2863
				2864	/*
				2865	* it is not safe to log dentry if the chunk root has added new
				2866	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				2867	* If this returns 1, you must commit the transaction to safely get your
				2868	* data on disk.
				2869	*/
				2870	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				2871	struct btrfs_root root, struct dentry dentry)
				2872	{
				2873	u64 gen;
				2874	gen = root->fs_info->last_trans_new_blockgroup;
				2875	if (gen > root->fs_info->last_trans_committed)
				2876	return 1;
				2877	else
				2878	return btrfs_log_dentry(trans, root, dentry);
				2879	}
				2880
				2881	/*
				2882	* should be called during mount to recover any replay any log trees
				2883	* from the FS
				2884	*/
				2885	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				2886	{
				2887	int ret;
				2888	struct btrfs_path *path;
				2889	struct btrfs_trans_handle *trans;
				2890	struct btrfs_key key;
				2891	struct btrfs_key found_key;
				2892	struct btrfs_key tmp_key;
				2893	struct btrfs_root *log;
				2894	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2895	u64 highest_inode;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2896	struct walk_control wc = {
				2897	.process_func = process_one_buffer,
				2898	.stage = 0,
				2899	};
				2900
				2901	fs_info->log_root_recovering = 1;
				2902	path = btrfs_alloc_path();
				2903	BUG_ON(!path);
				2904
				2905	trans = btrfs_start_transaction(fs_info->tree_root, 1);
				2906
				2907	wc.trans = trans;
				2908	wc.pin = 1;
				2909
				2910	walk_log_tree(trans, log_root_tree, &wc);
				2911
				2912	again:
				2913	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				2914	key.offset = (u64)-1;
				2915	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				2916
				2917	while(1) {
				2918	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				2919	if (ret < 0)
				2920	break;
				2921	if (ret > 0) {
				2922	if (path->slots[0] == 0)
				2923	break;
				2924	path->slots[0]--;
				2925	}
				2926	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2927	path->slots[0]);
				2928	btrfs_release_path(log_root_tree, path);
				2929	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				2930	break;
				2931
				2932	log = btrfs_read_fs_root_no_radix(log_root_tree,
				2933	&found_key);
				2934	BUG_ON(!log);
				2935
				2936
				2937	tmp_key.objectid = found_key.offset;
				2938	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				2939	tmp_key.offset = (u64)-1;
				2940
				2941	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
				2942
				2943	BUG_ON(!wc.replay_dest);
				2944
				2945	btrfs_record_root_in_trans(wc.replay_dest);
				2946	ret = walk_log_tree(trans, log, &wc);
				2947	BUG_ON(ret);
				2948
				2949	if (wc.stage == LOG_WALK_REPLAY_ALL) {
				2950	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				2951	path);
				2952	BUG_ON(ret);
				2953	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2954	ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
				2955	if (ret == 0) {
				2956	wc.replay_dest->highest_inode = highest_inode;
				2957	wc.replay_dest->last_inode_alloc = highest_inode;
				2958	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2959
				2960	key.offset = found_key.offset - 1;
				2961	free_extent_buffer(log->node);
				2962	kfree(log);
				2963
				2964	if (found_key.offset == 0)
				2965	break;
				2966	}
				2967	btrfs_release_path(log_root_tree, path);
				2968
				2969	/* step one is to pin it all, step two is to replay just inodes */
				2970	if (wc.pin) {
				2971	wc.pin = 0;
				2972	wc.process_func = replay_one_buffer;
				2973	wc.stage = LOG_WALK_REPLAY_INODES;
				2974	goto again;
				2975	}
				2976	/* step three is to replay everything */
				2977	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				2978	wc.stage++;
				2979	goto again;
				2980	}
				2981
				2982	btrfs_free_path(path);
				2983
				2984	free_extent_buffer(log_root_tree->node);
				2985	log_root_tree->log_root = NULL;
				2986	fs_info->log_root_recovering = 0;
				2987
				2988	/* step 4: commit the transaction, which also unpins the blocks */
				2989	btrfs_commit_transaction(trans, fs_info->tree_root);
				2990
				2991	kfree(log_root_tree);
				2992	return 0;
				2993	}