Btrfs: Fix streaming read performance with checksumming on

Large streaming reads make for large bios, which means each entry on the
async work queue lists represents a large amount of data.  IO
congestion throttling on the device was kicking in before the async
worker threads decided a single thread was busy and needed some help.

The end result was that a streaming read would peg a single CPU at
100% instead of balancing the work off to other CPUs.
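
To illustrate the problem, here is a hypothetical sketch (every name
below is made up; this is not the fs/btrfs/async-thread.c interface) of
a busy check that only counts queued entries.  With large streaming-read
bios the entry count stays small even though the amount of queued IO is
huge, so no helper thread is ever started:

    /*
     * Hypothetical sketch only: if "busy" is judged purely by the
     * number of queued entries, a handful of very large bios never
     * crosses the threshold, and one CPU does all of the work.
     */
    struct sketch_worker {
        int num_pending;    /* queued entries, not bytes of IO */
        int idle_thresh;    /* entries allowed before asking for help */
    };

    static int sketch_needs_help(struct sketch_worker *w)
    {
        /* a 128KB bio and a 4KB bio each count as one entry here */
        return w->num_pending > w->idle_thresh;
    }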

This patch also changes the pre-IO checksum lookup done by reads to
work on a per-bio basis instead of per-page.  The per-page lookups
meant many extra btree searches on large streaming reads.  Doing the
checksum lookup right before bio submit allows us to reuse searches
while processing adjacent offsets.
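
The body of btrfs_lookup_bio_sums is not part of this hunk; the sketch
below shows roughly how a per-bio lookup can hold on to a single
btrfs_path and only drop back into the btree when the cached csum item
no longer covers the current offset, so adjacent pages reuse the same
search.  Treat it as a simplified illustration rather than the code
that ships; error reporting, readahead tuning, and some corner cases
are left out.

    int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
                              struct bio *bio)
    {
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct btrfs_csum_item *item = NULL;
        struct btrfs_path *path = btrfs_alloc_path();
        struct bio_vec *bvec = bio->bi_io_vec;
        u64 item_start = 0;
        u64 item_end = 0;
        int bio_index = 0;
        u32 sum;

        while (bio_index < bio->bi_vcnt) {
            u64 offset = page_offset(bvec->bv_page) + bvec->bv_offset;

            /* an unfinished ordered extent may still have the sum cached */
            if (btrfs_find_ordered_sum(inode, offset, &sum) == 0)
                goto found;

            /* only search the btree when the cached item runs out */
            if (!item || offset < item_start || offset >= item_end) {
                struct btrfs_key key;
                u32 size;

                if (item)
                    btrfs_release_path(root, path);
                item = btrfs_lookup_csum(NULL, root, path,
                                         inode->i_ino, offset, 0);
                if (IS_ERR(item)) {
                    /* treat a missing csum as zero, like the old hook did */
                    sum = 0;
                    item = NULL;
                    goto found;
                }
                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
                size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
                item_start = key.offset;
                item_end = item_start +
                           (size / BTRFS_CRC32_SIZE) * root->sectorsize;
                item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                      struct btrfs_csum_item);
            }
            /* index into the cached item for this page's offset */
            read_extent_buffer(path->nodes[0], &sum, (unsigned long)item +
                               ((offset - item_start) / root->sectorsize) *
                               BTRFS_CRC32_SIZE, BTRFS_CRC32_SIZE);
    found:
            set_state_private(io_tree, offset, sum);
            bio_index++;
            bvec++;
        }
        btrfs_free_path(path);
        return 0;
    }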

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c4afa9d..31d52c5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -374,6 +374,10 @@
 	BUG_ON(ret);
 
 	if (!(rw & (1 << BIO_RW))) {
+		if (!btrfs_test_opt(root, NODATASUM) &&
+		    !btrfs_test_flag(inode, NODATASUM)) {
+			btrfs_lookup_bio_sums(root, inode, bio);
+		}
 		goto mapit;
 	}
 
@@ -598,58 +602,6 @@
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
-int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
-{
-	int ret = 0;
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct btrfs_csum_item *item;
-	struct btrfs_path *path = NULL;
-	u32 csum;
-
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
-		return 0;
-
-	/*
-	 * It is possible there is an ordered extent that has
-	 * not yet finished for this range in the file.  If so,
-	 * that extent will have a csum cached, and it will insert
-	 * the sum after all the blocks in the extent are fully
-	 * on disk.  So, look for an ordered extent and use the
-	 * sum if found.  We have to do this before looking in the
-	 * btree because csum items are pre-inserted based on
-	 * the file size.  btrfs_lookup_csum might find an item
-	 * that still hasn't been fully filled.
-	 */
-	ret = btrfs_find_ordered_sum(inode, start, &csum);
-	if (ret == 0)
-		goto found;
-
-	ret = 0;
-	path = btrfs_alloc_path();
-	item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
-	if (IS_ERR(item)) {
-		ret = PTR_ERR(item);
-		/* a csum that isn't present is a preallocated region. */
-		if (ret == -ENOENT || ret == -EFBIG)
-			ret = 0;
-		csum = 0;
-		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
-		       start);
-		goto out;
-	}
-	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
-			   BTRFS_CRC32_SIZE);
-found:
-	set_state_private(io_tree, start, csum);
-out:
-	if (path)
-		btrfs_free_path(path);
-	return ret;
-}
-
 struct io_failure_record {
 	struct page *page;
 	u64 start;
@@ -3613,7 +3565,6 @@
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.merge_bio_hook = btrfs_merge_bio_hook,
-	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,