Blame - fs/btrfs/raid56.c - android_kernel_oneplus_msm8996

blob: e34e568534d930dac561a24ed4a48db282b4c221 [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 021110-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
				34	#include <asm/div64.h>
				35	#include "compat.h"
				36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	50	/*
				51	* set when this rbio is sitting in the hash, but it is just a cache
				52	* of past RMW
				53	*/
				54	#define RBIO_CACHE_BIT 2
				55
				56	/*
				57	* set when it is safe to trust the stripe_pages for caching
				58	*/
				59	#define RBIO_CACHE_READY_BIT 3
				60
				61
				62	#define RBIO_CACHE_SIZE 1024
				63
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	64	struct btrfs_raid_bio {
				65	struct btrfs_fs_info *fs_info;
				66	struct btrfs_bio *bbio;
				67
				68	/*
				69	* logical block numbers for the start of each stripe
				70	* The last one or two are p/q. These are sorted,
				71	* so raid_map[0] is the start of our full stripe
				72	*/
				73	u64 *raid_map;
				74
				75	/* while we're doing rmw on a stripe
				76	* we put it into a hash table so we can
				77	* lock the stripe and merge more rbios
				78	* into it.
				79	*/
				80	struct list_head hash_list;
				81
				82	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	83	* LRU list for the stripe cache
				84	*/
				85	struct list_head stripe_cache;
				86
				87	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	88	* for scheduling work in the helper threads
				89	*/
				90	struct btrfs_work work;
				91
				92	/*
				93	* bio list and bio_list_lock are used
				94	* to add more bios into the stripe
				95	* in hopes of avoiding the full rmw
				96	*/
				97	struct bio_list bio_list;
				98	spinlock_t bio_list_lock;
				99
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame^]	100	/* also protected by the bio_list_lock, the
				101	* plug list is used by the plugging code
				102	* to collect partial bios while plugged. The
				103	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	104	* the stripe lock to the next pending IO
				105	*/
				106	struct list_head plug_list;
				107
				108	/*
				109	* flags that tell us if it is safe to
				110	* merge with this bio
				111	*/
				112	unsigned long flags;
				113
				114	/* size of each individual stripe on disk */
				115	int stripe_len;
				116
				117	/* number of data stripes (no p/q) */
				118	int nr_data;
				119
				120	/*
				121	* set if we're doing a parity rebuild
				122	* for a read from higher up, which is handled
				123	* differently from a parity rebuild as part of
				124	* rmw
				125	*/
				126	int read_rebuild;
				127
				128	/* first bad stripe */
				129	int faila;
				130
				131	/* second bad stripe (for raid6 use) */
				132	int failb;
				133
				134	/*
				135	* number of pages needed to represent the full
				136	* stripe
				137	*/
				138	int nr_pages;
				139
				140	/*
				141	* size of all the bios in the bio_list. This
				142	* helps us decide if the rbio maps to a full
				143	* stripe or not
				144	*/
				145	int bio_list_bytes;
				146
				147	atomic_t refs;
				148
				149	/*
				150	* these are two arrays of pointers. We allocate the
				151	* rbio big enough to hold them both and setup their
				152	* locations when the rbio is allocated
				153	*/
				154
				155	/* pointers to pages that we allocated for
				156	* reading/writing stripes directly from the disk (including P/Q)
				157	*/
				158	struct page **stripe_pages;
				159
				160	/*
				161	* pointers to the pages in the bio_list. Stored
				162	* here for faster lookup
				163	*/
				164	struct page **bio_pages;
				165	};
				166
				167	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				168	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				169	static void rmw_work(struct btrfs_work *work);
				170	static void read_rebuild_work(struct btrfs_work *work);
				171	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				172	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				173	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				174	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				175	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				176	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				177	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				178
				179	/*
				180	* the stripe hash table is used for locking, and to collect
				181	* bios in hopes of making a full stripe
				182	*/
				183	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				184	{
				185	struct btrfs_stripe_hash_table *table;
				186	struct btrfs_stripe_hash_table *x;
				187	struct btrfs_stripe_hash *cur;
				188	struct btrfs_stripe_hash *h;
				189	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				190	int i;
				191
				192	if (info->stripe_hash_table)
				193	return 0;
				194
				195	table = kzalloc(sizeof(table) + sizeof(h) * num_entries, GFP_NOFS);
				196	if (!table)
				197	return -ENOMEM;
				198
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	199	spin_lock_init(&table->cache_lock);
				200	INIT_LIST_HEAD(&table->stripe_cache);
				201
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	202	h = table->table;
				203
				204	for (i = 0; i < num_entries; i++) {
				205	cur = h + i;
				206	INIT_LIST_HEAD(&cur->hash_list);
				207	spin_lock_init(&cur->lock);
				208	init_waitqueue_head(&cur->wait);
				209	}
				210
				211	x = cmpxchg(&info->stripe_hash_table, NULL, table);
				212	if (x)
				213	kfree(x);
				214	return 0;
				215	}
				216
				217	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	218	* caching an rbio means to copy anything from the
				219	* bio_pages array into the stripe_pages array. We
				220	* use the page uptodate bit in the stripe cache array
				221	* to indicate if it has valid data
				222	*
				223	* once the caching is done, we set the cache ready
				224	* bit.
				225	*/
				226	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				227	{
				228	int i;
				229	char *s;
				230	char *d;
				231	int ret;
				232
				233	ret = alloc_rbio_pages(rbio);
				234	if (ret)
				235	return;
				236
				237	for (i = 0; i < rbio->nr_pages; i++) {
				238	if (!rbio->bio_pages[i])
				239	continue;
				240
				241	s = kmap(rbio->bio_pages[i]);
				242	d = kmap(rbio->stripe_pages[i]);
				243
				244	memcpy(d, s, PAGE_CACHE_SIZE);
				245
				246	kunmap(rbio->bio_pages[i]);
				247	kunmap(rbio->stripe_pages[i]);
				248	SetPageUptodate(rbio->stripe_pages[i]);
				249	}
				250	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				251	}
				252
				253	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	254	* we hash on the first logical address of the stripe
				255	*/
				256	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				257	{
				258	u64 num = rbio->raid_map[0];
				259
				260	/*
				261	* we shift down quite a bit. We're using byte
				262	* addressing, and most of the lower bits are zeros.
				263	* This tends to upset hash_64, and it consistently
				264	* returns just one or two different values.
				265	*
				266	* shifting off the lower bits fixes things.
				267	*/
				268	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				269	}
				270
				271	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	272	* stealing an rbio means taking all the uptodate pages from the stripe
				273	* array in the source rbio and putting them into the destination rbio
				274	*/
				275	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				276	{
				277	int i;
				278	struct page *s;
				279	struct page *d;
				280
				281	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				282	return;
				283
				284	for (i = 0; i < dest->nr_pages; i++) {
				285	s = src->stripe_pages[i];
				286	if (!s \|\| !PageUptodate(s)) {
				287	continue;
				288	}
				289
				290	d = dest->stripe_pages[i];
				291	if (d)
				292	__free_page(d);
				293
				294	dest->stripe_pages[i] = s;
				295	src->stripe_pages[i] = NULL;
				296	}
				297	}
				298
				299	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	300	* merging means we take the bio_list from the victim and
				301	* splice it into the destination. The victim should
				302	* be discarded afterwards.
				303	*
				304	* must be called with dest->rbio_list_lock held
				305	*/
				306	static void merge_rbio(struct btrfs_raid_bio *dest,
				307	struct btrfs_raid_bio *victim)
				308	{
				309	bio_list_merge(&dest->bio_list, &victim->bio_list);
				310	dest->bio_list_bytes += victim->bio_list_bytes;
				311	bio_list_init(&victim->bio_list);
				312	}
				313
				314	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	315	* used to prune items that are in the cache. The caller
				316	* must hold the hash table lock.
				317	*/
				318	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				319	{
				320	int bucket = rbio_bucket(rbio);
				321	struct btrfs_stripe_hash_table *table;
				322	struct btrfs_stripe_hash *h;
				323	int freeit = 0;
				324
				325	/*
				326	* check the bit again under the hash table lock.
				327	*/
				328	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				329	return;
				330
				331	table = rbio->fs_info->stripe_hash_table;
				332	h = table->table + bucket;
				333
				334	/* hold the lock for the bucket because we may be
				335	* removing it from the hash table
				336	*/
				337	spin_lock(&h->lock);
				338
				339	/*
				340	* hold the lock for the bio list because we need
				341	* to make sure the bio list is empty
				342	*/
				343	spin_lock(&rbio->bio_list_lock);
				344
				345	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				346	list_del_init(&rbio->stripe_cache);
				347	table->cache_size -= 1;
				348	freeit = 1;
				349
				350	/* if the bio list isn't empty, this rbio is
				351	* still involved in an IO. We take it out
				352	* of the cache list, and drop the ref that
				353	* was held for the list.
				354	*
				355	* If the bio_list was empty, we also remove
				356	* the rbio from the hash_table, and drop
				357	* the corresponding ref
				358	*/
				359	if (bio_list_empty(&rbio->bio_list)) {
				360	if (!list_empty(&rbio->hash_list)) {
				361	list_del_init(&rbio->hash_list);
				362	atomic_dec(&rbio->refs);
				363	BUG_ON(!list_empty(&rbio->plug_list));
				364	}
				365	}
				366	}
				367
				368	spin_unlock(&rbio->bio_list_lock);
				369	spin_unlock(&h->lock);
				370
				371	if (freeit)
				372	__free_raid_bio(rbio);
				373	}
				374
				375	/*
				376	* prune a given rbio from the cache
				377	*/
				378	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				379	{
				380	struct btrfs_stripe_hash_table *table;
				381	unsigned long flags;
				382
				383	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				384	return;
				385
				386	table = rbio->fs_info->stripe_hash_table;
				387
				388	spin_lock_irqsave(&table->cache_lock, flags);
				389	__remove_rbio_from_cache(rbio);
				390	spin_unlock_irqrestore(&table->cache_lock, flags);
				391	}
				392
				393	/*
				394	* remove everything in the cache
				395	*/
				396	void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
				397	{
				398	struct btrfs_stripe_hash_table *table;
				399	unsigned long flags;
				400	struct btrfs_raid_bio *rbio;
				401
				402	table = info->stripe_hash_table;
				403
				404	spin_lock_irqsave(&table->cache_lock, flags);
				405	while (!list_empty(&table->stripe_cache)) {
				406	rbio = list_entry(table->stripe_cache.next,
				407	struct btrfs_raid_bio,
				408	stripe_cache);
				409	__remove_rbio_from_cache(rbio);
				410	}
				411	spin_unlock_irqrestore(&table->cache_lock, flags);
				412	}
				413
				414	/*
				415	* remove all cached entries and free the hash table
				416	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	417	*/
				418	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				419	{
				420	if (!info->stripe_hash_table)
				421	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	422	btrfs_clear_rbio_cache(info);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	423	kfree(info->stripe_hash_table);
				424	info->stripe_hash_table = NULL;
				425	}
				426
				427	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	428	* insert an rbio into the stripe cache. It
				429	* must have already been prepared by calling
				430	* cache_rbio_pages
				431	*
				432	* If this rbio was already cached, it gets
				433	* moved to the front of the lru.
				434	*
				435	* If the size of the rbio cache is too big, we
				436	* prune an item.
				437	*/
				438	static void cache_rbio(struct btrfs_raid_bio *rbio)
				439	{
				440	struct btrfs_stripe_hash_table *table;
				441	unsigned long flags;
				442
				443	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				444	return;
				445
				446	table = rbio->fs_info->stripe_hash_table;
				447
				448	spin_lock_irqsave(&table->cache_lock, flags);
				449	spin_lock(&rbio->bio_list_lock);
				450
				451	/* bump our ref if we were not in the list before */
				452	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
				453	atomic_inc(&rbio->refs);
				454
				455	if (!list_empty(&rbio->stripe_cache)){
				456	list_move(&rbio->stripe_cache, &table->stripe_cache);
				457	} else {
				458	list_add(&rbio->stripe_cache, &table->stripe_cache);
				459	table->cache_size += 1;
				460	}
				461
				462	spin_unlock(&rbio->bio_list_lock);
				463
				464	if (table->cache_size > RBIO_CACHE_SIZE) {
				465	struct btrfs_raid_bio *found;
				466
				467	found = list_entry(table->stripe_cache.prev,
				468	struct btrfs_raid_bio,
				469	stripe_cache);
				470
				471	if (found != rbio)
				472	__remove_rbio_from_cache(found);
				473	}
				474
				475	spin_unlock_irqrestore(&table->cache_lock, flags);
				476	return;
				477	}
				478
				479	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	480	* helper function to run the xor_blocks api. It is only
				481	* able to do MAX_XOR_BLOCKS at a time, so we need to
				482	* loop through.
				483	*/
				484	static void run_xor(void **pages, int src_cnt, ssize_t len)
				485	{
				486	int src_off = 0;
				487	int xor_src_cnt = 0;
				488	void *dest = pages[src_cnt];
				489
				490	while(src_cnt > 0) {
				491	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				492	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				493
				494	src_cnt -= xor_src_cnt;
				495	src_off += xor_src_cnt;
				496	}
				497	}
				498
				499	/*
				500	* returns true if the bio list inside this rbio
				501	* covers an entire stripe (no rmw required).
				502	* Must be called with the bio list lock held, or
				503	* at a time when you know it is impossible to add
				504	* new bios into the list
				505	*/
				506	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				507	{
				508	unsigned long size = rbio->bio_list_bytes;
				509	int ret = 1;
				510
				511	if (size != rbio->nr_data * rbio->stripe_len)
				512	ret = 0;
				513
				514	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				515	return ret;
				516	}
				517
				518	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				519	{
				520	unsigned long flags;
				521	int ret;
				522
				523	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				524	ret = __rbio_is_full(rbio);
				525	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				526	return ret;
				527	}
				528
				529	/*
				530	* returns 1 if it is safe to merge two rbios together.
				531	* The merging is safe if the two rbios correspond to
				532	* the same stripe and if they are both going in the same
				533	* direction (read vs write), and if neither one is
				534	* locked for final IO
				535	*
				536	* The caller is responsible for locking such that
				537	* rmw_locked is safe to test
				538	*/
				539	static int rbio_can_merge(struct btrfs_raid_bio *last,
				540	struct btrfs_raid_bio *cur)
				541	{
				542	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				543	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				544	return 0;
				545
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	546	/*
				547	* we can't merge with cached rbios, since the
				548	* idea is that when we merge the destination
				549	* rbio is going to run our IO for us. We can
				550	* steal from cached rbio's though, other functions
				551	* handle that.
				552	*/
				553	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				554	test_bit(RBIO_CACHE_BIT, &cur->flags))
				555	return 0;
				556
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	557	if (last->raid_map[0] !=
				558	cur->raid_map[0])
				559	return 0;
				560
				561	/* reads can't merge with writes */
				562	if (last->read_rebuild !=
				563	cur->read_rebuild) {
				564	return 0;
				565	}
				566
				567	return 1;
				568	}
				569
				570	/*
				571	* helper to index into the pstripe
				572	*/
				573	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				574	{
				575	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				576	return rbio->stripe_pages[index];
				577	}
				578
				579	/*
				580	* helper to index into the qstripe, returns null
				581	* if there is no qstripe
				582	*/
				583	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				584	{
				585	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
				586	return NULL;
				587
				588	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				589	PAGE_CACHE_SHIFT;
				590	return rbio->stripe_pages[index];
				591	}
				592
				593	/*
				594	* The first stripe in the table for a logical address
				595	* has the lock. rbios are added in one of three ways:
				596	*
				597	* 1) Nobody has the stripe locked yet. The rbio is given
				598	* the lock and 0 is returned. The caller must start the IO
				599	* themselves.
				600	*
				601	* 2) Someone has the stripe locked, but we're able to merge
				602	* with the lock owner. The rbio is freed and the IO will
				603	* start automatically along with the existing rbio. 1 is returned.
				604	*
				605	* 3) Someone has the stripe locked, but we're not able to merge.
				606	* The rbio is added to the lock owner's plug list, or merged into
				607	* an rbio already on the plug list. When the lock owner unlocks,
				608	* the next rbio on the list is run and the IO is started automatically.
				609	* 1 is returned
				610	*
				611	* If we return 0, the caller still owns the rbio and must continue with
				612	* IO submission. If we return 1, the caller must assume the rbio has
				613	* already been freed.
				614	*/
				615	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				616	{
				617	int bucket = rbio_bucket(rbio);
				618	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				619	struct btrfs_raid_bio *cur;
				620	struct btrfs_raid_bio *pending;
				621	unsigned long flags;
				622	DEFINE_WAIT(wait);
				623	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	624	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	625	int ret = 0;
				626	int walk = 0;
				627
				628	spin_lock_irqsave(&h->lock, flags);
				629	list_for_each_entry(cur, &h->hash_list, hash_list) {
				630	walk++;
				631	if (cur->raid_map[0] == rbio->raid_map[0]) {
				632	spin_lock(&cur->bio_list_lock);
				633
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	634	/* can we steal this cached rbio's pages? */
				635	if (bio_list_empty(&cur->bio_list) &&
				636	list_empty(&cur->plug_list) &&
				637	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				638	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				639	list_del_init(&cur->hash_list);
				640	atomic_dec(&cur->refs);
				641
				642	steal_rbio(cur, rbio);
				643	cache_drop = cur;
				644	spin_unlock(&cur->bio_list_lock);
				645
				646	goto lockit;
				647	}
				648
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	649	/* can we merge into the lock owner? */
				650	if (rbio_can_merge(cur, rbio)) {
				651	merge_rbio(cur, rbio);
				652	spin_unlock(&cur->bio_list_lock);
				653	freeit = rbio;
				654	ret = 1;
				655	goto out;
				656	}
				657
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	658
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	659	/*
				660	* we couldn't merge with the running
				661	* rbio, see if we can merge with the
				662	* pending ones. We don't have to
				663	* check for rmw_locked because there
				664	* is no way they are inside finish_rmw
				665	* right now
				666	*/
				667	list_for_each_entry(pending, &cur->plug_list,
				668	plug_list) {
				669	if (rbio_can_merge(pending, rbio)) {
				670	merge_rbio(pending, rbio);
				671	spin_unlock(&cur->bio_list_lock);
				672	freeit = rbio;
				673	ret = 1;
				674	goto out;
				675	}
				676	}
				677
				678	/* no merging, put us on the tail of the plug list,
				679	* our rbio will be started with the currently
				680	* running rbio unlocks
				681	*/
				682	list_add_tail(&rbio->plug_list, &cur->plug_list);
				683	spin_unlock(&cur->bio_list_lock);
				684	ret = 1;
				685	goto out;
				686	}
				687	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	688	lockit:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	689	atomic_inc(&rbio->refs);
				690	list_add(&rbio->hash_list, &h->hash_list);
				691	out:
				692	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	693	if (cache_drop)
				694	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	695	if (freeit)
				696	__free_raid_bio(freeit);
				697	return ret;
				698	}
				699
				700	/*
				701	* called as rmw or parity rebuild is completed. If the plug list has more
				702	* rbios waiting for this stripe, the next one on the list will be started
				703	*/
				704	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				705	{
				706	int bucket;
				707	struct btrfs_stripe_hash *h;
				708	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	709	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	710
				711	bucket = rbio_bucket(rbio);
				712	h = rbio->fs_info->stripe_hash_table->table + bucket;
				713
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	714	if (list_empty(&rbio->plug_list))
				715	cache_rbio(rbio);
				716
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	717	spin_lock_irqsave(&h->lock, flags);
				718	spin_lock(&rbio->bio_list_lock);
				719
				720	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	721	/*
				722	* if we're still cached and there is no other IO
				723	* to perform, just leave this rbio here for others
				724	* to steal from later
				725	*/
				726	if (list_empty(&rbio->plug_list) &&
				727	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				728	keep_cache = 1;
				729	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				730	BUG_ON(!bio_list_empty(&rbio->bio_list));
				731	goto done;
				732	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	733
				734	list_del_init(&rbio->hash_list);
				735	atomic_dec(&rbio->refs);
				736
				737	/*
				738	* we use the plug list to hold all the rbios
				739	* waiting for the chance to lock this stripe.
				740	* hand the lock over to one of them.
				741	*/
				742	if (!list_empty(&rbio->plug_list)) {
				743	struct btrfs_raid_bio *next;
				744	struct list_head *head = rbio->plug_list.next;
				745
				746	next = list_entry(head, struct btrfs_raid_bio,
				747	plug_list);
				748
				749	list_del_init(&rbio->plug_list);
				750
				751	list_add(&next->hash_list, &h->hash_list);
				752	atomic_inc(&next->refs);
				753	spin_unlock(&rbio->bio_list_lock);
				754	spin_unlock_irqrestore(&h->lock, flags);
				755
				756	if (next->read_rebuild)
				757	async_read_rebuild(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	758	else {
				759	steal_rbio(rbio, next);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	760	async_rmw_stripe(next);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	761	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	762
				763	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	764	} else if (waitqueue_active(&h->wait)) {
				765	spin_unlock(&rbio->bio_list_lock);
				766	spin_unlock_irqrestore(&h->lock, flags);
				767	wake_up(&h->wait);
				768	goto done_nolock;
				769	}
				770	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	771	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	772	spin_unlock(&rbio->bio_list_lock);
				773	spin_unlock_irqrestore(&h->lock, flags);
				774
				775	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	776	if (!keep_cache)
				777	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	778	}
				779
				780	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				781	{
				782	int i;
				783
				784	WARN_ON(atomic_read(&rbio->refs) < 0);
				785	if (!atomic_dec_and_test(&rbio->refs))
				786	return;
				787
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	788	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	789	WARN_ON(!list_empty(&rbio->hash_list));
				790	WARN_ON(!bio_list_empty(&rbio->bio_list));
				791
				792	for (i = 0; i < rbio->nr_pages; i++) {
				793	if (rbio->stripe_pages[i]) {
				794	__free_page(rbio->stripe_pages[i]);
				795	rbio->stripe_pages[i] = NULL;
				796	}
				797	}
				798	kfree(rbio->raid_map);
				799	kfree(rbio->bbio);
				800	kfree(rbio);
				801	}
				802
				803	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				804	{
				805	unlock_stripe(rbio);
				806	__free_raid_bio(rbio);
				807	}
				808
				809	/*
				810	* this frees the rbio and runs through all the bios in the
				811	* bio_list and calls end_io on them
				812	*/
				813	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				814	{
				815	struct bio *cur = bio_list_get(&rbio->bio_list);
				816	struct bio *next;
				817	free_raid_bio(rbio);
				818
				819	while (cur) {
				820	next = cur->bi_next;
				821	cur->bi_next = NULL;
				822	if (uptodate)
				823	set_bit(BIO_UPTODATE, &cur->bi_flags);
				824	bio_endio(cur, err);
				825	cur = next;
				826	}
				827	}
				828
				829	/*
				830	* end io function used by finish_rmw. When we finally
				831	* get here, we've written a full stripe
				832	*/
				833	static void raid_write_end_io(struct bio *bio, int err)
				834	{
				835	struct btrfs_raid_bio *rbio = bio->bi_private;
				836
				837	if (err)
				838	fail_bio_stripe(rbio, bio);
				839
				840	bio_put(bio);
				841
				842	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				843	return;
				844
				845	err = 0;
				846
				847	/* OK, we have read all the stripes we need to. */
				848	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				849	err = -EIO;
				850
				851	rbio_orig_end_io(rbio, err, 0);
				852	return;
				853	}
				854
				855	/*
				856	* the read/modify/write code wants to use the original bio for
				857	* any pages it included, and then use the rbio for everything
				858	* else. This function decides if a given index (stripe number)
				859	* and page number in that stripe fall inside the original bio
				860	* or the rbio.
				861	*
				862	* if you set bio_list_only, you'll get a NULL back for any ranges
				863	* that are outside the bio_list
				864	*
				865	* This doesn't take any refs on anything, you get a bare page pointer
				866	* and the caller must bump refs as required.
				867	*
				868	* You must call index_rbio_pages once before you can trust
				869	* the answers from this function.
				870	*/
				871	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				872	int index, int pagenr, int bio_list_only)
				873	{
				874	int chunk_page;
				875	struct page *p = NULL;
				876
				877	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				878
				879	spin_lock_irq(&rbio->bio_list_lock);
				880	p = rbio->bio_pages[chunk_page];
				881	spin_unlock_irq(&rbio->bio_list_lock);
				882
				883	if (p \|\| bio_list_only)
				884	return p;
				885
				886	return rbio->stripe_pages[chunk_page];
				887	}
				888
				889	/*
				890	* number of pages we need for the entire stripe across all the
				891	* drives
				892	*/
				893	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				894	{
				895	unsigned long nr = stripe_len * nr_stripes;
				896	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				897	}
				898
				899	/*
				900	* allocation and initial setup for the btrfs_raid_bio. Not
				901	* this does not allocate any pages for rbio->pages.
				902	*/
				903	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
				904	struct btrfs_bio bbio, u64 raid_map,
				905	u64 stripe_len)
				906	{
				907	struct btrfs_raid_bio *rbio;
				908	int nr_data = 0;
				909	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
				910	void *p;
				911
				912	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2,
				913	GFP_NOFS);
				914	if (!rbio) {
				915	kfree(raid_map);
				916	kfree(bbio);
				917	return ERR_PTR(-ENOMEM);
				918	}
				919
				920	bio_list_init(&rbio->bio_list);
				921	INIT_LIST_HEAD(&rbio->plug_list);
				922	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	923	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	924	INIT_LIST_HEAD(&rbio->hash_list);
				925	rbio->bbio = bbio;
				926	rbio->raid_map = raid_map;
				927	rbio->fs_info = root->fs_info;
				928	rbio->stripe_len = stripe_len;
				929	rbio->nr_pages = num_pages;
				930	rbio->faila = -1;
				931	rbio->failb = -1;
				932	atomic_set(&rbio->refs, 1);
				933
				934	/*
				935	* the stripe_pages and bio_pages array point to the extra
				936	* memory we allocated past the end of the rbio
				937	*/
				938	p = rbio + 1;
				939	rbio->stripe_pages = p;
				940	rbio->bio_pages = p + sizeof(struct page ) num_pages;
				941
				942	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
				943	nr_data = bbio->num_stripes - 2;
				944	else
				945	nr_data = bbio->num_stripes - 1;
				946
				947	rbio->nr_data = nr_data;
				948	return rbio;
				949	}
				950
				951	/* allocate pages for all the stripes in the bio, including parity */
				952	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				953	{
				954	int i;
				955	struct page *page;
				956
				957	for (i = 0; i < rbio->nr_pages; i++) {
				958	if (rbio->stripe_pages[i])
				959	continue;
				960	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				961	if (!page)
				962	return -ENOMEM;
				963	rbio->stripe_pages[i] = page;
				964	ClearPageUptodate(page);
				965	}
				966	return 0;
				967	}
				968
				969	/* allocate pages for just the p/q stripes */
				970	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				971	{
				972	int i;
				973	struct page *page;
				974
				975	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				976
				977	for (; i < rbio->nr_pages; i++) {
				978	if (rbio->stripe_pages[i])
				979	continue;
				980	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				981	if (!page)
				982	return -ENOMEM;
				983	rbio->stripe_pages[i] = page;
				984	}
				985	return 0;
				986	}
				987
				988	/*
				989	* add a single page from a specific stripe into our list of bios for IO
				990	* this will try to merge into existing bios if possible, and returns
				991	* zero if all went well.
				992	*/
				993	int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				994	struct bio_list *bio_list,
				995	struct page *page,
				996	int stripe_nr,
				997	unsigned long page_index,
				998	unsigned long bio_max_len)
				999	{
				1000	struct bio *last = bio_list->tail;
				1001	u64 last_end = 0;
				1002	int ret;
				1003	struct bio *bio;
				1004	struct btrfs_bio_stripe *stripe;
				1005	u64 disk_start;
				1006
				1007	stripe = &rbio->bbio->stripes[stripe_nr];
				1008	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				1009
				1010	/* if the device is missing, just fail this stripe */
				1011	if (!stripe->dev->bdev)
				1012	return fail_rbio_index(rbio, stripe_nr);
				1013
				1014	/* see if we can add this page onto our existing bio */
				1015	if (last) {
				1016	last_end = (u64)last->bi_sector << 9;
				1017	last_end += last->bi_size;
				1018
				1019	/*
				1020	* we can't merge these if they are from different
				1021	* devices or if they are not contiguous
				1022	*/
				1023	if (last_end == disk_start && stripe->dev->bdev &&
				1024	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				1025	last->bi_bdev == stripe->dev->bdev) {
				1026	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				1027	if (ret == PAGE_CACHE_SIZE)
				1028	return 0;
				1029	}
				1030	}
				1031
				1032	/* put a new bio on the list */
				1033	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
				1034	if (!bio)
				1035	return -ENOMEM;
				1036
				1037	bio->bi_size = 0;
				1038	bio->bi_bdev = stripe->dev->bdev;
				1039	bio->bi_sector = disk_start >> 9;
				1040	set_bit(BIO_UPTODATE, &bio->bi_flags);
				1041
				1042	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				1043	bio_list_add(bio_list, bio);
				1044	return 0;
				1045	}
				1046
				1047	/*
				1048	* while we're doing the read/modify/write cycle, we could
				1049	* have errors in reading pages off the disk. This checks
				1050	* for errors and if we're not able to read the page it'll
				1051	* trigger parity reconstruction. The rmw will be finished
				1052	* after we've reconstructed the failed stripes
				1053	*/
				1054	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1055	{
				1056	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				1057	BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
				1058	__raid56_parity_recover(rbio);
				1059	} else {
				1060	finish_rmw(rbio);
				1061	}
				1062	}
				1063
				1064	/*
				1065	* these are just the pages from the rbio array, not from anything
				1066	* the FS sent down to us
				1067	*/
				1068	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				1069	{
				1070	int index;
				1071	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				1072	index += page;
				1073	return rbio->stripe_pages[index];
				1074	}
				1075
				1076	/*
				1077	* helper function to walk our bio list and populate the bio_pages array with
				1078	* the result. This seems expensive, but it is faster than constantly
				1079	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1080	* reconstruction.
				1081	*
				1082	* This must be called before you trust the answers from page_in_rbio
				1083	*/
				1084	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1085	{
				1086	struct bio *bio;
				1087	u64 start;
				1088	unsigned long stripe_offset;
				1089	unsigned long page_index;
				1090	struct page *p;
				1091	int i;
				1092
				1093	spin_lock_irq(&rbio->bio_list_lock);
				1094	bio_list_for_each(bio, &rbio->bio_list) {
				1095	start = (u64)bio->bi_sector << 9;
				1096	stripe_offset = start - rbio->raid_map[0];
				1097	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				1098
				1099	for (i = 0; i < bio->bi_vcnt; i++) {
				1100	p = bio->bi_io_vec[i].bv_page;
				1101	rbio->bio_pages[page_index + i] = p;
				1102	}
				1103	}
				1104	spin_unlock_irq(&rbio->bio_list_lock);
				1105	}
				1106
				1107	/*
				1108	* this is called from one of two situations. We either
				1109	* have a full stripe from the higher layers, or we've read all
				1110	* the missing bits off disk.
				1111	*
				1112	* This will calculate the parity and then send down any
				1113	* changed blocks.
				1114	*/
				1115	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1116	{
				1117	struct btrfs_bio *bbio = rbio->bbio;
				1118	void *pointers[bbio->num_stripes];
				1119	int stripe_len = rbio->stripe_len;
				1120	int nr_data = rbio->nr_data;
				1121	int stripe;
				1122	int pagenr;
				1123	int p_stripe = -1;
				1124	int q_stripe = -1;
				1125	struct bio_list bio_list;
				1126	struct bio *bio;
				1127	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				1128	int ret;
				1129
				1130	bio_list_init(&bio_list);
				1131
				1132	if (bbio->num_stripes - rbio->nr_data == 1) {
				1133	p_stripe = bbio->num_stripes - 1;
				1134	} else if (bbio->num_stripes - rbio->nr_data == 2) {
				1135	p_stripe = bbio->num_stripes - 2;
				1136	q_stripe = bbio->num_stripes - 1;
				1137	} else {
				1138	BUG();
				1139	}
				1140
				1141	/* at this point we either have a full stripe,
				1142	* or we've read the full stripe from the drive.
				1143	* recalculate the parity and write the new results.
				1144	*
				1145	* We're not allowed to add any new bios to the
				1146	* bio list here, anyone else that wants to
				1147	* change this stripe needs to do their own rmw.
				1148	*/
				1149	spin_lock_irq(&rbio->bio_list_lock);
				1150	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1151	spin_unlock_irq(&rbio->bio_list_lock);
				1152
				1153	atomic_set(&rbio->bbio->error, 0);
				1154
				1155	/*
				1156	* now that we've set rmw_locked, run through the
				1157	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1158	*
				1159	* We don't cache full rbios because we're assuming
				1160	* the higher layers are unlikely to use this area of
				1161	* the disk again soon. If they do use it again,
				1162	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1163	*/
				1164	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1165	if (!rbio_is_full(rbio))
				1166	cache_rbio_pages(rbio);
				1167	else
				1168	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1169
				1170	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1171	struct page *p;
				1172	/* first collect one page from each data stripe */
				1173	for (stripe = 0; stripe < nr_data; stripe++) {
				1174	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1175	pointers[stripe] = kmap(p);
				1176	}
				1177
				1178	/* then add the parity stripe */
				1179	p = rbio_pstripe_page(rbio, pagenr);
				1180	SetPageUptodate(p);
				1181	pointers[stripe++] = kmap(p);
				1182
				1183	if (q_stripe != -1) {
				1184
				1185	/*
				1186	* raid6, add the qstripe and call the
				1187	* library function to fill in our p/q
				1188	*/
				1189	p = rbio_qstripe_page(rbio, pagenr);
				1190	SetPageUptodate(p);
				1191	pointers[stripe++] = kmap(p);
				1192
				1193	raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
				1194	pointers);
				1195	} else {
				1196	/* raid5 */
				1197	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				1198	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				1199	}
				1200
				1201
				1202	for (stripe = 0; stripe < bbio->num_stripes; stripe++)
				1203	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1204	}
				1205
				1206	/*
				1207	* time to start writing. Make bios for everything from the
				1208	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1209	* everything else.
				1210	*/
				1211	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1212	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				1213	struct page *page;
				1214	if (stripe < rbio->nr_data) {
				1215	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1216	if (!page)
				1217	continue;
				1218	} else {
				1219	page = rbio_stripe_page(rbio, stripe, pagenr);
				1220	}
				1221
				1222	ret = rbio_add_io_page(rbio, &bio_list,
				1223	page, stripe, pagenr, rbio->stripe_len);
				1224	if (ret)
				1225	goto cleanup;
				1226	}
				1227	}
				1228
				1229	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
				1230	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
				1231
				1232	while (1) {
				1233	bio = bio_list_pop(&bio_list);
				1234	if (!bio)
				1235	break;
				1236
				1237	bio->bi_private = rbio;
				1238	bio->bi_end_io = raid_write_end_io;
				1239	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1240	submit_bio(WRITE, bio);
				1241	}
				1242	return;
				1243
				1244	cleanup:
				1245	rbio_orig_end_io(rbio, -EIO, 0);
				1246	}
				1247
				1248	/*
				1249	* helper to find the stripe number for a given bio. Used to figure out which
				1250	* stripe has failed. This expects the bio to correspond to a physical disk,
				1251	* so it looks up based on physical sector numbers.
				1252	*/
				1253	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1254	struct bio *bio)
				1255	{
				1256	u64 physical = bio->bi_sector;
				1257	u64 stripe_start;
				1258	int i;
				1259	struct btrfs_bio_stripe *stripe;
				1260
				1261	physical <<= 9;
				1262
				1263	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1264	stripe = &rbio->bbio->stripes[i];
				1265	stripe_start = stripe->physical;
				1266	if (physical >= stripe_start &&
				1267	physical < stripe_start + rbio->stripe_len) {
				1268	return i;
				1269	}
				1270	}
				1271	return -1;
				1272	}
				1273
				1274	/*
				1275	* helper to find the stripe number for a given
				1276	* bio (before mapping). Used to figure out which stripe has
				1277	* failed. This looks up based on logical block numbers.
				1278	*/
				1279	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1280	struct bio *bio)
				1281	{
				1282	u64 logical = bio->bi_sector;
				1283	u64 stripe_start;
				1284	int i;
				1285
				1286	logical <<= 9;
				1287
				1288	for (i = 0; i < rbio->nr_data; i++) {
				1289	stripe_start = rbio->raid_map[i];
				1290	if (logical >= stripe_start &&
				1291	logical < stripe_start + rbio->stripe_len) {
				1292	return i;
				1293	}
				1294	}
				1295	return -1;
				1296	}
				1297
				1298	/*
				1299	* returns -EIO if we had too many failures
				1300	*/
				1301	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1302	{
				1303	unsigned long flags;
				1304	int ret = 0;
				1305
				1306	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1307
				1308	/* we already know this stripe is bad, move on */
				1309	if (rbio->faila == failed \|\| rbio->failb == failed)
				1310	goto out;
				1311
				1312	if (rbio->faila == -1) {
				1313	/* first failure on this rbio */
				1314	rbio->faila = failed;
				1315	atomic_inc(&rbio->bbio->error);
				1316	} else if (rbio->failb == -1) {
				1317	/* second failure on this rbio */
				1318	rbio->failb = failed;
				1319	atomic_inc(&rbio->bbio->error);
				1320	} else {
				1321	ret = -EIO;
				1322	}
				1323	out:
				1324	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1325
				1326	return ret;
				1327	}
				1328
				1329	/*
				1330	* helper to fail a stripe based on a physical disk
				1331	* bio.
				1332	*/
				1333	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1334	struct bio *bio)
				1335	{
				1336	int failed = find_bio_stripe(rbio, bio);
				1337
				1338	if (failed < 0)
				1339	return -EIO;
				1340
				1341	return fail_rbio_index(rbio, failed);
				1342	}
				1343
				1344	/*
				1345	* this sets each page in the bio uptodate. It should only be used on private
				1346	* rbio pages, nothing that comes in from the higher layers
				1347	*/
				1348	static void set_bio_pages_uptodate(struct bio *bio)
				1349	{
				1350	int i;
				1351	struct page *p;
				1352
				1353	for (i = 0; i < bio->bi_vcnt; i++) {
				1354	p = bio->bi_io_vec[i].bv_page;
				1355	SetPageUptodate(p);
				1356	}
				1357	}
				1358
				1359	/*
				1360	* end io for the read phase of the rmw cycle. All the bios here are physical
				1361	* stripe bios we've read from the disk so we can recalculate the parity of the
				1362	* stripe.
				1363	*
				1364	* This will usually kick off finish_rmw once all the bios are read in, but it
				1365	* may trigger parity reconstruction if we had any errors along the way
				1366	*/
				1367	static void raid_rmw_end_io(struct bio *bio, int err)
				1368	{
				1369	struct btrfs_raid_bio *rbio = bio->bi_private;
				1370
				1371	if (err)
				1372	fail_bio_stripe(rbio, bio);
				1373	else
				1374	set_bio_pages_uptodate(bio);
				1375
				1376	bio_put(bio);
				1377
				1378	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1379	return;
				1380
				1381	err = 0;
				1382	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1383	goto cleanup;
				1384
				1385	/*
				1386	* this will normally call finish_rmw to start our write
				1387	* but if there are any failed stripes we'll reconstruct
				1388	* from parity first
				1389	*/
				1390	validate_rbio_for_rmw(rbio);
				1391	return;
				1392
				1393	cleanup:
				1394
				1395	rbio_orig_end_io(rbio, -EIO, 0);
				1396	}
				1397
				1398	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1399	{
				1400	rbio->work.flags = 0;
				1401	rbio->work.func = rmw_work;
				1402
				1403	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1404	&rbio->work);
				1405	}
				1406
				1407	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1408	{
				1409	rbio->work.flags = 0;
				1410	rbio->work.func = read_rebuild_work;
				1411
				1412	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1413	&rbio->work);
				1414	}
				1415
				1416	/*
				1417	* the stripe must be locked by the caller. It will
				1418	* unlock after all the writes are done
				1419	*/
				1420	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1421	{
				1422	int bios_to_read = 0;
				1423	struct btrfs_bio *bbio = rbio->bbio;
				1424	struct bio_list bio_list;
				1425	int ret;
				1426	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1427	int pagenr;
				1428	int stripe;
				1429	struct bio *bio;
				1430
				1431	bio_list_init(&bio_list);
				1432
				1433	ret = alloc_rbio_pages(rbio);
				1434	if (ret)
				1435	goto cleanup;
				1436
				1437	index_rbio_pages(rbio);
				1438
				1439	atomic_set(&rbio->bbio->error, 0);
				1440	/*
				1441	* build a list of bios to read all the missing parts of this
				1442	* stripe
				1443	*/
				1444	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1445	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1446	struct page *page;
				1447	/*
				1448	* we want to find all the pages missing from
				1449	* the rbio and read them from the disk. If
				1450	* page_in_rbio finds a page in the bio list
				1451	* we don't need to read it off the stripe.
				1452	*/
				1453	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1454	if (page)
				1455	continue;
				1456
				1457	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1458	/*
				1459	* the bio cache may have handed us an uptodate
				1460	* page. If so, be happy and use it
				1461	*/
				1462	if (PageUptodate(page))
				1463	continue;
				1464
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1465	ret = rbio_add_io_page(rbio, &bio_list, page,
				1466	stripe, pagenr, rbio->stripe_len);
				1467	if (ret)
				1468	goto cleanup;
				1469	}
				1470	}
				1471
				1472	bios_to_read = bio_list_size(&bio_list);
				1473	if (!bios_to_read) {
				1474	/*
				1475	* this can happen if others have merged with
				1476	* us, it means there is nothing left to read.
				1477	* But if there are missing devices it may not be
				1478	* safe to do the full stripe write yet.
				1479	*/
				1480	goto finish;
				1481	}
				1482
				1483	/*
				1484	* the bbio may be freed once we submit the last bio. Make sure
				1485	* not to touch it after that
				1486	*/
				1487	atomic_set(&bbio->stripes_pending, bios_to_read);
				1488	while (1) {
				1489	bio = bio_list_pop(&bio_list);
				1490	if (!bio)
				1491	break;
				1492
				1493	bio->bi_private = rbio;
				1494	bio->bi_end_io = raid_rmw_end_io;
				1495
				1496	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1497	BTRFS_WQ_ENDIO_RAID56);
				1498
				1499	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1500	submit_bio(READ, bio);
				1501	}
				1502	/* the actual write will happen once the reads are done */
				1503	return 0;
				1504
				1505	cleanup:
				1506	rbio_orig_end_io(rbio, -EIO, 0);
				1507	return -EIO;
				1508
				1509	finish:
				1510	validate_rbio_for_rmw(rbio);
				1511	return 0;
				1512	}
				1513
				1514	/*
				1515	* if the upper layers pass in a full stripe, we thank them by only allocating
				1516	* enough pages to hold the parity, and sending it all down quickly.
				1517	*/
				1518	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1519	{
				1520	int ret;
				1521
				1522	ret = alloc_rbio_parity_pages(rbio);
				1523	if (ret)
				1524	return ret;
				1525
				1526	ret = lock_stripe_add(rbio);
				1527	if (ret == 0)
				1528	finish_rmw(rbio);
				1529	return 0;
				1530	}
				1531
				1532	/*
				1533	* partial stripe writes get handed over to async helpers.
				1534	* We're really hoping to merge a few more writes into this
				1535	* rbio before calculating new parity
				1536	*/
				1537	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1538	{
				1539	int ret;
				1540
				1541	ret = lock_stripe_add(rbio);
				1542	if (ret == 0)
				1543	async_rmw_stripe(rbio);
				1544	return 0;
				1545	}
				1546
				1547	/*
				1548	* sometimes while we were reading from the drive to
				1549	* recalculate parity, enough new bios come into create
				1550	* a full stripe. So we do a check here to see if we can
				1551	* go directly to finish_rmw
				1552	*/
				1553	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1554	{
				1555	/* head off into rmw land if we don't have a full stripe */
				1556	if (!rbio_is_full(rbio))
				1557	return partial_stripe_write(rbio);
				1558	return full_stripe_write(rbio);
				1559	}
				1560
				1561	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame^]	1562	* We use plugging call backs to collect full stripes.
				1563	* Any time we get a partial stripe write while plugged
				1564	* we collect it into a list. When the unplug comes down,
				1565	* we sort the list by logical block number and merge
				1566	* everything we can into the same rbios
				1567	*/
				1568	struct btrfs_plug_cb {
				1569	struct blk_plug_cb cb;
				1570	struct btrfs_fs_info *info;
				1571	struct list_head rbio_list;
				1572	struct btrfs_work work;
				1573	};
				1574
				1575	/*
				1576	* rbios on the plug list are sorted for easier merging.
				1577	*/
				1578	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1579	{
				1580	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1581	plug_list);
				1582	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1583	plug_list);
				1584	u64 a_sector = ra->bio_list.head->bi_sector;
				1585	u64 b_sector = rb->bio_list.head->bi_sector;
				1586
				1587	if (a_sector < b_sector)
				1588	return -1;
				1589	if (a_sector > b_sector)
				1590	return 1;
				1591	return 0;
				1592	}
				1593
				1594	static void run_plug(struct btrfs_plug_cb *plug)
				1595	{
				1596	struct btrfs_raid_bio *cur;
				1597	struct btrfs_raid_bio *last = NULL;
				1598
				1599	/*
				1600	* sort our plug list then try to merge
				1601	* everything we can in hopes of creating full
				1602	* stripes.
				1603	*/
				1604	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1605	while (!list_empty(&plug->rbio_list)) {
				1606	cur = list_entry(plug->rbio_list.next,
				1607	struct btrfs_raid_bio, plug_list);
				1608	list_del_init(&cur->plug_list);
				1609
				1610	if (rbio_is_full(cur)) {
				1611	/* we have a full stripe, send it down */
				1612	full_stripe_write(cur);
				1613	continue;
				1614	}
				1615	if (last) {
				1616	if (rbio_can_merge(last, cur)) {
				1617	merge_rbio(last, cur);
				1618	__free_raid_bio(cur);
				1619	continue;
				1620
				1621	}
				1622	__raid56_parity_write(last);
				1623	}
				1624	last = cur;
				1625	}
				1626	if (last) {
				1627	__raid56_parity_write(last);
				1628	}
				1629	kfree(plug);
				1630	}
				1631
				1632	/*
				1633	* if the unplug comes from schedule, we have to push the
				1634	* work off to a helper thread
				1635	*/
				1636	static void unplug_work(struct btrfs_work *work)
				1637	{
				1638	struct btrfs_plug_cb *plug;
				1639	plug = container_of(work, struct btrfs_plug_cb, work);
				1640	run_plug(plug);
				1641	}
				1642
				1643	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1644	{
				1645	struct btrfs_plug_cb *plug;
				1646	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1647
				1648	if (from_schedule) {
				1649	plug->work.flags = 0;
				1650	plug->work.func = unplug_work;
				1651	btrfs_queue_worker(&plug->info->rmw_workers,
				1652	&plug->work);
				1653	return;
				1654	}
				1655	run_plug(plug);
				1656	}
				1657
				1658	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1659	* our main entry point for writes from the rest of the FS.
				1660	*/
				1661	int raid56_parity_write(struct btrfs_root root, struct bio bio,
				1662	struct btrfs_bio bbio, u64 raid_map,
				1663	u64 stripe_len)
				1664	{
				1665	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame^]	1666	struct btrfs_plug_cb *plug = NULL;
				1667	struct blk_plug_cb *cb;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1668
				1669	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				1670	if (IS_ERR(rbio)) {
				1671	kfree(raid_map);
				1672	kfree(bbio);
				1673	return PTR_ERR(rbio);
				1674	}
				1675	bio_list_add(&rbio->bio_list, bio);
				1676	rbio->bio_list_bytes = bio->bi_size;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame^]	1677
				1678	/*
				1679	* don't plug on full rbios, just get them out the door
				1680	* as quickly as we can
				1681	*/
				1682	if (rbio_is_full(rbio))
				1683	return full_stripe_write(rbio);
				1684
				1685	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
				1686	sizeof(*plug));
				1687	if (cb) {
				1688	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1689	if (!plug->info) {
				1690	plug->info = root->fs_info;
				1691	INIT_LIST_HEAD(&plug->rbio_list);
				1692	}
				1693	list_add_tail(&rbio->plug_list, &plug->rbio_list);
				1694	} else {
				1695	return __raid56_parity_write(rbio);
				1696	}
				1697	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1698	}
				1699
				1700	/*
				1701	* all parity reconstruction happens here. We've read in everything
				1702	* we can find from the drives and this does the heavy lifting of
				1703	* sorting the good from the bad.
				1704	*/
				1705	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1706	{
				1707	int pagenr, stripe;
				1708	void **pointers;
				1709	int faila = -1, failb = -1;
				1710	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1711	struct page *page;
				1712	int err;
				1713	int i;
				1714
				1715	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
				1716	GFP_NOFS);
				1717	if (!pointers) {
				1718	err = -ENOMEM;
				1719	goto cleanup_io;
				1720	}
				1721
				1722	faila = rbio->faila;
				1723	failb = rbio->failb;
				1724
				1725	if (rbio->read_rebuild) {
				1726	spin_lock_irq(&rbio->bio_list_lock);
				1727	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1728	spin_unlock_irq(&rbio->bio_list_lock);
				1729	}
				1730
				1731	index_rbio_pages(rbio);
				1732
				1733	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1734	/* setup our array of pointers with pages
				1735	* from each stripe
				1736	*/
				1737	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1738	/*
				1739	* if we're rebuilding a read, we have to use
				1740	* pages from the bio list
				1741	*/
				1742	if (rbio->read_rebuild &&
				1743	(stripe == faila \|\| stripe == failb)) {
				1744	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1745	} else {
				1746	page = rbio_stripe_page(rbio, stripe, pagenr);
				1747	}
				1748	pointers[stripe] = kmap(page);
				1749	}
				1750
				1751	/* all raid6 handling here */
				1752	if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
				1753	RAID6_Q_STRIPE) {
				1754
				1755	/*
				1756	* single failure, rebuild from parity raid5
				1757	* style
				1758	*/
				1759	if (failb < 0) {
				1760	if (faila == rbio->nr_data) {
				1761	/*
				1762	* Just the P stripe has failed, without
				1763	* a bad data or Q stripe.
				1764	* TODO, we should redo the xor here.
				1765	*/
				1766	err = -EIO;
				1767	goto cleanup;
				1768	}
				1769	/*
				1770	* a single failure in raid6 is rebuilt
				1771	* in the pstripe code below
				1772	*/
				1773	goto pstripe;
				1774	}
				1775
				1776	/* make sure our ps and qs are in order */
				1777	if (faila > failb) {
				1778	int tmp = failb;
				1779	failb = faila;
				1780	faila = tmp;
				1781	}
				1782
				1783	/* if the q stripe is failed, do a pstripe reconstruction
				1784	* from the xors.
				1785	* If both the q stripe and the P stripe are failed, we're
				1786	* here due to a crc mismatch and we can't give them the
				1787	* data they want
				1788	*/
				1789	if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1790	if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
				1791	err = -EIO;
				1792	goto cleanup;
				1793	}
				1794	/*
				1795	* otherwise we have one bad data stripe and
				1796	* a good P stripe. raid5!
				1797	*/
				1798	goto pstripe;
				1799	}
				1800
				1801	if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				1802	raid6_datap_recov(rbio->bbio->num_stripes,
				1803	PAGE_SIZE, faila, pointers);
				1804	} else {
				1805	raid6_2data_recov(rbio->bbio->num_stripes,
				1806	PAGE_SIZE, faila, failb,
				1807	pointers);
				1808	}
				1809	} else {
				1810	void *p;
				1811
				1812	/* rebuild from P stripe here (raid5 or raid6) */
				1813	BUG_ON(failb != -1);
				1814	pstripe:
				1815	/* Copy parity block into failed block to start with */
				1816	memcpy(pointers[faila],
				1817	pointers[rbio->nr_data],
				1818	PAGE_CACHE_SIZE);
				1819
				1820	/* rearrange the pointer array */
				1821	p = pointers[faila];
				1822	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1823	pointers[stripe] = pointers[stripe + 1];
				1824	pointers[rbio->nr_data - 1] = p;
				1825
				1826	/* xor in the rest */
				1827	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1828	}
				1829	/* if we're doing this rebuild as part of an rmw, go through
				1830	* and set all of our private rbio pages in the
				1831	* failed stripes as uptodate. This way finish_rmw will
				1832	* know they can be trusted. If this was a read reconstruction,
				1833	* other endio functions will fiddle the uptodate bits
				1834	*/
				1835	if (!rbio->read_rebuild) {
				1836	for (i = 0; i < nr_pages; i++) {
				1837	if (faila != -1) {
				1838	page = rbio_stripe_page(rbio, faila, i);
				1839	SetPageUptodate(page);
				1840	}
				1841	if (failb != -1) {
				1842	page = rbio_stripe_page(rbio, failb, i);
				1843	SetPageUptodate(page);
				1844	}
				1845	}
				1846	}
				1847	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1848	/*
				1849	* if we're rebuilding a read, we have to use
				1850	* pages from the bio list
				1851	*/
				1852	if (rbio->read_rebuild &&
				1853	(stripe == faila \|\| stripe == failb)) {
				1854	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1855	} else {
				1856	page = rbio_stripe_page(rbio, stripe, pagenr);
				1857	}
				1858	kunmap(page);
				1859	}
				1860	}
				1861
				1862	err = 0;
				1863	cleanup:
				1864	kfree(pointers);
				1865
				1866	cleanup_io:
				1867
				1868	if (rbio->read_rebuild) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1869	if (err == 0)
				1870	cache_rbio_pages(rbio);
				1871	else
				1872	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1873
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1874	rbio_orig_end_io(rbio, err, err == 0);
				1875	} else if (err == 0) {
				1876	rbio->faila = -1;
				1877	rbio->failb = -1;
				1878	finish_rmw(rbio);
				1879	} else {
				1880	rbio_orig_end_io(rbio, err, 0);
				1881	}
				1882	}
				1883
				1884	/*
				1885	* This is called only for stripes we've read from disk to
				1886	* reconstruct the parity.
				1887	*/
				1888	static void raid_recover_end_io(struct bio *bio, int err)
				1889	{
				1890	struct btrfs_raid_bio *rbio = bio->bi_private;
				1891
				1892	/*
				1893	* we only read stripe pages off the disk, set them
				1894	* up to date if there were no errors
				1895	*/
				1896	if (err)
				1897	fail_bio_stripe(rbio, bio);
				1898	else
				1899	set_bio_pages_uptodate(bio);
				1900	bio_put(bio);
				1901
				1902	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1903	return;
				1904
				1905	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1906	rbio_orig_end_io(rbio, -EIO, 0);
				1907	else
				1908	__raid_recover_end_io(rbio);
				1909	}
				1910
				1911	/*
				1912	* reads everything we need off the disk to reconstruct
				1913	* the parity. endio handlers trigger final reconstruction
				1914	* when the IO is done.
				1915	*
				1916	* This is used both for reads from the higher layers and for
				1917	* parity construction required to finish a rmw cycle.
				1918	*/
				1919	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				1920	{
				1921	int bios_to_read = 0;
				1922	struct btrfs_bio *bbio = rbio->bbio;
				1923	struct bio_list bio_list;
				1924	int ret;
				1925	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1926	int pagenr;
				1927	int stripe;
				1928	struct bio *bio;
				1929
				1930	bio_list_init(&bio_list);
				1931
				1932	ret = alloc_rbio_pages(rbio);
				1933	if (ret)
				1934	goto cleanup;
				1935
				1936	atomic_set(&rbio->bbio->error, 0);
				1937
				1938	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1939	* read everything that hasn't failed. Thanks to the
				1940	* stripe cache, it is possible that some or all of these
				1941	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1942	*/
				1943	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1944	if (rbio->faila == stripe \|\|
				1945	rbio->failb == stripe)
				1946	continue;
				1947
				1948	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1949	struct page *p;
				1950
				1951	/*
				1952	* the rmw code may have already read this
				1953	* page in
				1954	*/
				1955	p = rbio_stripe_page(rbio, stripe, pagenr);
				1956	if (PageUptodate(p))
				1957	continue;
				1958
				1959	ret = rbio_add_io_page(rbio, &bio_list,
				1960	rbio_stripe_page(rbio, stripe, pagenr),
				1961	stripe, pagenr, rbio->stripe_len);
				1962	if (ret < 0)
				1963	goto cleanup;
				1964	}
				1965	}
				1966
				1967	bios_to_read = bio_list_size(&bio_list);
				1968	if (!bios_to_read) {
				1969	/*
				1970	* we might have no bios to read just because the pages
				1971	* were up to date, or we might have no bios to read because
				1972	* the devices were gone.
				1973	*/
				1974	if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
				1975	__raid_recover_end_io(rbio);
				1976	goto out;
				1977	} else {
				1978	goto cleanup;
				1979	}
				1980	}
				1981
				1982	/*
				1983	* the bbio may be freed once we submit the last bio. Make sure
				1984	* not to touch it after that
				1985	*/
				1986	atomic_set(&bbio->stripes_pending, bios_to_read);
				1987	while (1) {
				1988	bio = bio_list_pop(&bio_list);
				1989	if (!bio)
				1990	break;
				1991
				1992	bio->bi_private = rbio;
				1993	bio->bi_end_io = raid_recover_end_io;
				1994
				1995	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1996	BTRFS_WQ_ENDIO_RAID56);
				1997
				1998	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1999	submit_bio(READ, bio);
				2000	}
				2001	out:
				2002	return 0;
				2003
				2004	cleanup:
				2005	if (rbio->read_rebuild)
				2006	rbio_orig_end_io(rbio, -EIO, 0);
				2007	return -EIO;
				2008	}
				2009
				2010	/*
				2011	* the main entry point for reads from the higher layers. This
				2012	* is really only called when the normal read path had a failure,
				2013	* so we assume the bio they send down corresponds to a failed part
				2014	* of the drive.
				2015	*/
				2016	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
				2017	struct btrfs_bio bbio, u64 raid_map,
				2018	u64 stripe_len, int mirror_num)
				2019	{
				2020	struct btrfs_raid_bio *rbio;
				2021	int ret;
				2022
				2023	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				2024	if (IS_ERR(rbio)) {
				2025	return PTR_ERR(rbio);
				2026	}
				2027
				2028	rbio->read_rebuild = 1;
				2029	bio_list_add(&rbio->bio_list, bio);
				2030	rbio->bio_list_bytes = bio->bi_size;
				2031
				2032	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2033	if (rbio->faila == -1) {
				2034	BUG();
				2035	kfree(rbio);
				2036	return -EIO;
				2037	}
				2038
				2039	/*
				2040	* reconstruct from the q stripe if they are
				2041	* asking for mirror 3
				2042	*/
				2043	if (mirror_num == 3)
				2044	rbio->failb = bbio->num_stripes - 2;
				2045
				2046	ret = lock_stripe_add(rbio);
				2047
				2048	/*
				2049	* __raid56_parity_recover will end the bio with
				2050	* any errors it hits. We don't want to return
				2051	* its error value up the stack because our caller
				2052	* will end up calling bio_endio with any nonzero
				2053	* return
				2054	*/
				2055	if (ret == 0)
				2056	__raid56_parity_recover(rbio);
				2057	/*
				2058	* our rbio has been added to the list of
				2059	* rbios that will be handled after the
				2060	* currently lock owner is done
				2061	*/
				2062	return 0;
				2063
				2064	}
				2065
				2066	static void rmw_work(struct btrfs_work *work)
				2067	{
				2068	struct btrfs_raid_bio *rbio;
				2069
				2070	rbio = container_of(work, struct btrfs_raid_bio, work);
				2071	raid56_rmw_stripe(rbio);
				2072	}
				2073
				2074	static void read_rebuild_work(struct btrfs_work *work)
				2075	{
				2076	struct btrfs_raid_bio *rbio;
				2077
				2078	rbio = container_of(work, struct btrfs_raid_bio, work);
				2079	__raid56_parity_recover(rbio);
				2080	}