Blame - fs/btrfs/raid56.c - android_kernel_oneplus_msm8996

blob: d02510f349363f358cac8fbb54211bba533cad28 [file] [log] [blame]

David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame^]	1	/*
				2	* Copyright (C) 2012 Fusion-io All rights reserved.
				3	* Copyright (C) 2012 Intel Corp. All rights reserved.
				4	*
				5	* This program is free software; you can redistribute it and/or
				6	* modify it under the terms of the GNU General Public
				7	* License v2 as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope that it will be useful,
				10	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				12	* General Public License for more details.
				13	*
				14	* You should have received a copy of the GNU General Public
				15	* License along with this program; if not, write to the
				16	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				17	* Boston, MA 021110-1307, USA.
				18	*/
				19	#include <linux/sched.h>
				20	#include <linux/wait.h>
				21	#include <linux/bio.h>
				22	#include <linux/slab.h>
				23	#include <linux/buffer_head.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/random.h>
				26	#include <linux/iocontext.h>
				27	#include <linux/capability.h>
				28	#include <linux/ratelimit.h>
				29	#include <linux/kthread.h>
				30	#include <linux/raid/pq.h>
				31	#include <linux/hash.h>
				32	#include <linux/list_sort.h>
				33	#include <linux/raid/xor.h>
				34	#include <asm/div64.h>
				35	#include "compat.h"
				36	#include "ctree.h"
				37	#include "extent_map.h"
				38	#include "disk-io.h"
				39	#include "transaction.h"
				40	#include "print-tree.h"
				41	#include "volumes.h"
				42	#include "raid56.h"
				43	#include "async-thread.h"
				44	#include "check-integrity.h"
				45	#include "rcu-string.h"
				46
				47	/* set when additional merges to this rbio are not allowed */
				48	#define RBIO_RMW_LOCKED_BIT 1
				49
				50	struct btrfs_raid_bio {
				51	struct btrfs_fs_info *fs_info;
				52	struct btrfs_bio *bbio;
				53
				54	/*
				55	* logical block numbers for the start of each stripe
				56	* The last one or two are p/q. These are sorted,
				57	* so raid_map[0] is the start of our full stripe
				58	*/
				59	u64 *raid_map;
				60
				61	/* while we're doing rmw on a stripe
				62	* we put it into a hash table so we can
				63	* lock the stripe and merge more rbios
				64	* into it.
				65	*/
				66	struct list_head hash_list;
				67
				68	/*
				69	* for scheduling work in the helper threads
				70	*/
				71	struct btrfs_work work;
				72
				73	/*
				74	* bio list and bio_list_lock are used
				75	* to add more bios into the stripe
				76	* in hopes of avoiding the full rmw
				77	*/
				78	struct bio_list bio_list;
				79	spinlock_t bio_list_lock;
				80
				81	/*
				82	* also protected by the bio_list_lock, the
				83	* stripe locking code uses plug_list to hand off
				84	* the stripe lock to the next pending IO
				85	*/
				86	struct list_head plug_list;
				87
				88	/*
				89	* flags that tell us if it is safe to
				90	* merge with this bio
				91	*/
				92	unsigned long flags;
				93
				94	/* size of each individual stripe on disk */
				95	int stripe_len;
				96
				97	/* number of data stripes (no p/q) */
				98	int nr_data;
				99
				100	/*
				101	* set if we're doing a parity rebuild
				102	* for a read from higher up, which is handled
				103	* differently from a parity rebuild as part of
				104	* rmw
				105	*/
				106	int read_rebuild;
				107
				108	/* first bad stripe */
				109	int faila;
				110
				111	/* second bad stripe (for raid6 use) */
				112	int failb;
				113
				114	/*
				115	* number of pages needed to represent the full
				116	* stripe
				117	*/
				118	int nr_pages;
				119
				120	/*
				121	* size of all the bios in the bio_list. This
				122	* helps us decide if the rbio maps to a full
				123	* stripe or not
				124	*/
				125	int bio_list_bytes;
				126
				127	atomic_t refs;
				128
				129	/*
				130	* these are two arrays of pointers. We allocate the
				131	* rbio big enough to hold them both and setup their
				132	* locations when the rbio is allocated
				133	*/
				134
				135	/* pointers to pages that we allocated for
				136	* reading/writing stripes directly from the disk (including P/Q)
				137	*/
				138	struct page **stripe_pages;
				139
				140	/*
				141	* pointers to the pages in the bio_list. Stored
				142	* here for faster lookup
				143	*/
				144	struct page **bio_pages;
				145	};
				146
				147	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				148	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				149	static void rmw_work(struct btrfs_work *work);
				150	static void read_rebuild_work(struct btrfs_work *work);
				151	static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
				152	static void async_read_rebuild(struct btrfs_raid_bio *rbio);
				153	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				154	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				155	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				156	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				157	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				158
				159	/*
				160	* the stripe hash table is used for locking, and to collect
				161	* bios in hopes of making a full stripe
				162	*/
				163	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				164	{
				165	struct btrfs_stripe_hash_table *table;
				166	struct btrfs_stripe_hash_table *x;
				167	struct btrfs_stripe_hash *cur;
				168	struct btrfs_stripe_hash *h;
				169	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				170	int i;
				171
				172	if (info->stripe_hash_table)
				173	return 0;
				174
				175	table = kzalloc(sizeof(table) + sizeof(h) * num_entries, GFP_NOFS);
				176	if (!table)
				177	return -ENOMEM;
				178
				179	table->table = (void *)(table + 1);
				180	h = table->table;
				181
				182	for (i = 0; i < num_entries; i++) {
				183	cur = h + i;
				184	INIT_LIST_HEAD(&cur->hash_list);
				185	spin_lock_init(&cur->lock);
				186	init_waitqueue_head(&cur->wait);
				187	}
				188
				189	x = cmpxchg(&info->stripe_hash_table, NULL, table);
				190	if (x)
				191	kfree(x);
				192	return 0;
				193	}
				194
				195	/*
				196	* we hash on the first logical address of the stripe
				197	*/
				198	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				199	{
				200	u64 num = rbio->raid_map[0];
				201
				202	/*
				203	* we shift down quite a bit. We're using byte
				204	* addressing, and most of the lower bits are zeros.
				205	* This tends to upset hash_64, and it consistently
				206	* returns just one or two different values.
				207	*
				208	* shifting off the lower bits fixes things.
				209	*/
				210	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				211	}
				212
				213	/*
				214	* merging means we take the bio_list from the victim and
				215	* splice it into the destination. The victim should
				216	* be discarded afterwards.
				217	*
				218	* must be called with dest->rbio_list_lock held
				219	*/
				220	static void merge_rbio(struct btrfs_raid_bio *dest,
				221	struct btrfs_raid_bio *victim)
				222	{
				223	bio_list_merge(&dest->bio_list, &victim->bio_list);
				224	dest->bio_list_bytes += victim->bio_list_bytes;
				225	bio_list_init(&victim->bio_list);
				226	}
				227
				228	/*
				229	* free the hash table used by unmount
				230	*/
				231	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				232	{
				233	if (!info->stripe_hash_table)
				234	return;
				235	kfree(info->stripe_hash_table);
				236	info->stripe_hash_table = NULL;
				237	}
				238
				239	/*
				240	* helper function to run the xor_blocks api. It is only
				241	* able to do MAX_XOR_BLOCKS at a time, so we need to
				242	* loop through.
				243	*/
				244	static void run_xor(void **pages, int src_cnt, ssize_t len)
				245	{
				246	int src_off = 0;
				247	int xor_src_cnt = 0;
				248	void *dest = pages[src_cnt];
				249
				250	while(src_cnt > 0) {
				251	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				252	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				253
				254	src_cnt -= xor_src_cnt;
				255	src_off += xor_src_cnt;
				256	}
				257	}
				258
				259	/*
				260	* returns true if the bio list inside this rbio
				261	* covers an entire stripe (no rmw required).
				262	* Must be called with the bio list lock held, or
				263	* at a time when you know it is impossible to add
				264	* new bios into the list
				265	*/
				266	static int __rbio_is_full(struct btrfs_raid_bio *rbio)
				267	{
				268	unsigned long size = rbio->bio_list_bytes;
				269	int ret = 1;
				270
				271	if (size != rbio->nr_data * rbio->stripe_len)
				272	ret = 0;
				273
				274	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
				275	return ret;
				276	}
				277
				278	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				279	{
				280	unsigned long flags;
				281	int ret;
				282
				283	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				284	ret = __rbio_is_full(rbio);
				285	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				286	return ret;
				287	}
				288
				289	/*
				290	* returns 1 if it is safe to merge two rbios together.
				291	* The merging is safe if the two rbios correspond to
				292	* the same stripe and if they are both going in the same
				293	* direction (read vs write), and if neither one is
				294	* locked for final IO
				295	*
				296	* The caller is responsible for locking such that
				297	* rmw_locked is safe to test
				298	*/
				299	static int rbio_can_merge(struct btrfs_raid_bio *last,
				300	struct btrfs_raid_bio *cur)
				301	{
				302	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				303	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				304	return 0;
				305
				306	if (last->raid_map[0] !=
				307	cur->raid_map[0])
				308	return 0;
				309
				310	/* reads can't merge with writes */
				311	if (last->read_rebuild !=
				312	cur->read_rebuild) {
				313	return 0;
				314	}
				315
				316	return 1;
				317	}
				318
				319	/*
				320	* helper to index into the pstripe
				321	*/
				322	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				323	{
				324	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				325	return rbio->stripe_pages[index];
				326	}
				327
				328	/*
				329	* helper to index into the qstripe, returns null
				330	* if there is no qstripe
				331	*/
				332	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				333	{
				334	if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
				335	return NULL;
				336
				337	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
				338	PAGE_CACHE_SHIFT;
				339	return rbio->stripe_pages[index];
				340	}
				341
				342	/*
				343	* The first stripe in the table for a logical address
				344	* has the lock. rbios are added in one of three ways:
				345	*
				346	* 1) Nobody has the stripe locked yet. The rbio is given
				347	* the lock and 0 is returned. The caller must start the IO
				348	* themselves.
				349	*
				350	* 2) Someone has the stripe locked, but we're able to merge
				351	* with the lock owner. The rbio is freed and the IO will
				352	* start automatically along with the existing rbio. 1 is returned.
				353	*
				354	* 3) Someone has the stripe locked, but we're not able to merge.
				355	* The rbio is added to the lock owner's plug list, or merged into
				356	* an rbio already on the plug list. When the lock owner unlocks,
				357	* the next rbio on the list is run and the IO is started automatically.
				358	* 1 is returned
				359	*
				360	* If we return 0, the caller still owns the rbio and must continue with
				361	* IO submission. If we return 1, the caller must assume the rbio has
				362	* already been freed.
				363	*/
				364	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				365	{
				366	int bucket = rbio_bucket(rbio);
				367	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
				368	struct btrfs_raid_bio *cur;
				369	struct btrfs_raid_bio *pending;
				370	unsigned long flags;
				371	DEFINE_WAIT(wait);
				372	struct btrfs_raid_bio *freeit = NULL;
				373	int ret = 0;
				374	int walk = 0;
				375
				376	spin_lock_irqsave(&h->lock, flags);
				377	list_for_each_entry(cur, &h->hash_list, hash_list) {
				378	walk++;
				379	if (cur->raid_map[0] == rbio->raid_map[0]) {
				380	spin_lock(&cur->bio_list_lock);
				381
				382	/* can we merge into the lock owner? */
				383	if (rbio_can_merge(cur, rbio)) {
				384	merge_rbio(cur, rbio);
				385	spin_unlock(&cur->bio_list_lock);
				386	freeit = rbio;
				387	ret = 1;
				388	goto out;
				389	}
				390
				391	/*
				392	* we couldn't merge with the running
				393	* rbio, see if we can merge with the
				394	* pending ones. We don't have to
				395	* check for rmw_locked because there
				396	* is no way they are inside finish_rmw
				397	* right now
				398	*/
				399	list_for_each_entry(pending, &cur->plug_list,
				400	plug_list) {
				401	if (rbio_can_merge(pending, rbio)) {
				402	merge_rbio(pending, rbio);
				403	spin_unlock(&cur->bio_list_lock);
				404	freeit = rbio;
				405	ret = 1;
				406	goto out;
				407	}
				408	}
				409
				410	/* no merging, put us on the tail of the plug list,
				411	* our rbio will be started with the currently
				412	* running rbio unlocks
				413	*/
				414	list_add_tail(&rbio->plug_list, &cur->plug_list);
				415	spin_unlock(&cur->bio_list_lock);
				416	ret = 1;
				417	goto out;
				418	}
				419	}
				420
				421	atomic_inc(&rbio->refs);
				422	list_add(&rbio->hash_list, &h->hash_list);
				423	out:
				424	spin_unlock_irqrestore(&h->lock, flags);
				425	if (freeit)
				426	__free_raid_bio(freeit);
				427	return ret;
				428	}
				429
				430	/*
				431	* called as rmw or parity rebuild is completed. If the plug list has more
				432	* rbios waiting for this stripe, the next one on the list will be started
				433	*/
				434	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				435	{
				436	int bucket;
				437	struct btrfs_stripe_hash *h;
				438	unsigned long flags;
				439
				440	bucket = rbio_bucket(rbio);
				441	h = rbio->fs_info->stripe_hash_table->table + bucket;
				442
				443	spin_lock_irqsave(&h->lock, flags);
				444	spin_lock(&rbio->bio_list_lock);
				445
				446	if (!list_empty(&rbio->hash_list)) {
				447
				448	list_del_init(&rbio->hash_list);
				449	atomic_dec(&rbio->refs);
				450
				451	/*
				452	* we use the plug list to hold all the rbios
				453	* waiting for the chance to lock this stripe.
				454	* hand the lock over to one of them.
				455	*/
				456	if (!list_empty(&rbio->plug_list)) {
				457	struct btrfs_raid_bio *next;
				458	struct list_head *head = rbio->plug_list.next;
				459
				460	next = list_entry(head, struct btrfs_raid_bio,
				461	plug_list);
				462
				463	list_del_init(&rbio->plug_list);
				464
				465	list_add(&next->hash_list, &h->hash_list);
				466	atomic_inc(&next->refs);
				467	spin_unlock(&rbio->bio_list_lock);
				468	spin_unlock_irqrestore(&h->lock, flags);
				469
				470	if (next->read_rebuild)
				471	async_read_rebuild(next);
				472	else
				473	async_rmw_stripe(next);
				474
				475	goto done_nolock;
				476
				477	} else if (waitqueue_active(&h->wait)) {
				478	spin_unlock(&rbio->bio_list_lock);
				479	spin_unlock_irqrestore(&h->lock, flags);
				480	wake_up(&h->wait);
				481	goto done_nolock;
				482	}
				483	}
				484	spin_unlock(&rbio->bio_list_lock);
				485	spin_unlock_irqrestore(&h->lock, flags);
				486
				487	done_nolock:
				488	return;
				489	}
				490
				491	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				492	{
				493	int i;
				494
				495	WARN_ON(atomic_read(&rbio->refs) < 0);
				496	if (!atomic_dec_and_test(&rbio->refs))
				497	return;
				498
				499	WARN_ON(!list_empty(&rbio->hash_list));
				500	WARN_ON(!bio_list_empty(&rbio->bio_list));
				501
				502	for (i = 0; i < rbio->nr_pages; i++) {
				503	if (rbio->stripe_pages[i]) {
				504	__free_page(rbio->stripe_pages[i]);
				505	rbio->stripe_pages[i] = NULL;
				506	}
				507	}
				508	kfree(rbio->raid_map);
				509	kfree(rbio->bbio);
				510	kfree(rbio);
				511	}
				512
				513	static void free_raid_bio(struct btrfs_raid_bio *rbio)
				514	{
				515	unlock_stripe(rbio);
				516	__free_raid_bio(rbio);
				517	}
				518
				519	/*
				520	* this frees the rbio and runs through all the bios in the
				521	* bio_list and calls end_io on them
				522	*/
				523	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
				524	{
				525	struct bio *cur = bio_list_get(&rbio->bio_list);
				526	struct bio *next;
				527	free_raid_bio(rbio);
				528
				529	while (cur) {
				530	next = cur->bi_next;
				531	cur->bi_next = NULL;
				532	if (uptodate)
				533	set_bit(BIO_UPTODATE, &cur->bi_flags);
				534	bio_endio(cur, err);
				535	cur = next;
				536	}
				537	}
				538
				539	/*
				540	* end io function used by finish_rmw. When we finally
				541	* get here, we've written a full stripe
				542	*/
				543	static void raid_write_end_io(struct bio *bio, int err)
				544	{
				545	struct btrfs_raid_bio *rbio = bio->bi_private;
				546
				547	if (err)
				548	fail_bio_stripe(rbio, bio);
				549
				550	bio_put(bio);
				551
				552	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				553	return;
				554
				555	err = 0;
				556
				557	/* OK, we have read all the stripes we need to. */
				558	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				559	err = -EIO;
				560
				561	rbio_orig_end_io(rbio, err, 0);
				562	return;
				563	}
				564
				565	/*
				566	* the read/modify/write code wants to use the original bio for
				567	* any pages it included, and then use the rbio for everything
				568	* else. This function decides if a given index (stripe number)
				569	* and page number in that stripe fall inside the original bio
				570	* or the rbio.
				571	*
				572	* if you set bio_list_only, you'll get a NULL back for any ranges
				573	* that are outside the bio_list
				574	*
				575	* This doesn't take any refs on anything, you get a bare page pointer
				576	* and the caller must bump refs as required.
				577	*
				578	* You must call index_rbio_pages once before you can trust
				579	* the answers from this function.
				580	*/
				581	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				582	int index, int pagenr, int bio_list_only)
				583	{
				584	int chunk_page;
				585	struct page *p = NULL;
				586
				587	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				588
				589	spin_lock_irq(&rbio->bio_list_lock);
				590	p = rbio->bio_pages[chunk_page];
				591	spin_unlock_irq(&rbio->bio_list_lock);
				592
				593	if (p \|\| bio_list_only)
				594	return p;
				595
				596	return rbio->stripe_pages[chunk_page];
				597	}
				598
				599	/*
				600	* number of pages we need for the entire stripe across all the
				601	* drives
				602	*/
				603	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				604	{
				605	unsigned long nr = stripe_len * nr_stripes;
				606	return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				607	}
				608
				609	/*
				610	* allocation and initial setup for the btrfs_raid_bio. Not
				611	* this does not allocate any pages for rbio->pages.
				612	*/
				613	static struct btrfs_raid_bio alloc_rbio(struct btrfs_root root,
				614	struct btrfs_bio bbio, u64 raid_map,
				615	u64 stripe_len)
				616	{
				617	struct btrfs_raid_bio *rbio;
				618	int nr_data = 0;
				619	int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
				620	void *p;
				621
				622	rbio = kzalloc(sizeof(rbio) + num_pages sizeof(struct page ) 2,
				623	GFP_NOFS);
				624	if (!rbio) {
				625	kfree(raid_map);
				626	kfree(bbio);
				627	return ERR_PTR(-ENOMEM);
				628	}
				629
				630	bio_list_init(&rbio->bio_list);
				631	INIT_LIST_HEAD(&rbio->plug_list);
				632	spin_lock_init(&rbio->bio_list_lock);
				633	INIT_LIST_HEAD(&rbio->hash_list);
				634	rbio->bbio = bbio;
				635	rbio->raid_map = raid_map;
				636	rbio->fs_info = root->fs_info;
				637	rbio->stripe_len = stripe_len;
				638	rbio->nr_pages = num_pages;
				639	rbio->faila = -1;
				640	rbio->failb = -1;
				641	atomic_set(&rbio->refs, 1);
				642
				643	/*
				644	* the stripe_pages and bio_pages array point to the extra
				645	* memory we allocated past the end of the rbio
				646	*/
				647	p = rbio + 1;
				648	rbio->stripe_pages = p;
				649	rbio->bio_pages = p + sizeof(struct page ) num_pages;
				650
				651	if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
				652	nr_data = bbio->num_stripes - 2;
				653	else
				654	nr_data = bbio->num_stripes - 1;
				655
				656	rbio->nr_data = nr_data;
				657	return rbio;
				658	}
				659
				660	/* allocate pages for all the stripes in the bio, including parity */
				661	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				662	{
				663	int i;
				664	struct page *page;
				665
				666	for (i = 0; i < rbio->nr_pages; i++) {
				667	if (rbio->stripe_pages[i])
				668	continue;
				669	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				670	if (!page)
				671	return -ENOMEM;
				672	rbio->stripe_pages[i] = page;
				673	ClearPageUptodate(page);
				674	}
				675	return 0;
				676	}
				677
				678	/* allocate pages for just the p/q stripes */
				679	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				680	{
				681	int i;
				682	struct page *page;
				683
				684	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
				685
				686	for (; i < rbio->nr_pages; i++) {
				687	if (rbio->stripe_pages[i])
				688	continue;
				689	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				690	if (!page)
				691	return -ENOMEM;
				692	rbio->stripe_pages[i] = page;
				693	}
				694	return 0;
				695	}
				696
				697	/*
				698	* add a single page from a specific stripe into our list of bios for IO
				699	* this will try to merge into existing bios if possible, and returns
				700	* zero if all went well.
				701	*/
				702	int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				703	struct bio_list *bio_list,
				704	struct page *page,
				705	int stripe_nr,
				706	unsigned long page_index,
				707	unsigned long bio_max_len)
				708	{
				709	struct bio *last = bio_list->tail;
				710	u64 last_end = 0;
				711	int ret;
				712	struct bio *bio;
				713	struct btrfs_bio_stripe *stripe;
				714	u64 disk_start;
				715
				716	stripe = &rbio->bbio->stripes[stripe_nr];
				717	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
				718
				719	/* if the device is missing, just fail this stripe */
				720	if (!stripe->dev->bdev)
				721	return fail_rbio_index(rbio, stripe_nr);
				722
				723	/* see if we can add this page onto our existing bio */
				724	if (last) {
				725	last_end = (u64)last->bi_sector << 9;
				726	last_end += last->bi_size;
				727
				728	/*
				729	* we can't merge these if they are from different
				730	* devices or if they are not contiguous
				731	*/
				732	if (last_end == disk_start && stripe->dev->bdev &&
				733	test_bit(BIO_UPTODATE, &last->bi_flags) &&
				734	last->bi_bdev == stripe->dev->bdev) {
				735	ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
				736	if (ret == PAGE_CACHE_SIZE)
				737	return 0;
				738	}
				739	}
				740
				741	/* put a new bio on the list */
				742	bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
				743	if (!bio)
				744	return -ENOMEM;
				745
				746	bio->bi_size = 0;
				747	bio->bi_bdev = stripe->dev->bdev;
				748	bio->bi_sector = disk_start >> 9;
				749	set_bit(BIO_UPTODATE, &bio->bi_flags);
				750
				751	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
				752	bio_list_add(bio_list, bio);
				753	return 0;
				754	}
				755
				756	/*
				757	* while we're doing the read/modify/write cycle, we could
				758	* have errors in reading pages off the disk. This checks
				759	* for errors and if we're not able to read the page it'll
				760	* trigger parity reconstruction. The rmw will be finished
				761	* after we've reconstructed the failed stripes
				762	*/
				763	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				764	{
				765	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				766	BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
				767	__raid56_parity_recover(rbio);
				768	} else {
				769	finish_rmw(rbio);
				770	}
				771	}
				772
				773	/*
				774	* these are just the pages from the rbio array, not from anything
				775	* the FS sent down to us
				776	*/
				777	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe, int page)
				778	{
				779	int index;
				780	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
				781	index += page;
				782	return rbio->stripe_pages[index];
				783	}
				784
				785	/*
				786	* helper function to walk our bio list and populate the bio_pages array with
				787	* the result. This seems expensive, but it is faster than constantly
				788	* searching through the bio list as we setup the IO in finish_rmw or stripe
				789	* reconstruction.
				790	*
				791	* This must be called before you trust the answers from page_in_rbio
				792	*/
				793	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				794	{
				795	struct bio *bio;
				796	u64 start;
				797	unsigned long stripe_offset;
				798	unsigned long page_index;
				799	struct page *p;
				800	int i;
				801
				802	spin_lock_irq(&rbio->bio_list_lock);
				803	bio_list_for_each(bio, &rbio->bio_list) {
				804	start = (u64)bio->bi_sector << 9;
				805	stripe_offset = start - rbio->raid_map[0];
				806	page_index = stripe_offset >> PAGE_CACHE_SHIFT;
				807
				808	for (i = 0; i < bio->bi_vcnt; i++) {
				809	p = bio->bi_io_vec[i].bv_page;
				810	rbio->bio_pages[page_index + i] = p;
				811	}
				812	}
				813	spin_unlock_irq(&rbio->bio_list_lock);
				814	}
				815
				816	/*
				817	* this is called from one of two situations. We either
				818	* have a full stripe from the higher layers, or we've read all
				819	* the missing bits off disk.
				820	*
				821	* This will calculate the parity and then send down any
				822	* changed blocks.
				823	*/
				824	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				825	{
				826	struct btrfs_bio *bbio = rbio->bbio;
				827	void *pointers[bbio->num_stripes];
				828	int stripe_len = rbio->stripe_len;
				829	int nr_data = rbio->nr_data;
				830	int stripe;
				831	int pagenr;
				832	int p_stripe = -1;
				833	int q_stripe = -1;
				834	struct bio_list bio_list;
				835	struct bio *bio;
				836	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
				837	int ret;
				838
				839	bio_list_init(&bio_list);
				840
				841	if (bbio->num_stripes - rbio->nr_data == 1) {
				842	p_stripe = bbio->num_stripes - 1;
				843	} else if (bbio->num_stripes - rbio->nr_data == 2) {
				844	p_stripe = bbio->num_stripes - 2;
				845	q_stripe = bbio->num_stripes - 1;
				846	} else {
				847	BUG();
				848	}
				849
				850	/* at this point we either have a full stripe,
				851	* or we've read the full stripe from the drive.
				852	* recalculate the parity and write the new results.
				853	*
				854	* We're not allowed to add any new bios to the
				855	* bio list here, anyone else that wants to
				856	* change this stripe needs to do their own rmw.
				857	*/
				858	spin_lock_irq(&rbio->bio_list_lock);
				859	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				860	spin_unlock_irq(&rbio->bio_list_lock);
				861
				862	atomic_set(&rbio->bbio->error, 0);
				863
				864	/*
				865	* now that we've set rmw_locked, run through the
				866	* bio list one last time and map the page pointers
				867	*/
				868	index_rbio_pages(rbio);
				869
				870	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				871	struct page *p;
				872	/* first collect one page from each data stripe */
				873	for (stripe = 0; stripe < nr_data; stripe++) {
				874	p = page_in_rbio(rbio, stripe, pagenr, 0);
				875	pointers[stripe] = kmap(p);
				876	}
				877
				878	/* then add the parity stripe */
				879	p = rbio_pstripe_page(rbio, pagenr);
				880	SetPageUptodate(p);
				881	pointers[stripe++] = kmap(p);
				882
				883	if (q_stripe != -1) {
				884
				885	/*
				886	* raid6, add the qstripe and call the
				887	* library function to fill in our p/q
				888	*/
				889	p = rbio_qstripe_page(rbio, pagenr);
				890	SetPageUptodate(p);
				891	pointers[stripe++] = kmap(p);
				892
				893	raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
				894	pointers);
				895	} else {
				896	/* raid5 */
				897	memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
				898	run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
				899	}
				900
				901
				902	for (stripe = 0; stripe < bbio->num_stripes; stripe++)
				903	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				904	}
				905
				906	/*
				907	* time to start writing. Make bios for everything from the
				908	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				909	* everything else.
				910	*/
				911	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				912	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
				913	struct page *page;
				914	if (stripe < rbio->nr_data) {
				915	page = page_in_rbio(rbio, stripe, pagenr, 1);
				916	if (!page)
				917	continue;
				918	} else {
				919	page = rbio_stripe_page(rbio, stripe, pagenr);
				920	}
				921
				922	ret = rbio_add_io_page(rbio, &bio_list,
				923	page, stripe, pagenr, rbio->stripe_len);
				924	if (ret)
				925	goto cleanup;
				926	}
				927	}
				928
				929	atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
				930	BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
				931
				932	while (1) {
				933	bio = bio_list_pop(&bio_list);
				934	if (!bio)
				935	break;
				936
				937	bio->bi_private = rbio;
				938	bio->bi_end_io = raid_write_end_io;
				939	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				940	submit_bio(WRITE, bio);
				941	}
				942	return;
				943
				944	cleanup:
				945	rbio_orig_end_io(rbio, -EIO, 0);
				946	}
				947
				948	/*
				949	* helper to find the stripe number for a given bio. Used to figure out which
				950	* stripe has failed. This expects the bio to correspond to a physical disk,
				951	* so it looks up based on physical sector numbers.
				952	*/
				953	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				954	struct bio *bio)
				955	{
				956	u64 physical = bio->bi_sector;
				957	u64 stripe_start;
				958	int i;
				959	struct btrfs_bio_stripe *stripe;
				960
				961	physical <<= 9;
				962
				963	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				964	stripe = &rbio->bbio->stripes[i];
				965	stripe_start = stripe->physical;
				966	if (physical >= stripe_start &&
				967	physical < stripe_start + rbio->stripe_len) {
				968	return i;
				969	}
				970	}
				971	return -1;
				972	}
				973
				974	/*
				975	* helper to find the stripe number for a given
				976	* bio (before mapping). Used to figure out which stripe has
				977	* failed. This looks up based on logical block numbers.
				978	*/
				979	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				980	struct bio *bio)
				981	{
				982	u64 logical = bio->bi_sector;
				983	u64 stripe_start;
				984	int i;
				985
				986	logical <<= 9;
				987
				988	for (i = 0; i < rbio->nr_data; i++) {
				989	stripe_start = rbio->raid_map[i];
				990	if (logical >= stripe_start &&
				991	logical < stripe_start + rbio->stripe_len) {
				992	return i;
				993	}
				994	}
				995	return -1;
				996	}
				997
				998	/*
				999	* returns -EIO if we had too many failures
				1000	*/
				1001	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1002	{
				1003	unsigned long flags;
				1004	int ret = 0;
				1005
				1006	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1007
				1008	/* we already know this stripe is bad, move on */
				1009	if (rbio->faila == failed \|\| rbio->failb == failed)
				1010	goto out;
				1011
				1012	if (rbio->faila == -1) {
				1013	/* first failure on this rbio */
				1014	rbio->faila = failed;
				1015	atomic_inc(&rbio->bbio->error);
				1016	} else if (rbio->failb == -1) {
				1017	/* second failure on this rbio */
				1018	rbio->failb = failed;
				1019	atomic_inc(&rbio->bbio->error);
				1020	} else {
				1021	ret = -EIO;
				1022	}
				1023	out:
				1024	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1025
				1026	return ret;
				1027	}
				1028
				1029	/*
				1030	* helper to fail a stripe based on a physical disk
				1031	* bio.
				1032	*/
				1033	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1034	struct bio *bio)
				1035	{
				1036	int failed = find_bio_stripe(rbio, bio);
				1037
				1038	if (failed < 0)
				1039	return -EIO;
				1040
				1041	return fail_rbio_index(rbio, failed);
				1042	}
				1043
				1044	/*
				1045	* this sets each page in the bio uptodate. It should only be used on private
				1046	* rbio pages, nothing that comes in from the higher layers
				1047	*/
				1048	static void set_bio_pages_uptodate(struct bio *bio)
				1049	{
				1050	int i;
				1051	struct page *p;
				1052
				1053	for (i = 0; i < bio->bi_vcnt; i++) {
				1054	p = bio->bi_io_vec[i].bv_page;
				1055	SetPageUptodate(p);
				1056	}
				1057	}
				1058
				1059	/*
				1060	* end io for the read phase of the rmw cycle. All the bios here are physical
				1061	* stripe bios we've read from the disk so we can recalculate the parity of the
				1062	* stripe.
				1063	*
				1064	* This will usually kick off finish_rmw once all the bios are read in, but it
				1065	* may trigger parity reconstruction if we had any errors along the way
				1066	*/
				1067	static void raid_rmw_end_io(struct bio *bio, int err)
				1068	{
				1069	struct btrfs_raid_bio *rbio = bio->bi_private;
				1070
				1071	if (err)
				1072	fail_bio_stripe(rbio, bio);
				1073	else
				1074	set_bio_pages_uptodate(bio);
				1075
				1076	bio_put(bio);
				1077
				1078	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1079	return;
				1080
				1081	err = 0;
				1082	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1083	goto cleanup;
				1084
				1085	/*
				1086	* this will normally call finish_rmw to start our write
				1087	* but if there are any failed stripes we'll reconstruct
				1088	* from parity first
				1089	*/
				1090	validate_rbio_for_rmw(rbio);
				1091	return;
				1092
				1093	cleanup:
				1094
				1095	rbio_orig_end_io(rbio, -EIO, 0);
				1096	}
				1097
				1098	static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
				1099	{
				1100	rbio->work.flags = 0;
				1101	rbio->work.func = rmw_work;
				1102
				1103	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1104	&rbio->work);
				1105	}
				1106
				1107	static void async_read_rebuild(struct btrfs_raid_bio *rbio)
				1108	{
				1109	rbio->work.flags = 0;
				1110	rbio->work.func = read_rebuild_work;
				1111
				1112	btrfs_queue_worker(&rbio->fs_info->rmw_workers,
				1113	&rbio->work);
				1114	}
				1115
				1116	/*
				1117	* the stripe must be locked by the caller. It will
				1118	* unlock after all the writes are done
				1119	*/
				1120	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1121	{
				1122	int bios_to_read = 0;
				1123	struct btrfs_bio *bbio = rbio->bbio;
				1124	struct bio_list bio_list;
				1125	int ret;
				1126	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1127	int pagenr;
				1128	int stripe;
				1129	struct bio *bio;
				1130
				1131	bio_list_init(&bio_list);
				1132
				1133	ret = alloc_rbio_pages(rbio);
				1134	if (ret)
				1135	goto cleanup;
				1136
				1137	index_rbio_pages(rbio);
				1138
				1139	atomic_set(&rbio->bbio->error, 0);
				1140	/*
				1141	* build a list of bios to read all the missing parts of this
				1142	* stripe
				1143	*/
				1144	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
				1145	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1146	struct page *page;
				1147	/*
				1148	* we want to find all the pages missing from
				1149	* the rbio and read them from the disk. If
				1150	* page_in_rbio finds a page in the bio list
				1151	* we don't need to read it off the stripe.
				1152	*/
				1153	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1154	if (page)
				1155	continue;
				1156
				1157	page = rbio_stripe_page(rbio, stripe, pagenr);
				1158	ret = rbio_add_io_page(rbio, &bio_list, page,
				1159	stripe, pagenr, rbio->stripe_len);
				1160	if (ret)
				1161	goto cleanup;
				1162	}
				1163	}
				1164
				1165	bios_to_read = bio_list_size(&bio_list);
				1166	if (!bios_to_read) {
				1167	/*
				1168	* this can happen if others have merged with
				1169	* us, it means there is nothing left to read.
				1170	* But if there are missing devices it may not be
				1171	* safe to do the full stripe write yet.
				1172	*/
				1173	goto finish;
				1174	}
				1175
				1176	/*
				1177	* the bbio may be freed once we submit the last bio. Make sure
				1178	* not to touch it after that
				1179	*/
				1180	atomic_set(&bbio->stripes_pending, bios_to_read);
				1181	while (1) {
				1182	bio = bio_list_pop(&bio_list);
				1183	if (!bio)
				1184	break;
				1185
				1186	bio->bi_private = rbio;
				1187	bio->bi_end_io = raid_rmw_end_io;
				1188
				1189	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1190	BTRFS_WQ_ENDIO_RAID56);
				1191
				1192	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1193	submit_bio(READ, bio);
				1194	}
				1195	/* the actual write will happen once the reads are done */
				1196	return 0;
				1197
				1198	cleanup:
				1199	rbio_orig_end_io(rbio, -EIO, 0);
				1200	return -EIO;
				1201
				1202	finish:
				1203	validate_rbio_for_rmw(rbio);
				1204	return 0;
				1205	}
				1206
				1207	/*
				1208	* if the upper layers pass in a full stripe, we thank them by only allocating
				1209	* enough pages to hold the parity, and sending it all down quickly.
				1210	*/
				1211	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1212	{
				1213	int ret;
				1214
				1215	ret = alloc_rbio_parity_pages(rbio);
				1216	if (ret)
				1217	return ret;
				1218
				1219	ret = lock_stripe_add(rbio);
				1220	if (ret == 0)
				1221	finish_rmw(rbio);
				1222	return 0;
				1223	}
				1224
				1225	/*
				1226	* partial stripe writes get handed over to async helpers.
				1227	* We're really hoping to merge a few more writes into this
				1228	* rbio before calculating new parity
				1229	*/
				1230	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1231	{
				1232	int ret;
				1233
				1234	ret = lock_stripe_add(rbio);
				1235	if (ret == 0)
				1236	async_rmw_stripe(rbio);
				1237	return 0;
				1238	}
				1239
				1240	/*
				1241	* sometimes while we were reading from the drive to
				1242	* recalculate parity, enough new bios come into create
				1243	* a full stripe. So we do a check here to see if we can
				1244	* go directly to finish_rmw
				1245	*/
				1246	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1247	{
				1248	/* head off into rmw land if we don't have a full stripe */
				1249	if (!rbio_is_full(rbio))
				1250	return partial_stripe_write(rbio);
				1251	return full_stripe_write(rbio);
				1252	}
				1253
				1254	/*
				1255	* our main entry point for writes from the rest of the FS.
				1256	*/
				1257	int raid56_parity_write(struct btrfs_root root, struct bio bio,
				1258	struct btrfs_bio bbio, u64 raid_map,
				1259	u64 stripe_len)
				1260	{
				1261	struct btrfs_raid_bio *rbio;
				1262
				1263	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				1264	if (IS_ERR(rbio)) {
				1265	kfree(raid_map);
				1266	kfree(bbio);
				1267	return PTR_ERR(rbio);
				1268	}
				1269	bio_list_add(&rbio->bio_list, bio);
				1270	rbio->bio_list_bytes = bio->bi_size;
				1271	return __raid56_parity_write(rbio);
				1272	}
				1273
				1274	/*
				1275	* all parity reconstruction happens here. We've read in everything
				1276	* we can find from the drives and this does the heavy lifting of
				1277	* sorting the good from the bad.
				1278	*/
				1279	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1280	{
				1281	int pagenr, stripe;
				1282	void **pointers;
				1283	int faila = -1, failb = -1;
				1284	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1285	struct page *page;
				1286	int err;
				1287	int i;
				1288
				1289	pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
				1290	GFP_NOFS);
				1291	if (!pointers) {
				1292	err = -ENOMEM;
				1293	goto cleanup_io;
				1294	}
				1295
				1296	faila = rbio->faila;
				1297	failb = rbio->failb;
				1298
				1299	if (rbio->read_rebuild) {
				1300	spin_lock_irq(&rbio->bio_list_lock);
				1301	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1302	spin_unlock_irq(&rbio->bio_list_lock);
				1303	}
				1304
				1305	index_rbio_pages(rbio);
				1306
				1307	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1308	/* setup our array of pointers with pages
				1309	* from each stripe
				1310	*/
				1311	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1312	/*
				1313	* if we're rebuilding a read, we have to use
				1314	* pages from the bio list
				1315	*/
				1316	if (rbio->read_rebuild &&
				1317	(stripe == faila \|\| stripe == failb)) {
				1318	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1319	} else {
				1320	page = rbio_stripe_page(rbio, stripe, pagenr);
				1321	}
				1322	pointers[stripe] = kmap(page);
				1323	}
				1324
				1325	/* all raid6 handling here */
				1326	if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
				1327	RAID6_Q_STRIPE) {
				1328
				1329	/*
				1330	* single failure, rebuild from parity raid5
				1331	* style
				1332	*/
				1333	if (failb < 0) {
				1334	if (faila == rbio->nr_data) {
				1335	/*
				1336	* Just the P stripe has failed, without
				1337	* a bad data or Q stripe.
				1338	* TODO, we should redo the xor here.
				1339	*/
				1340	err = -EIO;
				1341	goto cleanup;
				1342	}
				1343	/*
				1344	* a single failure in raid6 is rebuilt
				1345	* in the pstripe code below
				1346	*/
				1347	goto pstripe;
				1348	}
				1349
				1350	/* make sure our ps and qs are in order */
				1351	if (faila > failb) {
				1352	int tmp = failb;
				1353	failb = faila;
				1354	faila = tmp;
				1355	}
				1356
				1357	/* if the q stripe is failed, do a pstripe reconstruction
				1358	* from the xors.
				1359	* If both the q stripe and the P stripe are failed, we're
				1360	* here due to a crc mismatch and we can't give them the
				1361	* data they want
				1362	*/
				1363	if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1364	if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
				1365	err = -EIO;
				1366	goto cleanup;
				1367	}
				1368	/*
				1369	* otherwise we have one bad data stripe and
				1370	* a good P stripe. raid5!
				1371	*/
				1372	goto pstripe;
				1373	}
				1374
				1375	if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				1376	raid6_datap_recov(rbio->bbio->num_stripes,
				1377	PAGE_SIZE, faila, pointers);
				1378	} else {
				1379	raid6_2data_recov(rbio->bbio->num_stripes,
				1380	PAGE_SIZE, faila, failb,
				1381	pointers);
				1382	}
				1383	} else {
				1384	void *p;
				1385
				1386	/* rebuild from P stripe here (raid5 or raid6) */
				1387	BUG_ON(failb != -1);
				1388	pstripe:
				1389	/* Copy parity block into failed block to start with */
				1390	memcpy(pointers[faila],
				1391	pointers[rbio->nr_data],
				1392	PAGE_CACHE_SIZE);
				1393
				1394	/* rearrange the pointer array */
				1395	p = pointers[faila];
				1396	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1397	pointers[stripe] = pointers[stripe + 1];
				1398	pointers[rbio->nr_data - 1] = p;
				1399
				1400	/* xor in the rest */
				1401	run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
				1402	}
				1403	/* if we're doing this rebuild as part of an rmw, go through
				1404	* and set all of our private rbio pages in the
				1405	* failed stripes as uptodate. This way finish_rmw will
				1406	* know they can be trusted. If this was a read reconstruction,
				1407	* other endio functions will fiddle the uptodate bits
				1408	*/
				1409	if (!rbio->read_rebuild) {
				1410	for (i = 0; i < nr_pages; i++) {
				1411	if (faila != -1) {
				1412	page = rbio_stripe_page(rbio, faila, i);
				1413	SetPageUptodate(page);
				1414	}
				1415	if (failb != -1) {
				1416	page = rbio_stripe_page(rbio, failb, i);
				1417	SetPageUptodate(page);
				1418	}
				1419	}
				1420	}
				1421	for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
				1422	/*
				1423	* if we're rebuilding a read, we have to use
				1424	* pages from the bio list
				1425	*/
				1426	if (rbio->read_rebuild &&
				1427	(stripe == faila \|\| stripe == failb)) {
				1428	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1429	} else {
				1430	page = rbio_stripe_page(rbio, stripe, pagenr);
				1431	}
				1432	kunmap(page);
				1433	}
				1434	}
				1435
				1436	err = 0;
				1437	cleanup:
				1438	kfree(pointers);
				1439
				1440	cleanup_io:
				1441
				1442	if (rbio->read_rebuild) {
				1443	rbio_orig_end_io(rbio, err, err == 0);
				1444	} else if (err == 0) {
				1445	rbio->faila = -1;
				1446	rbio->failb = -1;
				1447	finish_rmw(rbio);
				1448	} else {
				1449	rbio_orig_end_io(rbio, err, 0);
				1450	}
				1451	}
				1452
				1453	/*
				1454	* This is called only for stripes we've read from disk to
				1455	* reconstruct the parity.
				1456	*/
				1457	static void raid_recover_end_io(struct bio *bio, int err)
				1458	{
				1459	struct btrfs_raid_bio *rbio = bio->bi_private;
				1460
				1461	/*
				1462	* we only read stripe pages off the disk, set them
				1463	* up to date if there were no errors
				1464	*/
				1465	if (err)
				1466	fail_bio_stripe(rbio, bio);
				1467	else
				1468	set_bio_pages_uptodate(bio);
				1469	bio_put(bio);
				1470
				1471	if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
				1472	return;
				1473
				1474	if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
				1475	rbio_orig_end_io(rbio, -EIO, 0);
				1476	else
				1477	__raid_recover_end_io(rbio);
				1478	}
				1479
				1480	/*
				1481	* reads everything we need off the disk to reconstruct
				1482	* the parity. endio handlers trigger final reconstruction
				1483	* when the IO is done.
				1484	*
				1485	* This is used both for reads from the higher layers and for
				1486	* parity construction required to finish a rmw cycle.
				1487	*/
				1488	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				1489	{
				1490	int bios_to_read = 0;
				1491	struct btrfs_bio *bbio = rbio->bbio;
				1492	struct bio_list bio_list;
				1493	int ret;
				1494	int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
				1495	int pagenr;
				1496	int stripe;
				1497	struct bio *bio;
				1498
				1499	bio_list_init(&bio_list);
				1500
				1501	ret = alloc_rbio_pages(rbio);
				1502	if (ret)
				1503	goto cleanup;
				1504
				1505	atomic_set(&rbio->bbio->error, 0);
				1506
				1507	/*
				1508	* read everything that hasn't failed.
				1509	*/
				1510	for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
				1511	if (rbio->faila == stripe \|\|
				1512	rbio->failb == stripe)
				1513	continue;
				1514
				1515	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
				1516	struct page *p;
				1517
				1518	/*
				1519	* the rmw code may have already read this
				1520	* page in
				1521	*/
				1522	p = rbio_stripe_page(rbio, stripe, pagenr);
				1523	if (PageUptodate(p))
				1524	continue;
				1525
				1526	ret = rbio_add_io_page(rbio, &bio_list,
				1527	rbio_stripe_page(rbio, stripe, pagenr),
				1528	stripe, pagenr, rbio->stripe_len);
				1529	if (ret < 0)
				1530	goto cleanup;
				1531	}
				1532	}
				1533
				1534	bios_to_read = bio_list_size(&bio_list);
				1535	if (!bios_to_read) {
				1536	/*
				1537	* we might have no bios to read just because the pages
				1538	* were up to date, or we might have no bios to read because
				1539	* the devices were gone.
				1540	*/
				1541	if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
				1542	__raid_recover_end_io(rbio);
				1543	goto out;
				1544	} else {
				1545	goto cleanup;
				1546	}
				1547	}
				1548
				1549	/*
				1550	* the bbio may be freed once we submit the last bio. Make sure
				1551	* not to touch it after that
				1552	*/
				1553	atomic_set(&bbio->stripes_pending, bios_to_read);
				1554	while (1) {
				1555	bio = bio_list_pop(&bio_list);
				1556	if (!bio)
				1557	break;
				1558
				1559	bio->bi_private = rbio;
				1560	bio->bi_end_io = raid_recover_end_io;
				1561
				1562	btrfs_bio_wq_end_io(rbio->fs_info, bio,
				1563	BTRFS_WQ_ENDIO_RAID56);
				1564
				1565	BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
				1566	submit_bio(READ, bio);
				1567	}
				1568	out:
				1569	return 0;
				1570
				1571	cleanup:
				1572	if (rbio->read_rebuild)
				1573	rbio_orig_end_io(rbio, -EIO, 0);
				1574	return -EIO;
				1575	}
				1576
				1577	/*
				1578	* the main entry point for reads from the higher layers. This
				1579	* is really only called when the normal read path had a failure,
				1580	* so we assume the bio they send down corresponds to a failed part
				1581	* of the drive.
				1582	*/
				1583	int raid56_parity_recover(struct btrfs_root root, struct bio bio,
				1584	struct btrfs_bio bbio, u64 raid_map,
				1585	u64 stripe_len, int mirror_num)
				1586	{
				1587	struct btrfs_raid_bio *rbio;
				1588	int ret;
				1589
				1590	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
				1591	if (IS_ERR(rbio)) {
				1592	return PTR_ERR(rbio);
				1593	}
				1594
				1595	rbio->read_rebuild = 1;
				1596	bio_list_add(&rbio->bio_list, bio);
				1597	rbio->bio_list_bytes = bio->bi_size;
				1598
				1599	rbio->faila = find_logical_bio_stripe(rbio, bio);
				1600	if (rbio->faila == -1) {
				1601	BUG();
				1602	kfree(rbio);
				1603	return -EIO;
				1604	}
				1605
				1606	/*
				1607	* reconstruct from the q stripe if they are
				1608	* asking for mirror 3
				1609	*/
				1610	if (mirror_num == 3)
				1611	rbio->failb = bbio->num_stripes - 2;
				1612
				1613	ret = lock_stripe_add(rbio);
				1614
				1615	/*
				1616	* __raid56_parity_recover will end the bio with
				1617	* any errors it hits. We don't want to return
				1618	* its error value up the stack because our caller
				1619	* will end up calling bio_endio with any nonzero
				1620	* return
				1621	*/
				1622	if (ret == 0)
				1623	__raid56_parity_recover(rbio);
				1624	/*
				1625	* our rbio has been added to the list of
				1626	* rbios that will be handled after the
				1627	* currently lock owner is done
				1628	*/
				1629	return 0;
				1630
				1631	}
				1632
				1633	static void rmw_work(struct btrfs_work *work)
				1634	{
				1635	struct btrfs_raid_bio *rbio;
				1636
				1637	rbio = container_of(work, struct btrfs_raid_bio, work);
				1638	raid56_rmw_stripe(rbio);
				1639	}
				1640
				1641	static void read_rebuild_work(struct btrfs_work *work)
				1642	{
				1643	struct btrfs_raid_bio *rbio;
				1644
				1645	rbio = container_of(work, struct btrfs_raid_bio, work);
				1646	__raid56_parity_recover(rbio);
				1647	}