[PATCH] md: improve locking on 'safemode' and move superblock writes
When md marks the superblock dirty before a write, it calls
generic_make_request (to write the superblock) from within
generic_make_request (to write the first dirty block), which could cause
problems later.
With this patch, the superblock write is always done by the helper thread, and
write requests are delayed until that write completes.
Also, the locking around marking the array dirty and writing the superblock is
improved to avoid possible races.
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c842e34..177d2a7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -218,6 +218,8 @@
INIT_LIST_HEAD(&new->all_mddevs);
init_timer(&new->safemode_timer);
atomic_set(&new->active, 1);
+ bio_list_init(&new->write_list);
+ spin_lock_init(&new->write_lock);
new->queue = blk_alloc_queue(GFP_KERNEL);
if (!new->queue) {
@@ -1251,9 +1253,11 @@
int err, count = 100;
struct list_head *tmp;
mdk_rdev_t *rdev;
+ int sync_req;
- mddev->sb_dirty = 0;
repeat:
+ spin_lock(&mddev->write_lock);
+ sync_req = mddev->in_sync;
mddev->utime = get_seconds();
mddev->events ++;
@@ -1272,8 +1276,12 @@
* do not write anything to disk if using
* nonpersistent superblocks
*/
- if (!mddev->persistent)
+ if (!mddev->persistent) {
+ mddev->sb_dirty = 0;
+ spin_unlock(&mddev->write_lock);
return;
+ }
+ spin_unlock(&mddev->write_lock);
dprintk(KERN_INFO
"md: updating %s RAID superblock on device (in sync %d)\n",
@@ -1304,6 +1312,15 @@
printk(KERN_ERR \
"md: excessive errors occurred during superblock update, exiting\n");
}
+ spin_lock(&mddev->write_lock);
+ if (mddev->in_sync != sync_req) {
+ /* have to write it out again */
+ spin_unlock(&mddev->write_lock);
+ goto repeat;
+ }
+ mddev->sb_dirty = 0;
+ spin_unlock(&mddev->write_lock);
+
}
/*
@@ -3178,19 +3195,31 @@
}
-void md_write_start(mddev_t *mddev)
+/* md_write_start(mddev, bi)
+ * If we need to update some array metadata (e.g. 'active' flag
+ * in superblock) before writing, queue bi for later writing
+ * and return 0, else return 1 and it will be written now
+ */
+int md_write_start(mddev_t *mddev, struct bio *bi)
{
- if (!atomic_read(&mddev->writes_pending)) {
- mddev_lock_uninterruptible(mddev);
- if (mddev->in_sync) {
- mddev->in_sync = 0;
- del_timer(&mddev->safemode_timer);
- md_update_sb(mddev);
- }
- atomic_inc(&mddev->writes_pending);
- mddev_unlock(mddev);
- } else
- atomic_inc(&mddev->writes_pending);
+ if (bio_data_dir(bi) != WRITE)
+ return 1;
+
+ atomic_inc(&mddev->writes_pending);
+ spin_lock(&mddev->write_lock);
+ if (mddev->in_sync == 0 && mddev->sb_dirty == 0) {
+ spin_unlock(&mddev->write_lock);
+ return 1;
+ }
+ bio_list_add(&mddev->write_list, bi);
+
+ if (mddev->in_sync) {
+ mddev->in_sync = 0;
+ mddev->sb_dirty = 1;
+ }
+ spin_unlock(&mddev->write_lock);
+ md_wakeup_thread(mddev->thread);
+ return 0;
}
void md_write_end(mddev_t *mddev)
@@ -3472,6 +3501,7 @@
mddev->sb_dirty ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
+ mddev->write_list.head ||
(mddev->safemode == 1) ||
(mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
&& !mddev->in_sync && mddev->recovery_cp == MaxSector)
@@ -3480,7 +3510,9 @@
if (mddev_trylock(mddev)==0) {
int spares =0;
+ struct bio *blist;
+ spin_lock(&mddev->write_lock);
if (mddev->safemode && !atomic_read(&mddev->writes_pending) &&
!mddev->in_sync && mddev->recovery_cp == MaxSector) {
mddev->in_sync = 1;
@@ -3488,9 +3520,22 @@
}
if (mddev->safemode == 1)
mddev->safemode = 0;
+ blist = bio_list_get(&mddev->write_list);
+ spin_unlock(&mddev->write_lock);
if (mddev->sb_dirty)
md_update_sb(mddev);
+
+ while (blist) {
+ struct bio *b = blist;
+ blist = blist->bi_next;
+ b->bi_next = NULL;
+ generic_make_request(b);
+ /* we already counted this, so need to un-count */
+ md_write_end(mddev);
+ }
+
+
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
/* resync/recovery still happening */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index b34ad56..3f1280b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -530,6 +530,8 @@
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
*/
+ if (md_write_start(mddev, bio)==0)
+ return 0;
spin_lock_irq(&conf->resync_lock);
wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
conf->nr_pending++;
@@ -611,7 +613,7 @@
rcu_read_unlock();
atomic_set(&r1_bio->remaining, 1);
- md_write_start(mddev);
+
for (i = 0; i < disks; i++) {
struct bio *mbio;
if (!r1_bio->bios[i])
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9ae2150..bfc9f52 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -700,6 +700,9 @@
return 0;
}
+ if (md_write_start(mddev, bio) == 0)
+ return 0;
+
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
@@ -774,7 +777,7 @@
rcu_read_unlock();
atomic_set(&r10_bio->remaining, 1);
- md_write_start(mddev);
+
for (i = 0; i < conf->copies; i++) {
struct bio *mbio;
int d = r10_bio->devs[i].devnum;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 63b1c59..677ce49 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1411,6 +1411,9 @@
sector_t logical_sector, last_sector;
struct stripe_head *sh;
+ if (md_write_start(mddev, bi)==0)
+ return 0;
+
if (bio_data_dir(bi)==WRITE) {
disk_stat_inc(mddev->gendisk, writes);
disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1423,8 +1426,7 @@
last_sector = bi->bi_sector + (bi->bi_size>>9);
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
- if ( bio_data_dir(bi) == WRITE )
- md_write_start(mddev);
+
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 9d0e0e4..fede16c 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1570,6 +1570,9 @@
sector_t logical_sector, last_sector;
struct stripe_head *sh;
+ if (md_write_start(mddev, bi)==0)
+ return 0;
+
if (bio_data_dir(bi)==WRITE) {
disk_stat_inc(mddev->gendisk, writes);
disk_stat_add(mddev->gendisk, write_sectors, bio_sectors(bi));
@@ -1583,8 +1586,7 @@
bi->bi_next = NULL;
bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
- if ( bio_data_dir(bi) == WRITE )
- md_write_start(mddev);
+
for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
DEFINE_WAIT(w);