diff --git a/drivers/md/md.c b/drivers/md/md.c index 5b98bea4ff9..103f2d33fa8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -359,6 +359,7 @@ static mddev_t * mddev_find(dev_t unit) else new->md_minor = MINOR(unit) >> MdpMinorShift; + mutex_init(&new->open_mutex); mutex_init(&new->reconfig_mutex); INIT_LIST_HEAD(&new->disks); INIT_LIST_HEAD(&new->all_mddevs); @@ -1974,17 +1975,14 @@ repeat: /* otherwise we have to go forward and ... */ mddev->events ++; if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ - /* .. if the array isn't clean, insist on an odd 'events' */ - if ((mddev->events&1)==0) { - mddev->events++; + /* .. if the array isn't clean, an 'even' event must also go + * to spares. */ + if ((mddev->events&1)==0) nospares = 0; - } } else { - /* otherwise insist on an even 'events' (for clean states) */ - if ((mddev->events&1)) { - mddev->events++; + /* otherwise an 'odd' event must go to spares */ + if ((mddev->events&1)) nospares = 0; - } } } @@ -3601,6 +3599,7 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len) if (max < mddev->resync_min) return -EINVAL; if (max < mddev->resync_max && + mddev->ro == 0 && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; @@ -4304,12 +4303,11 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) struct gendisk *disk = mddev->gendisk; mdk_rdev_t *rdev; + mutex_lock(&mddev->open_mutex); if (atomic_read(&mddev->openers) > is_open) { printk("md: %s still in use.\n",mdname(mddev)); - return -EBUSY; - } - - if (mddev->pers) { + err = -EBUSY; + } else if (mddev->pers) { if (mddev->sync_thread) { set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -4367,7 +4365,10 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) set_disk_ro(disk, 1); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); } - +out: + mutex_unlock(&mddev->open_mutex); + if (err) + return err; /* * Free resources if final stop */ @@ -4433,7 +4434,6 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) blk_integrity_unregister(disk); md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); -out: return err; } @@ -5518,12 +5518,12 @@ static int md_open(struct block_device *bdev, fmode_t mode) } BUG_ON(mddev != bdev->bd_disk->private_data); - if ((err = mutex_lock_interruptible_nested(&mddev->reconfig_mutex, 1))) + if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; err = 0; atomic_inc(&mddev->openers); - mddev_unlock(mddev); + mutex_unlock(&mddev->open_mutex); check_disk_change(bdev); out: diff --git a/drivers/md/md.h b/drivers/md/md.h index 78f03168baf..f8fc188bc76 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -223,6 +223,16 @@ struct mddev_s * so we don't loop trying */ int in_sync; /* know to not need resync */ + /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so + * that we are never stopping an array while it is open. + * 'reconfig_mutex' protects all other reconfiguration. + * These locks are separate due to conflicting interactions + * with bdev->bd_mutex. + * Lock ordering is: + * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk + * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open + */ + struct mutex open_mutex; struct mutex reconfig_mutex; atomic_t active; /* general refcount */ atomic_t openers; /* number of active opens */ diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2b521ee67df..b8a2c5dc67b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3785,7 +3785,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped conf->reshape_progress < raid5_size(mddev, 0, 0)) { sector_nr = raid5_size(mddev, 0, 0) - conf->reshape_progress; - } else if (mddev->delta_disks > 0 && + } else if (mddev->delta_disks >= 0 && conf->reshape_progress > 0) sector_nr = conf->reshape_progress; sector_div(sector_nr, new_data_disks); @@ -4509,7 +4509,26 @@ static int run(mddev_t *mddev) (old_disks-max_degraded)); /* here_old is the first stripe that we might need to read * from */ - if (here_new >= here_old) { + if (mddev->delta_disks == 0) { + /* We cannot be sure it is safe to start an in-place + * reshape. It is only safe if user-space if monitoring + * and taking constant backups. + * mdadm always starts a situation like this in + * readonly mode so it can take control before + * allowing any writes. So just check for that. + */ + if ((here_new * mddev->new_chunk_sectors != + here_old * mddev->chunk_sectors) || + mddev->ro == 0) { + printk(KERN_ERR "raid5: in-place reshape must be started" + " in read-only mode - aborting\n"); + return -EINVAL; + } + } else if (mddev->delta_disks < 0 + ? (here_new * mddev->new_chunk_sectors <= + here_old * mddev->chunk_sectors) + : (here_new * mddev->new_chunk_sectors >= + here_old * mddev->chunk_sectors)) { /* Reading from the same stripe as writing to - bad */ printk(KERN_ERR "raid5: reshape_position too early for " "auto-recovery - aborting.\n"); @@ -5078,8 +5097,15 @@ static void raid5_finish_reshape(mddev_t *mddev) mddev->degraded--; for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; - d++) - raid5_remove_disk(mddev, d); + d++) { + mdk_rdev_t *rdev = conf->disks[d].rdev; + if (rdev && raid5_remove_disk(mddev, d) == 0) { + char nm[20]; + sprintf(nm, "rd%d", rdev->raid_disk); + sysfs_remove_link(&mddev->kobj, nm); + rdev->raid_disk = -1; + } + } } mddev->layout = conf->algorithm; mddev->chunk_sectors = conf->chunk_sectors;