Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm

* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm: (80 commits)
  dm snapshot: use merge origin if snapshot invalid
  dm snapshot: report merge failure in status
  dm snapshot: merge consecutive chunks together
  dm snapshot: trigger exceptions in remaining snapshots during merge
  dm snapshot: delay merging a chunk until writes to it complete
  dm snapshot: queue writes to chunks being merged
  dm snapshot: add merging
  dm snapshot: permit only one merge at once
  dm snapshot: support barriers in snapshot merge target
  dm snapshot: avoid allocating exceptions in merge
  dm snapshot: rework writing to origin
  dm snapshot: add merge target
  dm exception store: add merge specific methods
  dm snapshot: create function for chunk_is_tracked wait
  dm snapshot: make bio optional in __origin_write
  dm mpath: reject messages when device is suspended
  dm: export suspended state to targets
  dm: rename dm_suspended to dm_suspended_md
  dm: swap target postsuspend call and setting suspended flag
  dm crypt: add plain64 iv
  ...
This commit is contained in:
Linus Torvalds 2009-12-15 09:12:01 -08:00
commit 53365383c4
23 changed files with 2361 additions and 901 deletions

View file

@ -8,13 +8,19 @@ the block device which are also writable without interfering with the
original content;
*) To create device "forks", i.e. multiple different versions of the
same data stream.
*) To merge a snapshot of a block device back into the snapshot's origin
device.
In the first two cases, dm copies only the chunks of data that get
changed and uses a separate copy-on-write (COW) block device for
storage.
For snapshot merge the contents of the COW storage are merged back into
the origin device.
In both cases, dm copies only the chunks of data that get changed and
uses a separate copy-on-write (COW) block device for storage.
There are two dm targets available: snapshot and snapshot-origin.
There are three dm targets available:
snapshot, snapshot-origin, and snapshot-merge.
*) snapshot-origin <origin>
@ -40,8 +46,25 @@ The difference is that for transient snapshots less metadata must be
saved on disk - they can be kept in memory by the kernel.
How this is used by LVM2
========================
*) snapshot-merge <origin> <COW device> <persistent> <chunksize>
takes the same table arguments as the snapshot target except it only
works with persistent snapshots. This target assumes the role of the
"snapshot-origin" target and must not be loaded if the "snapshot-origin"
is still present for <origin>.
Creates a merging snapshot that takes control of the changed chunks
stored in the <COW device> of an existing snapshot, through a handover
procedure, and merges these chunks back into the <origin>. Once merging
has started (in the background) the <origin> may be opened and the merge
will continue while I/O is flowing to it. Changes to the <origin> are
deferred until the merging snapshot's corresponding chunk(s) have been
merged. Once merging has started the snapshot device, associated with
the "snapshot" target, will return -EIO when accessed.
How snapshot is used by LVM2
============================
When you create the first LVM2 snapshot of a volume, four dm devices are used:
1) a device containing the original mapping table of the source volume;
@ -72,3 +95,30 @@ brw------- 1 root root 254, 12 29 ago 18:15 /dev/mapper/volumeGroup-snap-cow
brw------- 1 root root 254, 13 29 ago 18:15 /dev/mapper/volumeGroup-snap
brw------- 1 root root 254, 10 29 ago 18:14 /dev/mapper/volumeGroup-base
How snapshot-merge is used by LVM2
==================================
A merging snapshot assumes the role of the "snapshot-origin" while
merging. As such the "snapshot-origin" is replaced with
"snapshot-merge". The "-real" device is not changed and the "-cow"
device is renamed to <origin name>-cow to aid LVM2's cleanup of the
merging snapshot after it completes. The "snapshot" that hands over its
COW device to the "snapshot-merge" is deactivated (unless using lvchange
--refresh); but if it is left active it will simply return I/O errors.
A snapshot will merge into its origin with the following command:
lvconvert --merge volumeGroup/snap
we'll now have this situation:
# dmsetup table|grep volumeGroup
volumeGroup-base-real: 0 2097152 linear 8:19 384
volumeGroup-base-cow: 0 204800 linear 8:19 2097536
volumeGroup-base: 0 2097152 snapshot-merge 254:11 254:12 P 16
# ls -lL /dev/mapper/volumeGroup-*
brw------- 1 root root 254, 11 29 ago 18:15 /dev/mapper/volumeGroup-base-real
brw------- 1 root root 254, 12 29 ago 18:16 /dev/mapper/volumeGroup-base-cow
brw------- 1 root root 254, 10 29 ago 18:16 /dev/mapper/volumeGroup-base

View file

@ -1,7 +1,7 @@
/*
* Copyright (C) 2003 Christophe Saout <christophe@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved.
* Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
*
* This file is released under the GPL.
*/
@ -71,10 +71,21 @@ struct crypt_iv_operations {
int (*ctr)(struct crypt_config *cc, struct dm_target *ti,
const char *opts);
void (*dtr)(struct crypt_config *cc);
const char *(*status)(struct crypt_config *cc);
int (*init)(struct crypt_config *cc);
int (*wipe)(struct crypt_config *cc);
int (*generator)(struct crypt_config *cc, u8 *iv, sector_t sector);
};
/* Per-target state for the ESSIV IV generator. */
struct iv_essiv_private {
struct crypto_cipher *tfm; /* cipher keyed with the salt; encrypts the sector number into the IV */
struct crypto_hash *hash_tfm; /* digest used to derive the salt from the volume key */
u8 *salt; /* digest of the volume key; zeroed by the wipe operation */
};
/* Per-target state for the benbi IV generator. */
struct iv_benbi_private {
int shift; /* left shift applied to the sector number before 1 is added */
};
/*
* Crypt: maps a linear range of a block device
* and encrypts / decrypts at the same time.
@ -102,8 +113,8 @@ struct crypt_config {
struct crypt_iv_operations *iv_gen_ops;
char *iv_mode;
union {
struct crypto_cipher *essiv_tfm;
int benbi_shift;
struct iv_essiv_private essiv;
struct iv_benbi_private benbi;
} iv_gen_private;
sector_t iv_offset;
unsigned int iv_size;
@ -147,6 +158,9 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io);
* plain: the initial vector is the 32-bit little-endian version of the sector
* number, padded with zeros if necessary.
*
* plain64: the initial vector is the 64-bit little-endian version of the sector
* number, padded with zeros if necessary.
*
* essiv: "encrypted sector|salt initial vector", the sector number is
* encrypted with the bulk cipher using a salt as key. The salt
* should be derived from the bulk cipher's key via hashing.
@ -169,88 +183,123 @@ static int crypt_iv_plain_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
return 0;
}
/*
 * plain64 IV generator: clear the whole IV buffer, then store the sector
 * number in its first eight bytes as a 64-bit little-endian value.
 * Always returns 0.
 */
static int crypt_iv_plain64_gen(struct crypt_config *cc, u8 *iv,
				sector_t sector)
{
	u64 *iv_head = (u64 *)iv;

	memset(iv, 0, cc->iv_size);
	*iv_head = cpu_to_le64(sector);

	return 0;
}
/* Initialise ESSIV - compute salt but no local memory allocations */
static int crypt_iv_essiv_init(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
struct hash_desc desc;
struct scatterlist sg;
int err;
sg_init_one(&sg, cc->key, cc->key_size);
desc.tfm = essiv->hash_tfm;
desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_hash_digest(&desc, &sg, cc->key_size, essiv->salt);
if (err)
return err;
return crypto_cipher_setkey(essiv->tfm, essiv->salt,
crypto_hash_digestsize(essiv->hash_tfm));
}
/* Wipe salt and reset key derived from volume key */
static int crypt_iv_essiv_wipe(struct crypt_config *cc)
{
struct iv_essiv_private *essiv = &cc->iv_gen_private.essiv;
unsigned salt_size = crypto_hash_digestsize(essiv->hash_tfm);
memset(essiv->salt, 0, salt_size);
return crypto_cipher_setkey(essiv->tfm, essiv->salt, salt_size);
}
/* Free the ESSIV cipher, hash and salt, then clear the stale pointers. */
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
	struct iv_essiv_private *priv = &cc->iv_gen_private.essiv;

	crypto_free_cipher(priv->tfm);
	crypto_free_hash(priv->hash_tfm);
	kzfree(priv->salt);

	priv->tfm = NULL;
	priv->hash_tfm = NULL;
	priv->salt = NULL;
}
static int crypt_iv_essiv_ctr(struct crypt_config *cc, struct dm_target *ti,
const char *opts)
{
struct crypto_cipher *essiv_tfm;
struct crypto_hash *hash_tfm;
struct hash_desc desc;
struct scatterlist sg;
unsigned int saltsize;
u8 *salt;
struct crypto_cipher *essiv_tfm = NULL;
struct crypto_hash *hash_tfm = NULL;
u8 *salt = NULL;
int err;
if (opts == NULL) {
if (!opts) {
ti->error = "Digest algorithm missing for ESSIV mode";
return -EINVAL;
}
/* Hash the cipher key with the given hash algorithm */
/* Allocate hash algorithm */
hash_tfm = crypto_alloc_hash(opts, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(hash_tfm)) {
ti->error = "Error initializing ESSIV hash";
return PTR_ERR(hash_tfm);
err = PTR_ERR(hash_tfm);
goto bad;
}
saltsize = crypto_hash_digestsize(hash_tfm);
salt = kmalloc(saltsize, GFP_KERNEL);
if (salt == NULL) {
salt = kzalloc(crypto_hash_digestsize(hash_tfm), GFP_KERNEL);
if (!salt) {
ti->error = "Error kmallocing salt storage in ESSIV";
crypto_free_hash(hash_tfm);
return -ENOMEM;
err = -ENOMEM;
goto bad;
}
sg_init_one(&sg, cc->key, cc->key_size);
desc.tfm = hash_tfm;
desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
err = crypto_hash_digest(&desc, &sg, cc->key_size, salt);
crypto_free_hash(hash_tfm);
if (err) {
ti->error = "Error calculating hash in ESSIV";
kfree(salt);
return err;
}
/* Setup the essiv_tfm with the given salt */
/* Allocate essiv_tfm */
essiv_tfm = crypto_alloc_cipher(cc->cipher, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(essiv_tfm)) {
ti->error = "Error allocating crypto tfm for ESSIV";
kfree(salt);
return PTR_ERR(essiv_tfm);
err = PTR_ERR(essiv_tfm);
goto bad;
}
if (crypto_cipher_blocksize(essiv_tfm) !=
crypto_ablkcipher_ivsize(cc->tfm)) {
ti->error = "Block size of ESSIV cipher does "
"not match IV size of block cipher";
crypto_free_cipher(essiv_tfm);
kfree(salt);
return -EINVAL;
err = -EINVAL;
goto bad;
}
err = crypto_cipher_setkey(essiv_tfm, salt, saltsize);
if (err) {
ti->error = "Failed to set key for ESSIV cipher";
crypto_free_cipher(essiv_tfm);
kfree(salt);
return err;
}
kfree(salt);
cc->iv_gen_private.essiv_tfm = essiv_tfm;
cc->iv_gen_private.essiv.salt = salt;
cc->iv_gen_private.essiv.tfm = essiv_tfm;
cc->iv_gen_private.essiv.hash_tfm = hash_tfm;
return 0;
}
static void crypt_iv_essiv_dtr(struct crypt_config *cc)
{
crypto_free_cipher(cc->iv_gen_private.essiv_tfm);
cc->iv_gen_private.essiv_tfm = NULL;
bad:
if (essiv_tfm && !IS_ERR(essiv_tfm))
crypto_free_cipher(essiv_tfm);
if (hash_tfm && !IS_ERR(hash_tfm))
crypto_free_hash(hash_tfm);
kfree(salt);
return err;
}
static int crypt_iv_essiv_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
{
memset(iv, 0, cc->iv_size);
*(u64 *)iv = cpu_to_le64(sector);
crypto_cipher_encrypt_one(cc->iv_gen_private.essiv_tfm, iv, iv);
crypto_cipher_encrypt_one(cc->iv_gen_private.essiv.tfm, iv, iv);
return 0;
}
@ -273,7 +322,7 @@ static int crypt_iv_benbi_ctr(struct crypt_config *cc, struct dm_target *ti,
return -EINVAL;
}
cc->iv_gen_private.benbi_shift = 9 - log;
cc->iv_gen_private.benbi.shift = 9 - log;
return 0;
}
@ -288,7 +337,7 @@ static int crypt_iv_benbi_gen(struct crypt_config *cc, u8 *iv, sector_t sector)
memset(iv, 0, cc->iv_size - sizeof(u64)); /* rest is cleared below */
val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi_shift) + 1);
val = cpu_to_be64(((u64)sector << cc->iv_gen_private.benbi.shift) + 1);
put_unaligned(val, (__be64 *)(iv + cc->iv_size - sizeof(u64)));
return 0;
@ -305,9 +354,15 @@ static struct crypt_iv_operations crypt_iv_plain_ops = {
.generator = crypt_iv_plain_gen
};
/* IV operations for the "plain64" mode: only a generator is provided. */
static struct crypt_iv_operations crypt_iv_plain64_ops = {
.generator = crypt_iv_plain64_gen
};
/*
 * IV operations for the "essiv" mode.  Besides constructor, destructor and
 * generator it supplies init (derives the salt from the key) and wipe
 * (zeroes the salt).
 */
static struct crypt_iv_operations crypt_iv_essiv_ops = {
.ctr = crypt_iv_essiv_ctr,
.dtr = crypt_iv_essiv_dtr,
.init = crypt_iv_essiv_init,
.wipe = crypt_iv_essiv_wipe,
.generator = crypt_iv_essiv_gen
};
@ -934,14 +989,14 @@ static int crypt_set_key(struct crypt_config *cc, char *key)
set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
return 0;
return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
static int crypt_wipe_key(struct crypt_config *cc)
{
clear_bit(DM_CRYPT_KEY_VALID, &cc->flags);
memset(&cc->key, 0, cc->key_size * sizeof(u8));
return 0;
return crypto_ablkcipher_setkey(cc->tfm, cc->key, cc->key_size);
}
/*
@ -983,11 +1038,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
return -ENOMEM;
}
if (crypt_set_key(cc, argv[1])) {
ti->error = "Error decoding key";
goto bad_cipher;
}
/* Compatibility mode for old dm-crypt cipher strings */
if (!chainmode || (strcmp(chainmode, "plain") == 0 && !ivmode)) {
chainmode = "cbc";
@ -1015,6 +1065,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
strcpy(cc->chainmode, chainmode);
cc->tfm = tfm;
if (crypt_set_key(cc, argv[1]) < 0) {
ti->error = "Error decoding and setting key";
goto bad_ivmode;
}
/*
* Choose ivmode. Valid modes: "plain", "plain64", "essiv:<esshash>", "benbi".
* See comments at iv code
@ -1024,6 +1079,8 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops = NULL;
else if (strcmp(ivmode, "plain") == 0)
cc->iv_gen_ops = &crypt_iv_plain_ops;
else if (strcmp(ivmode, "plain64") == 0)
cc->iv_gen_ops = &crypt_iv_plain64_ops;
else if (strcmp(ivmode, "essiv") == 0)
cc->iv_gen_ops = &crypt_iv_essiv_ops;
else if (strcmp(ivmode, "benbi") == 0)
@ -1039,6 +1096,12 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
cc->iv_gen_ops->ctr(cc, ti, ivopts) < 0)
goto bad_ivmode;
if (cc->iv_gen_ops && cc->iv_gen_ops->init &&
cc->iv_gen_ops->init(cc) < 0) {
ti->error = "Error initialising IV";
goto bad_slab_pool;
}
cc->iv_size = crypto_ablkcipher_ivsize(tfm);
if (cc->iv_size)
/* at least a 64 bit sector number should fit in our buffer */
@ -1085,11 +1148,6 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad_bs;
}
if (crypto_ablkcipher_setkey(tfm, cc->key, key_size) < 0) {
ti->error = "Error setting key";
goto bad_device;
}
if (sscanf(argv[2], "%llu", &tmpll) != 1) {
ti->error = "Invalid iv_offset sector";
goto bad_device;
@ -1278,6 +1336,7 @@ static void crypt_resume(struct dm_target *ti)
static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
{
struct crypt_config *cc = ti->private;
int ret = -EINVAL;
if (argc < 2)
goto error;
@ -1287,10 +1346,22 @@ static int crypt_message(struct dm_target *ti, unsigned argc, char **argv)
DMWARN("not suspended during key manipulation.");
return -EINVAL;
}
if (argc == 3 && !strnicmp(argv[1], MESG_STR("set")))
return crypt_set_key(cc, argv[2]);
if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe")))
if (argc == 3 && !strnicmp(argv[1], MESG_STR("set"))) {
ret = crypt_set_key(cc, argv[2]);
if (ret)
return ret;
if (cc->iv_gen_ops && cc->iv_gen_ops->init)
ret = cc->iv_gen_ops->init(cc);
return ret;
}
if (argc == 2 && !strnicmp(argv[1], MESG_STR("wipe"))) {
if (cc->iv_gen_ops && cc->iv_gen_ops->wipe) {
ret = cc->iv_gen_ops->wipe(cc);
if (ret)
return ret;
}
return crypt_wipe_key(cc);
}
}
error:

View file

@ -172,7 +172,8 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}
/* Validate the chunk size against the device block size */
if (chunk_size % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
if (chunk_size %
(bdev_logical_block_size(dm_snap_cow(store->snap)->bdev) >> 9)) {
*error = "Chunk size is not a multiple of device blocksize";
return -EINVAL;
}
@ -190,6 +191,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
}
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_snapshot *snap,
unsigned *args_used,
struct dm_exception_store **store)
{
@ -198,7 +200,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_exception_store *tmp_store;
char persistent;
if (argc < 3) {
if (argc < 2) {
ti->error = "Insufficient exception store arguments";
return -EINVAL;
}
@ -209,14 +211,15 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
return -ENOMEM;
}
persistent = toupper(*argv[1]);
persistent = toupper(*argv[0]);
if (persistent == 'P')
type = get_type("P");
else if (persistent == 'N')
type = get_type("N");
else {
ti->error = "Persistent flag is not P or N";
return -EINVAL;
r = -EINVAL;
goto bad_type;
}
if (!type) {
@ -226,32 +229,23 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
}
tmp_store->type = type;
tmp_store->ti = ti;
tmp_store->snap = snap;
r = dm_get_device(ti, argv[0], 0, 0,
FMODE_READ | FMODE_WRITE, &tmp_store->cow);
if (r) {
ti->error = "Cannot get COW device";
goto bad_cow;
}
r = set_chunk_size(tmp_store, argv[2], &ti->error);
r = set_chunk_size(tmp_store, argv[1], &ti->error);
if (r)
goto bad_ctr;
goto bad;
r = type->ctr(tmp_store, 0, NULL);
if (r) {
ti->error = "Exception store type constructor failed";
goto bad_ctr;
goto bad;
}
*args_used = 3;
*args_used = 2;
*store = tmp_store;
return 0;
bad_ctr:
dm_put_device(ti, tmp_store->cow);
bad_cow:
bad:
put_type(type);
bad_type:
kfree(tmp_store);
@ -262,7 +256,6 @@ EXPORT_SYMBOL(dm_exception_store_create);
void dm_exception_store_destroy(struct dm_exception_store *store)
{
store->type->dtr(store);
dm_put_device(store->ti, store->cow);
put_type(store->type);
kfree(store);
}

View file

@ -26,7 +26,7 @@ typedef sector_t chunk_t;
* of chunks that follow contiguously. Remaining bits hold the number of the
* chunk within the device.
*/
struct dm_snap_exception {
struct dm_exception {
struct list_head hash_list;
chunk_t old_chunk;
@ -64,16 +64,33 @@ struct dm_exception_store_type {
* Find somewhere to store the next exception.
*/
int (*prepare_exception) (struct dm_exception_store *store,
struct dm_snap_exception *e);
struct dm_exception *e);
/*
* Update the metadata with this exception.
*/
void (*commit_exception) (struct dm_exception_store *store,
struct dm_snap_exception *e,
struct dm_exception *e,
void (*callback) (void *, int success),
void *callback_context);
/*
* Returns 0 if the exception store is empty.
*
* If there are exceptions still to be merged, sets
* *last_old_chunk and *last_new_chunk to the most recent
* still-to-be-merged chunk and returns the number of
* consecutive previous ones.
*/
int (*prepare_merge) (struct dm_exception_store *store,
chunk_t *last_old_chunk, chunk_t *last_new_chunk);
/*
* Clear the last n exceptions.
* nr_merged must be <= the value returned by prepare_merge.
*/
int (*commit_merge) (struct dm_exception_store *store, int nr_merged);
/*
* The snapshot is invalid, note this in the metadata.
*/
@ -86,19 +103,19 @@ struct dm_exception_store_type {
/*
* Return how full the snapshot is.
*/
void (*fraction_full) (struct dm_exception_store *store,
sector_t *numerator,
sector_t *denominator);
void (*usage) (struct dm_exception_store *store,
sector_t *total_sectors, sector_t *sectors_allocated,
sector_t *metadata_sectors);
/* For internal device-mapper use only. */
struct list_head list;
};
struct dm_snapshot;
struct dm_exception_store {
struct dm_exception_store_type *type;
struct dm_target *ti;
struct dm_dev *cow;
struct dm_snapshot *snap;
/* Size of data blocks saved - must be a power of 2 */
unsigned chunk_size;
@ -108,6 +125,11 @@ struct dm_exception_store {
void *context;
};
/*
* Obtain the cow device used by a given snapshot.
*/
struct dm_dev *dm_snap_cow(struct dm_snapshot *snap);
/*
* Functions to manipulate consecutive chunks
*/
@ -120,18 +142,25 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
return chunk & (chunk_t)((1ULL << DM_CHUNK_NUMBER_BITS) - 1ULL);
}
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
return e->new_chunk >> DM_CHUNK_NUMBER_BITS;
}
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
e->new_chunk += (1ULL << DM_CHUNK_NUMBER_BITS);
BUG_ON(!dm_consecutive_chunk_count(e));
}
/*
 * Decrease by one the consecutive-chunk count stored in the bits of
 * new_chunk above DM_CHUNK_NUMBER_BITS.  The count must be non-zero.
 */
static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
	const unsigned long long one_count = 1ULL << DM_CHUNK_NUMBER_BITS;

	/* Check before subtracting so an underflow is caught, not produced. */
	BUG_ON(!dm_consecutive_chunk_count(e));

	e->new_chunk -= one_count;
}
# else
# define DM_CHUNK_CONSECUTIVE_BITS 0
@ -140,12 +169,16 @@ static inline chunk_t dm_chunk_number(chunk_t chunk)
return chunk;
}
static inline unsigned dm_consecutive_chunk_count(struct dm_snap_exception *e)
static inline unsigned dm_consecutive_chunk_count(struct dm_exception *e)
{
return 0;
}
static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
static inline void dm_consecutive_chunk_count_inc(struct dm_exception *e)
{
}
/* No-op in this configuration: dm_consecutive_chunk_count() always returns 0 here. */
static inline void dm_consecutive_chunk_count_dec(struct dm_exception *e)
{
}
@ -162,7 +195,7 @@ static inline sector_t get_dev_size(struct block_device *bdev)
static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
sector_t sector)
{
return (sector & ~store->chunk_mask) >> store->chunk_shift;
return sector >> store->chunk_shift;
}
int dm_exception_store_type_register(struct dm_exception_store_type *type);
@ -173,6 +206,7 @@ int dm_exception_store_set_chunk_size(struct dm_exception_store *store,
char **error);
int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
struct dm_snapshot *snap,
unsigned *args_used,
struct dm_exception_store **store);
void dm_exception_store_destroy(struct dm_exception_store *store);

View file

@ -5,6 +5,8 @@
* This file is released under the GPL.
*/
#include "dm.h"
#include <linux/device-mapper.h>
#include <linux/bio.h>
@ -14,12 +16,19 @@
#include <linux/slab.h>
#include <linux/dm-io.h>
#define DM_MSG_PREFIX "io"
#define DM_IO_MAX_REGIONS BITS_PER_LONG
/* Per-client resources used when issuing I/O. */
struct dm_io_client {
mempool_t *pool; /* mempool created in dm_io_client_create() */
struct bio_set *bios; /* bio set that bios are allocated from and freed back to */
};
/* FIXME: can we shrink this ? */
/*
* Aligning 'struct io' reduces the number of bits required to store
* its address. Refer to store_io_and_region_in_bio() below.
*/
struct io {
unsigned long error_bits;
unsigned long eopnotsupp_bits;
@ -28,7 +37,9 @@ struct io {
struct dm_io_client *client;
io_notify_fn callback;
void *context;
};
} __attribute__((aligned(DM_IO_MAX_REGIONS)));
static struct kmem_cache *_dm_io_cache;
/*
* io contexts are only dynamically allocated for asynchronous
@ -53,7 +64,7 @@ struct dm_io_client *dm_io_client_create(unsigned num_pages)
if (!client)
return ERR_PTR(-ENOMEM);
client->pool = mempool_create_kmalloc_pool(ios, sizeof(struct io));
client->pool = mempool_create_slab_pool(ios, _dm_io_cache);
if (!client->pool)
goto bad;
@ -88,18 +99,29 @@ EXPORT_SYMBOL(dm_io_client_destroy);
/*-----------------------------------------------------------------
* We need to keep track of which region a bio is doing io for.
* In order to save a memory allocation we store this the last
* bvec which we know is unused (blech).
* XXX This is ugly and can OOPS with some configs... find another way.
* To avoid a memory allocation to store just 5 or 6 bits, we
* ensure the 'struct io' pointer is aligned so enough low bits are
* always zero and then combine it with the region number directly in
* bi_private.
*---------------------------------------------------------------*/
static inline void bio_set_region(struct bio *bio, unsigned region)
static void store_io_and_region_in_bio(struct bio *bio, struct io *io,
unsigned region)
{
bio->bi_io_vec[bio->bi_max_vecs].bv_len = region;
if (unlikely(!IS_ALIGNED((unsigned long)io, DM_IO_MAX_REGIONS))) {
DMCRIT("Unaligned struct io pointer %p", io);
BUG();
}
bio->bi_private = (void *)((unsigned long)io | region);
}
static inline unsigned bio_get_region(struct bio *bio)
static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
unsigned *region)
{
return bio->bi_io_vec[bio->bi_max_vecs].bv_len;
unsigned long val = (unsigned long)bio->bi_private;
*io = (void *)(val & -(unsigned long)DM_IO_MAX_REGIONS);
*region = val & (DM_IO_MAX_REGIONS - 1);
}
/*-----------------------------------------------------------------
@ -140,10 +162,8 @@ static void endio(struct bio *bio, int error)
/*
* The bio destructor in bio_put() may use the io object.
*/
io = bio->bi_private;
region = bio_get_region(bio);
retrieve_io_and_region_from_bio(bio, &io, &region);
bio->bi_max_vecs++;
bio_put(bio);
dec_count(io, region, error);
@ -243,7 +263,10 @@ static void vm_dp_init(struct dpages *dp, void *data)
static void dm_bio_destructor(struct bio *bio)
{
struct io *io = bio->bi_private;
unsigned region;
struct io *io;
retrieve_io_and_region_from_bio(bio, &io, &region);
bio_free(bio, io->client->bios);
}
@ -286,26 +309,23 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
unsigned num_bvecs;
sector_t remaining = where->count;
while (remaining) {
/*
* where->count may be zero if rw holds a write barrier and we
* need to send a zero-sized barrier.
*/
do {
/*
* Allocate a suitably sized-bio: we add an extra
* bvec for bio_get/set_region() and decrement bi_max_vecs
* to hide it from bio_add_page().
* Allocate a suitably sized-bio.
*/
num_bvecs = dm_sector_div_up(remaining,
(PAGE_SIZE >> SECTOR_SHIFT));
num_bvecs = 1 + min_t(int, bio_get_nr_vecs(where->bdev),
num_bvecs);
if (unlikely(num_bvecs > BIO_MAX_PAGES))
num_bvecs = BIO_MAX_PAGES;
num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs);
bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
bio->bi_sector = where->sector + (where->count - remaining);
bio->bi_bdev = where->bdev;
bio->bi_end_io = endio;
bio->bi_private = io;
bio->bi_destructor = dm_bio_destructor;
bio->bi_max_vecs--;
bio_set_region(bio, region);
store_io_and_region_in_bio(bio, io, region);
/*
* Try and add as many pages as possible.
@ -323,7 +343,7 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
atomic_inc(&io->count);
submit_bio(rw, bio);
}
} while (remaining);
}
static void dispatch_io(int rw, unsigned int num_regions,
@ -333,6 +353,8 @@ static void dispatch_io(int rw, unsigned int num_regions,
int i;
struct dpages old_pages = *dp;
BUG_ON(num_regions > DM_IO_MAX_REGIONS);
if (sync)
rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
@ -342,7 +364,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
*/
for (i = 0; i < num_regions; i++) {
*dp = old_pages;
if (where[i].count)
if (where[i].count || (rw & (1 << BIO_RW_BARRIER)))
do_region(rw, i, where + i, dp, io);
}
@ -357,7 +379,14 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
struct dm_io_region *where, int rw, struct dpages *dp,
unsigned long *error_bits)
{
struct io io;
/*
* gcc <= 4.3 can't do the alignment for stack variables, so we must
* align it on our own.
* volatile prevents the optimizer from removing or reusing
* "io_" field from the stack frame (allowed in ANSI C).
*/
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
@ -365,33 +394,33 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
}
retry:
io.error_bits = 0;
io.eopnotsupp_bits = 0;
atomic_set(&io.count, 1); /* see dispatch_io() */
io.sleeper = current;
io.client = client;
io->error_bits = 0;
io->eopnotsupp_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
io->sleeper = current;
io->client = client;
dispatch_io(rw, num_regions, where, dp, &io, 1);
dispatch_io(rw, num_regions, where, dp, io, 1);
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!atomic_read(&io.count))
if (!atomic_read(&io->count))
break;
io_schedule();
}
set_current_state(TASK_RUNNING);
if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
if (io->eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
rw &= ~(1 << BIO_RW_BARRIER);
goto retry;
}
if (error_bits)
*error_bits = io.error_bits;
*error_bits = io->error_bits;
return io.error_bits ? -EIO : 0;
return io->error_bits ? -EIO : 0;
}
static int async_io(struct dm_io_client *client, unsigned int num_regions,
@ -472,3 +501,18 @@ int dm_io(struct dm_io_request *io_req, unsigned num_regions,
&dp, io_req->notify.fn, io_req->notify.context);
}
EXPORT_SYMBOL(dm_io);
/*
 * Create the slab cache for 'struct io' objects.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init dm_io_init(void)
{
	_dm_io_cache = KMEM_CACHE(io, 0);

	return _dm_io_cache ? 0 : -ENOMEM;
}
/* Destroy the 'struct io' slab cache and clear the pointer to it. */
void dm_io_exit(void)
{
kmem_cache_destroy(_dm_io_cache);
_dm_io_cache = NULL;
}

View file

@ -56,6 +56,11 @@ static void dm_hash_remove_all(int keep_open_devices);
*/
static DECLARE_RWSEM(_hash_lock);
/*
* Protects use of mdptr to obtain hash cell name and uuid from mapped device.
*/
static DEFINE_MUTEX(dm_hash_cells_mutex);
static void init_buckets(struct list_head *buckets)
{
unsigned int i;
@ -206,7 +211,9 @@ static int dm_hash_insert(const char *name, const char *uuid, struct mapped_devi
list_add(&cell->uuid_list, _uuid_buckets + hash_str(uuid));
}
dm_get(md);
mutex_lock(&dm_hash_cells_mutex);
dm_set_mdptr(md, cell);
mutex_unlock(&dm_hash_cells_mutex);
up_write(&_hash_lock);
return 0;
@ -224,9 +231,11 @@ static void __hash_remove(struct hash_cell *hc)
/* remove from the dev hash */
list_del(&hc->uuid_list);
list_del(&hc->name_list);
mutex_lock(&dm_hash_cells_mutex);
dm_set_mdptr(hc->md, NULL);
mutex_unlock(&dm_hash_cells_mutex);
table = dm_get_table(hc->md);
table = dm_get_live_table(hc->md);
if (table) {
dm_table_event(table);
dm_table_put(table);
@ -321,13 +330,15 @@ static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
*/
list_del(&hc->name_list);
old_name = hc->name;
mutex_lock(&dm_hash_cells_mutex);
hc->name = new_name;
mutex_unlock(&dm_hash_cells_mutex);
list_add(&hc->name_list, _name_buckets + hash_str(new_name));
/*
* Wake up any dm event waiters.
*/
table = dm_get_table(hc->md);
table = dm_get_live_table(hc->md);
if (table) {
dm_table_event(table);
dm_table_put(table);
@ -512,8 +523,6 @@ static int list_versions(struct dm_ioctl *param, size_t param_size)
return 0;
}
static int check_name(const char *name)
{
if (strchr(name, '/')) {
@ -524,6 +533,40 @@ static int check_name(const char *name)
return 0;
}
/*
 * Look up the inactive (new_map) table of a mapped device and take a
 * reference on it.  Returns NULL if the device is no longer in the dev
 * hash table or has no inactive table.
 *
 * On successful return, the caller must not attempt to acquire
 * _hash_lock without first calling dm_table_put, because dm_table_destroy
 * waits for this dm_table_put and could be called under this lock.
 */
static struct dm_table *dm_get_inactive_table(struct mapped_device *md)
{
	struct dm_table *inactive = NULL;
	struct hash_cell *cell;

	down_read(&_hash_lock);

	cell = dm_get_mdptr(md);
	if (cell && cell->md == md) {
		inactive = cell->new_map;
		if (inactive)
			dm_table_get(inactive);
	} else {
		DMWARN("device has been removed from the dev hash table.");
	}

	up_read(&_hash_lock);

	return inactive;
}
/*
 * Return the inactive table when the ioctl asked for it via
 * DM_QUERY_INACTIVE_TABLE_FLAG, otherwise the live table.
 */
static struct dm_table *dm_get_live_or_inactive_table(struct mapped_device *md,
						      struct dm_ioctl *param)
{
	if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)
		return dm_get_inactive_table(md);

	return dm_get_live_table(md);
}
/*
* Fills in a dm_ioctl structure, ready for sending back to
* userland.
@ -536,7 +579,7 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
param->flags &= ~(DM_SUSPEND_FLAG | DM_READONLY_FLAG |
DM_ACTIVE_PRESENT_FLAG);
if (dm_suspended(md))
if (dm_suspended_md(md))
param->flags |= DM_SUSPEND_FLAG;
param->dev = huge_encode_dev(disk_devt(disk));
@ -548,18 +591,30 @@ static int __dev_status(struct mapped_device *md, struct dm_ioctl *param)
*/
param->open_count = dm_open_count(md);
if (get_disk_ro(disk))
param->flags |= DM_READONLY_FLAG;
param->event_nr = dm_get_event_nr(md);
param->target_count = 0;
table = dm_get_table(md);
table = dm_get_live_table(md);
if (table) {
param->flags |= DM_ACTIVE_PRESENT_FLAG;
param->target_count = dm_table_get_num_targets(table);
if (!(param->flags & DM_QUERY_INACTIVE_TABLE_FLAG)) {
if (get_disk_ro(disk))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
}
dm_table_put(table);
} else
param->target_count = 0;
param->flags |= DM_ACTIVE_PRESENT_FLAG;
}
if (param->flags & DM_QUERY_INACTIVE_TABLE_FLAG) {
table = dm_get_inactive_table(md);
if (table) {
if (!(dm_table_get_mode(table) & FMODE_WRITE))
param->flags |= DM_READONLY_FLAG;
param->target_count = dm_table_get_num_targets(table);
dm_table_put(table);
}
}
return 0;
}
@ -634,9 +689,9 @@ static struct mapped_device *find_device(struct dm_ioctl *param)
* Sneakily write in both the name and the uuid
* while we have the cell.
*/
strncpy(param->name, hc->name, sizeof(param->name));
strlcpy(param->name, hc->name, sizeof(param->name));
if (hc->uuid)
strncpy(param->uuid, hc->uuid, sizeof(param->uuid)-1);
strlcpy(param->uuid, hc->uuid, sizeof(param->uuid));
else
param->uuid[0] = '\0';
@ -784,7 +839,7 @@ static int do_suspend(struct dm_ioctl *param)
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
if (!dm_suspended_md(md))
r = dm_suspend(md, suspend_flags);
if (!r)
@ -800,7 +855,7 @@ static int do_resume(struct dm_ioctl *param)
unsigned suspend_flags = DM_SUSPEND_LOCKFS_FLAG;
struct hash_cell *hc;
struct mapped_device *md;
struct dm_table *new_map;
struct dm_table *new_map, *old_map = NULL;
down_write(&_hash_lock);
@ -826,14 +881,14 @@ static int do_resume(struct dm_ioctl *param)
suspend_flags &= ~DM_SUSPEND_LOCKFS_FLAG;
if (param->flags & DM_NOFLUSH_FLAG)
suspend_flags |= DM_SUSPEND_NOFLUSH_FLAG;
if (!dm_suspended(md))
if (!dm_suspended_md(md))
dm_suspend(md, suspend_flags);
r = dm_swap_table(md, new_map);
if (r) {
old_map = dm_swap_table(md, new_map);
if (IS_ERR(old_map)) {
dm_table_destroy(new_map);
dm_put(md);
return r;
return PTR_ERR(old_map);
}
if (dm_table_get_mode(new_map) & FMODE_WRITE)
@ -842,9 +897,11 @@ static int do_resume(struct dm_ioctl *param)
set_disk_ro(dm_disk(md), 1);
}
if (dm_suspended(md))
if (dm_suspended_md(md))
r = dm_resume(md);
if (old_map)
dm_table_destroy(old_map);
if (!r) {
dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
@ -982,7 +1039,7 @@ static int dev_wait(struct dm_ioctl *param, size_t param_size)
if (r)
goto out;
table = dm_get_table(md);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
retrieve_status(table, param, param_size);
dm_table_put(table);
@ -1215,7 +1272,7 @@ static int table_deps(struct dm_ioctl *param, size_t param_size)
if (r)
goto out;
table = dm_get_table(md);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
retrieve_deps(table, param, param_size);
dm_table_put(table);
@ -1244,13 +1301,13 @@ static int table_status(struct dm_ioctl *param, size_t param_size)
if (r)
goto out;
table = dm_get_table(md);
table = dm_get_live_or_inactive_table(md, param);
if (table) {
retrieve_status(table, param, param_size);
dm_table_put(table);
}
out:
out:
dm_put(md);
return r;
}
@ -1288,10 +1345,15 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
goto out;
}
table = dm_get_table(md);
table = dm_get_live_table(md);
if (!table)
goto out_argv;
if (dm_deleting_md(md)) {
r = -ENXIO;
goto out_table;
}
ti = dm_table_find_target(table, tmsg->sector);
if (!dm_target_is_valid(ti)) {
DMWARN("Target message sector outside device.");
@ -1303,6 +1365,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
r = -EINVAL;
}
out_table:
dm_table_put(table);
out_argv:
kfree(argv);
@ -1582,8 +1645,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
if (!md)
return -ENXIO;
dm_get(md);
down_read(&_hash_lock);
mutex_lock(&dm_hash_cells_mutex);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
r = -ENXIO;
@ -1596,8 +1658,7 @@ int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid)
strcpy(uuid, hc->uuid ? : "");
out:
up_read(&_hash_lock);
dm_put(md);
mutex_unlock(&dm_hash_cells_mutex);
return r;
}

View file

@ -450,7 +450,10 @@ static void dispatch_job(struct kcopyd_job *job)
{
struct dm_kcopyd_client *kc = job->kc;
atomic_inc(&kc->nr_jobs);
push(&kc->pages_jobs, job);
if (unlikely(!job->source.count))
push(&kc->complete_jobs, job);
else
push(&kc->pages_jobs, job);
wake(kc);
}

View file

@ -145,8 +145,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type)
EXPORT_SYMBOL(dm_dirty_log_type_unregister);
struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
struct dm_target *ti,
unsigned int argc, char **argv)
struct dm_target *ti,
int (*flush_callback_fn)(struct dm_target *ti),
unsigned int argc, char **argv)
{
struct dm_dirty_log_type *type;
struct dm_dirty_log *log;
@ -161,6 +162,7 @@ struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
return NULL;
}
log->flush_callback_fn = flush_callback_fn;
log->type = type;
if (type->ctr(log, ti, argc, argv)) {
kfree(log);
@ -208,7 +210,9 @@ struct log_header {
struct log_c {
struct dm_target *ti;
int touched;
int touched_dirtied;
int touched_cleaned;
int flush_failed;
uint32_t region_size;
unsigned int region_count;
region_t sync_count;
@ -233,6 +237,7 @@ struct log_c {
* Disk log fields
*/
int log_dev_failed;
int log_dev_flush_failed;
struct dm_dev *log_dev;
struct log_header header;
@ -253,14 +258,14 @@ static inline void log_set_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
ext2_set_bit(bit, (unsigned long *) bs);
l->touched = 1;
l->touched_cleaned = 1;
}
static inline void log_clear_bit(struct log_c *l,
uint32_t *bs, unsigned bit)
{
ext2_clear_bit(bit, (unsigned long *) bs);
l->touched = 1;
l->touched_dirtied = 1;
}
/*----------------------------------------------------------------
@ -287,6 +292,19 @@ static int rw_header(struct log_c *lc, int rw)
return dm_io(&lc->io_req, 1, &lc->header_location, NULL);
}
static int flush_header(struct log_c *lc)
{
struct dm_io_region null_location = {
.bdev = lc->header_location.bdev,
.sector = 0,
.count = 0,
};
lc->io_req.bi_rw = WRITE_BARRIER;
return dm_io(&lc->io_req, 1, &null_location, NULL);
}
static int read_header(struct log_c *log)
{
int r;
@ -378,7 +396,9 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
}
lc->ti = ti;
lc->touched = 0;
lc->touched_dirtied = 0;
lc->touched_cleaned = 0;
lc->flush_failed = 0;
lc->region_size = region_size;
lc->region_count = region_count;
lc->sync = sync;
@ -406,6 +426,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
} else {
lc->log_dev = dev;
lc->log_dev_failed = 0;
lc->log_dev_flush_failed = 0;
lc->header_location.bdev = lc->log_dev->bdev;
lc->header_location.sector = 0;
@ -614,6 +635,11 @@ static int disk_resume(struct dm_dirty_log *log)
/* write the new header */
r = rw_header(lc, WRITE);
if (!r) {
r = flush_header(lc);
if (r)
lc->log_dev_flush_failed = 1;
}
if (r) {
DMWARN("%s: Failed to write header on dirty region log device",
lc->log_dev->name);
@ -656,18 +682,40 @@ static int core_flush(struct dm_dirty_log *log)
static int disk_flush(struct dm_dirty_log *log)
{
int r;
struct log_c *lc = (struct log_c *) log->context;
int r, i;
struct log_c *lc = log->context;
/* only write if the log has changed */
if (!lc->touched)
if (!lc->touched_cleaned && !lc->touched_dirtied)
return 0;
if (lc->touched_cleaned && log->flush_callback_fn &&
log->flush_callback_fn(lc->ti)) {
/*
* At this point it is impossible to determine which
* regions are clean and which are dirty (without
* re-reading the log off disk). So mark all of them
* dirty.
*/
lc->flush_failed = 1;
for (i = 0; i < lc->region_count; i++)
log_clear_bit(lc, lc->clean_bits, i);
}
r = rw_header(lc, WRITE);
if (r)
fail_log_device(lc);
else
lc->touched = 0;
else {
if (lc->touched_dirtied) {
r = flush_header(lc);
if (r) {
lc->log_dev_flush_failed = 1;
fail_log_device(lc);
} else
lc->touched_dirtied = 0;
}
lc->touched_cleaned = 0;
}
return r;
}
@ -681,7 +729,8 @@ static void core_mark_region(struct dm_dirty_log *log, region_t region)
static void core_clear_region(struct dm_dirty_log *log, region_t region)
{
struct log_c *lc = (struct log_c *) log->context;
log_set_bit(lc, lc->clean_bits, region);
if (likely(!lc->flush_failed))
log_set_bit(lc, lc->clean_bits, region);
}
static int core_get_resync_work(struct dm_dirty_log *log, region_t *region)
@ -762,7 +811,9 @@ static int disk_status(struct dm_dirty_log *log, status_type_t status,
switch(status) {
case STATUSTYPE_INFO:
DMEMIT("3 %s %s %c", log->type->name, lc->log_dev->name,
lc->log_dev_failed ? 'D' : 'A');
lc->log_dev_flush_failed ? 'F' :
lc->log_dev_failed ? 'D' :
'A');
break;
case STATUSTYPE_TABLE:

View file

@ -93,6 +93,10 @@ struct multipath {
* can resubmit bios on error.
*/
mempool_t *mpio_pool;
struct mutex work_mutex;
unsigned suspended; /* Don't create new I/O internally when set. */
};
/*
@ -198,6 +202,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m->queue_io = 1;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
INIT_WORK(&m->trigger_event, trigger_event);
mutex_init(&m->work_mutex);
m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
if (!m->mpio_pool) {
kfree(m);
@ -885,13 +890,18 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
return r;
}
static void multipath_dtr(struct dm_target *ti)
static void flush_multipath_work(void)
{
struct multipath *m = (struct multipath *) ti->private;
flush_workqueue(kmpath_handlerd);
flush_workqueue(kmultipathd);
flush_scheduled_work();
}
/*
 * Target destructor: flush the multipath work, then free the
 * struct multipath stored in ti->private.
 */
static void multipath_dtr(struct dm_target *ti)
{
	struct multipath *mpath = ti->private;

	/* Flush first, then release the structure. */
	flush_multipath_work();
	free_multipath(mpath);
}
@ -1261,6 +1271,16 @@ static void multipath_presuspend(struct dm_target *ti)
queue_if_no_path(m, 0, 1);
}
/*
 * Post-suspend hook: with work_mutex held, set the target's
 * 'suspended' flag and flush the multipath work.
 */
static void multipath_postsuspend(struct dm_target *ti)
{
	struct multipath *mpath = ti->private;

	mutex_lock(&mpath->work_mutex);

	/* Set the flag before flushing, all under work_mutex. */
	mpath->suspended = 1;
	flush_multipath_work();

	mutex_unlock(&mpath->work_mutex);
}
/*
* Restore the queue_if_no_path setting.
*/
@ -1269,6 +1289,10 @@ static void multipath_resume(struct dm_target *ti)
struct multipath *m = (struct multipath *) ti->private;
unsigned long flags;
mutex_lock(&m->work_mutex);
m->suspended = 0;
mutex_unlock(&m->work_mutex);
spin_lock_irqsave(&m->lock, flags);
m->queue_if_no_path = m->saved_queue_if_no_path;
spin_unlock_irqrestore(&m->lock, flags);
@ -1397,51 +1421,71 @@ static int multipath_status(struct dm_target *ti, status_type_t type,
static int multipath_message(struct dm_target *ti, unsigned argc, char **argv)
{
int r;
int r = -EINVAL;
struct dm_dev *dev;
struct multipath *m = (struct multipath *) ti->private;
action_fn action;
if (argc == 1) {
if (!strnicmp(argv[0], MESG_STR("queue_if_no_path")))
return queue_if_no_path(m, 1, 0);
else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path")))
return queue_if_no_path(m, 0, 0);
mutex_lock(&m->work_mutex);
if (m->suspended) {
r = -EBUSY;
goto out;
}
if (argc != 2)
goto error;
if (dm_suspended(ti)) {
r = -EBUSY;
goto out;
}
if (!strnicmp(argv[0], MESG_STR("disable_group")))
return bypass_pg_num(m, argv[1], 1);
else if (!strnicmp(argv[0], MESG_STR("enable_group")))
return bypass_pg_num(m, argv[1], 0);
else if (!strnicmp(argv[0], MESG_STR("switch_group")))
return switch_pg_num(m, argv[1]);
else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
if (argc == 1) {
if (!strnicmp(argv[0], MESG_STR("queue_if_no_path"))) {
r = queue_if_no_path(m, 1, 0);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("fail_if_no_path"))) {
r = queue_if_no_path(m, 0, 0);
goto out;
}
}
if (argc != 2) {
DMWARN("Unrecognised multipath message received.");
goto out;
}
if (!strnicmp(argv[0], MESG_STR("disable_group"))) {
r = bypass_pg_num(m, argv[1], 1);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("enable_group"))) {
r = bypass_pg_num(m, argv[1], 0);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("switch_group"))) {
r = switch_pg_num(m, argv[1]);
goto out;
} else if (!strnicmp(argv[0], MESG_STR("reinstate_path")))
action = reinstate_path;
else if (!strnicmp(argv[0], MESG_STR("fail_path")))
action = fail_path;
else
goto error;
else {
DMWARN("Unrecognised multipath message received.");
goto out;
}
r = dm_get_device(ti, argv[1], ti->begin, ti->len,
dm_table_get_mode(ti->table), &dev);
if (r) {
DMWARN("message: error getting device %s",
argv[1]);
return -EINVAL;
goto out;
}
r = action_dev(m, dev, action);
dm_put_device(ti, dev);
out:
mutex_unlock(&m->work_mutex);
return r;
error:
DMWARN("Unrecognised multipath message received.");
return -EINVAL;
}
static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
@ -1567,13 +1611,14 @@ out:
*---------------------------------------------------------------*/
static struct target_type multipath_target = {
.name = "multipath",
.version = {1, 1, 0},
.version = {1, 1, 1},
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
.map_rq = multipath_map,
.rq_end_io = multipath_end_io,
.presuspend = multipath_presuspend,
.postsuspend = multipath_postsuspend,
.resume = multipath_resume,
.status = multipath_status,
.message = multipath_message,

View file

@ -35,6 +35,7 @@ static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
*---------------------------------------------------------------*/
enum dm_raid1_error {
DM_RAID1_WRITE_ERROR,
DM_RAID1_FLUSH_ERROR,
DM_RAID1_SYNC_ERROR,
DM_RAID1_READ_ERROR
};
@ -57,6 +58,7 @@ struct mirror_set {
struct bio_list reads;
struct bio_list writes;
struct bio_list failures;
struct bio_list holds; /* bios are waiting until suspend */
struct dm_region_hash *rh;
struct dm_kcopyd_client *kcopyd_client;
@ -67,6 +69,7 @@ struct mirror_set {
region_t nr_regions;
int in_sync;
int log_failure;
int leg_failure;
atomic_t suspend;
atomic_t default_mirror; /* Default mirror */
@ -179,6 +182,17 @@ static void set_default_mirror(struct mirror *m)
atomic_set(&ms->default_mirror, m - m0);
}
/*
 * Return the first mirror in @ms whose error_count is zero,
 * scanning from index 0 upwards, or NULL if every mirror has a
 * non-zero error_count.
 */
static struct mirror *get_valid_mirror(struct mirror_set *ms)
{
	unsigned int idx;

	for (idx = 0; idx < ms->nr_mirrors; idx++) {
		struct mirror *leg = &ms->mirror[idx];

		if (!atomic_read(&leg->error_count))
			return leg;
	}

	return NULL;
}
/* fail_mirror
* @m: mirror device to fail
* @error_type: one of the enum's, DM_RAID1_*_ERROR
@ -198,6 +212,8 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
struct mirror_set *ms = m->ms;
struct mirror *new;
ms->leg_failure = 1;
/*
* error_count is used for nothing more than a
* simple way to tell if a device has encountered
@ -224,19 +240,50 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
goto out;
}
for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
if (!atomic_read(&new->error_count)) {
set_default_mirror(new);
break;
}
if (unlikely(new == ms->mirror + ms->nr_mirrors))
new = get_valid_mirror(ms);
if (new)
set_default_mirror(new);
else
DMWARN("All sides of mirror have failed.");
out:
schedule_work(&ms->trigger_event);
}
/*
 * Submit one zero-length WRITE_BARRIER request to every mirror of the
 * set behind @ti.
 *
 * Each mirror whose bit is set in the resulting error mask is failed
 * with DM_RAID1_FLUSH_ERROR.
 *
 * Returns 0 when the error mask is empty, -EIO otherwise.  The return
 * value of dm_io() itself is not examined.
 */
static int mirror_flush(struct dm_target *ti)
{
	struct mirror_set *ms = ti->private;
	struct dm_io_region io[ms->nr_mirrors];
	struct dm_io_request io_req = {
		.bi_rw = WRITE_BARRIER,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.bvec = NULL,
		.client = ms->io_client,
	};
	unsigned long error_bits;
	unsigned int leg;
	int ret = 0;

	/* One empty region (sector 0, count 0) per mirror device. */
	for (leg = 0; leg < ms->nr_mirrors; leg++) {
		io[leg].bdev = ms->mirror[leg].dev->bdev;
		io[leg].sector = 0;
		io[leg].count = 0;
	}

	/*
	 * Start from all-ones; dm_io() is handed the address of the mask.
	 * NOTE(review): presumably dm_io() overwrites it with the real
	 * per-region error bits - confirm against dm-io.
	 */
	error_bits = -1;
	dm_io(&io_req, ms->nr_mirrors, io, &error_bits);

	if (unlikely(error_bits != 0)) {
		for (leg = 0; leg < ms->nr_mirrors; leg++)
			if (test_bit(leg, &error_bits))
				fail_mirror(&ms->mirror[leg],
					    DM_RAID1_FLUSH_ERROR);
		ret = -EIO;
	}

	return ret;
}
/*-----------------------------------------------------------------
* Recovery.
*
@ -396,6 +443,8 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
*/
static sector_t map_sector(struct mirror *m, struct bio *bio)
{
	/* A bio with no data (bi_size == 0) always maps to sector 0. */
	return unlikely(!bio->bi_size) ?
		0 : m->offset + (bio->bi_sector - m->ms->ti->begin);
}
@ -413,6 +462,27 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
io->count = bio->bi_size >> 9;
}
/*
 * Park @bio on ms->holds, or complete it straight away if the mirror
 * set is already flagged as suspending.
 *
 * When ms->suspend is set the bio is ended with DM_ENDIO_REQUEUE if
 * dm_noflush_suspending() reports true for the target, and with -EIO
 * otherwise.
 */
static void hold_bio(struct mirror_set *ms, struct bio *bio)
{
	if (!atomic_read(&ms->suspend)) {
		/*
		 * Not suspending: keep the bio on the holds list until
		 * the suspend is complete.  The list is guarded by ms->lock.
		 */
		spin_lock_irq(&ms->lock);
		bio_list_add(&ms->holds, bio);
		spin_unlock_irq(&ms->lock);
		return;
	}

	/* Device is suspended: complete the bio instead of holding it. */
	bio_endio(bio, dm_noflush_suspending(ms->ti) ?
		       DM_ENDIO_REQUEUE : -EIO);
}
/*-----------------------------------------------------------------
* Reads
*---------------------------------------------------------------*/
@ -511,7 +581,6 @@ static void write_callback(unsigned long error, void *context)
unsigned i, ret = 0;
struct bio *bio = (struct bio *) context;
struct mirror_set *ms;
int uptodate = 0;
int should_wake = 0;
unsigned long flags;
@ -524,36 +593,27 @@ static void write_callback(unsigned long error, void *context)
* This way we handle both writes to SYNC and NOSYNC
* regions with the same code.
*/
if (likely(!error))
goto out;
if (likely(!error)) {
bio_endio(bio, ret);
return;
}
for (i = 0; i < ms->nr_mirrors; i++)
if (test_bit(i, &error))
fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
else
uptodate = 1;
if (unlikely(!uptodate)) {
DMERR("All replicated volumes dead, failing I/O");
/* None of the writes succeeded, fail the I/O. */
ret = -EIO;
} else if (errors_handled(ms)) {
/*
* Need to raise event. Since raising
* events can block, we need to do it in
* the main thread.
*/
spin_lock_irqsave(&ms->lock, flags);
if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio);
spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
return;
}
out:
bio_endio(bio, ret);
/*
* Need to raise event. Since raising
* events can block, we need to do it in
* the main thread.
*/
spin_lock_irqsave(&ms->lock, flags);
if (!ms->failures.head)
should_wake = 1;
bio_list_add(&ms->failures, bio);
spin_unlock_irqrestore(&ms->lock, flags);
if (should_wake)
wakeup_mirrord(ms);
}
static void do_write(struct mirror_set *ms, struct bio *bio)
@ -562,7 +622,7 @@ static void do_write(struct mirror_set *ms, struct bio *bio)
struct dm_io_region io[ms->nr_mirrors], *dest = io;
struct mirror *m;
struct dm_io_request io_req = {
.bi_rw = WRITE,
.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
.mem.type = DM_IO_BVEC,
.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
.notify.fn = write_callback,
@ -603,6 +663,11 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
bio_list_init(&requeue);
while ((bio = bio_list_pop(writes))) {
if (unlikely(bio_empty_barrier(bio))) {
bio_list_add(&sync, bio);
continue;
}
region = dm_rh_bio_to_region(ms->rh, bio);
if (log->type->is_remote_recovering &&
@ -672,8 +737,12 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
dm_rh_delay(ms->rh, bio);
while ((bio = bio_list_pop(&nosync))) {
map_bio(get_default_mirror(ms), bio);
generic_make_request(bio);
if (unlikely(ms->leg_failure) && errors_handled(ms))
hold_bio(ms, bio);
else {
map_bio(get_default_mirror(ms), bio);
generic_make_request(bio);
}
}
}
@ -681,20 +750,12 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
{
struct bio *bio;
if (!failures->head)
if (likely(!failures->head))
return;
if (!ms->log_failure) {
while ((bio = bio_list_pop(failures))) {
ms->in_sync = 0;
dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
}
return;
}
/*
* If the log has failed, unattempted writes are being
* put on the failures list. We can't issue those writes
* put on the holds list. We can't issue those writes
* until a log has been marked, so we must store them.
*
* If a 'noflush' suspend is in progress, we can requeue
@ -709,23 +770,27 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
* for us to treat them the same and requeue them
* as well.
*/
if (dm_noflush_suspending(ms->ti)) {
while ((bio = bio_list_pop(failures)))
bio_endio(bio, DM_ENDIO_REQUEUE);
return;
}
while ((bio = bio_list_pop(failures))) {
if (!ms->log_failure) {
ms->in_sync = 0;
dm_rh_mark_nosync(ms->rh, bio);
}
if (atomic_read(&ms->suspend)) {
while ((bio = bio_list_pop(failures)))
/*
* If all the legs are dead, fail the I/O.
* If we have been told to handle errors, hold the bio
* and wait for userspace to deal with the problem.
* Otherwise pretend that the I/O succeeded. (This would
* be wrong if the failed leg returned after reboot and
* got replicated back to the good legs.)
*/
if (!get_valid_mirror(ms))
bio_endio(bio, -EIO);
return;
else if (errors_handled(ms))
hold_bio(ms, bio);
else
bio_endio(bio, 0);
}
spin_lock_irq(&ms->lock);
bio_list_merge(&ms->failures, failures);
spin_unlock_irq(&ms->lock);
delayed_wake(ms);
}
static void trigger_event(struct work_struct *work)
@ -784,12 +849,17 @@ static struct mirror_set *alloc_context(unsigned int nr_mirrors,
}
spin_lock_init(&ms->lock);
bio_list_init(&ms->reads);
bio_list_init(&ms->writes);
bio_list_init(&ms->failures);
bio_list_init(&ms->holds);
ms->ti = ti;
ms->nr_mirrors = nr_mirrors;
ms->nr_regions = dm_sector_div_up(ti->len, region_size);
ms->in_sync = 0;
ms->log_failure = 0;
ms->leg_failure = 0;
atomic_set(&ms->suspend, 0);
atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
@ -889,7 +959,8 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
return NULL;
}
dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
dl = dm_dirty_log_create(argv[0], ti, mirror_flush, param_count,
argv + 2);
if (!dl) {
ti->error = "Error creating mirror dirty log";
return NULL;
@ -995,6 +1066,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
ti->private = ms;
ti->split_io = dm_rh_get_region_size(ms->rh);
ti->num_flush_requests = 1;
ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
if (!ms->kmirrord_wq) {
@ -1122,7 +1194,8 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio,
* We need to dec pending if this was a write.
*/
if (rw == WRITE) {
dm_rh_dec(ms->rh, map_context->ll);
if (likely(!bio_empty_barrier(bio)))
dm_rh_dec(ms->rh, map_context->ll);
return error;
}
@ -1180,6 +1253,9 @@ static void mirror_presuspend(struct dm_target *ti)
struct mirror_set *ms = (struct mirror_set *) ti->private;
struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
struct bio_list holds;
struct bio *bio;
atomic_set(&ms->suspend, 1);
/*
@ -1202,6 +1278,22 @@ static void mirror_presuspend(struct dm_target *ti)
* we know that all of our I/O has been pushed.
*/
flush_workqueue(ms->kmirrord_wq);
/*
* Now that ms->suspend is set and the workqueue flushed, no more
* entries can be added to the ms->holds list, so process it.
*
* Bios can still arrive concurrently with or after this
* presuspend function, but they cannot join the hold list
* because ms->suspend is set.
*/
spin_lock_irq(&ms->lock);
holds = ms->holds;
bio_list_init(&ms->holds);
spin_unlock_irq(&ms->lock);
while ((bio = bio_list_pop(&holds)))
hold_bio(ms, bio);
}
static void mirror_postsuspend(struct dm_target *ti)
@ -1244,7 +1336,8 @@ static char device_status_char(struct mirror *m)
if (!atomic_read(&(m->error_count)))
return 'A';
return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
return (test_bit(DM_RAID1_FLUSH_ERROR, &(m->error_type))) ? 'F' :
(test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
(test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
(test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
}

View file

@ -79,6 +79,11 @@ struct dm_region_hash {
struct list_head recovered_regions;
struct list_head failed_recovered_regions;
/*
* If there was a barrier failure no regions can be marked clean.
*/
int barrier_failure;
void *context;
sector_t target_begin;
@ -211,6 +216,7 @@ struct dm_region_hash *dm_region_hash_create(
INIT_LIST_HEAD(&rh->quiesced_regions);
INIT_LIST_HEAD(&rh->recovered_regions);
INIT_LIST_HEAD(&rh->failed_recovered_regions);
rh->barrier_failure = 0;
rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
sizeof(struct dm_region));
@ -377,8 +383,6 @@ static void complete_resync_work(struct dm_region *reg, int success)
/* dm_rh_mark_nosync
* @ms
* @bio
* @done
* @error
*
* The bio was written on some mirror(s) but failed on other mirror(s).
* We can successfully endio the bio but should avoid the region being
@ -386,8 +390,7 @@ static void complete_resync_work(struct dm_region *reg, int success)
*
* This function is _not_ safe in interrupt context!
*/
void dm_rh_mark_nosync(struct dm_region_hash *rh,
struct bio *bio, unsigned done, int error)
void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
{
unsigned long flags;
struct dm_dirty_log *log = rh->log;
@ -395,6 +398,11 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
region_t region = dm_rh_bio_to_region(rh, bio);
int recovering = 0;
if (bio_empty_barrier(bio)) {
rh->barrier_failure = 1;
return;
}
/* We must inform the log that the sync count has changed. */
log->type->set_region_sync(log, region, 0);
@ -419,7 +427,6 @@ void dm_rh_mark_nosync(struct dm_region_hash *rh,
BUG_ON(!list_empty(&reg->list));
spin_unlock_irqrestore(&rh->region_lock, flags);
bio_endio(bio, error);
if (recovering)
complete_resync_work(reg, 0);
}
@ -515,8 +522,11 @@ void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
{
struct bio *bio;
for (bio = bios->head; bio; bio = bio->bi_next)
for (bio = bios->head; bio; bio = bio->bi_next) {
if (bio_empty_barrier(bio))
continue;
rh_inc(rh, dm_rh_bio_to_region(rh, bio));
}
}
EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
@ -544,7 +554,14 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region)
*/
/* do nothing for DM_RH_NOSYNC */
if (reg->state == DM_RH_RECOVERING) {
if (unlikely(rh->barrier_failure)) {
/*
* If a write barrier failed some time ago, we
* don't know whether or not this write made it
* to the disk, so we must resync the device.
*/
reg->state = DM_RH_NOSYNC;
} else if (reg->state == DM_RH_RECOVERING) {
list_add_tail(&reg->list, &rh->quiesced_regions);
} else if (reg->state == DM_RH_DIRTY) {
reg->state = DM_RH_CLEAN;

View file

@ -55,6 +55,8 @@
*/
#define SNAPSHOT_DISK_VERSION 1
#define NUM_SNAPSHOT_HDR_CHUNKS 1
struct disk_header {
uint32_t magic;
@ -120,7 +122,22 @@ struct pstore {
/*
* The next free chunk for an exception.
*
* When creating exceptions, all the chunks here and above are
* free. It holds the next chunk to be allocated. On rare
* occasions (e.g. after a system crash) holes can be left in
* the exception store because chunks can be committed out of
* order.
*
* When merging exceptions, it does not necessarily mean all the
* chunks here and above are free. It holds the value it would
* have held if all chunks had been committed in order of
* allocation. Consequently the value may occasionally be
* slightly too low, but since it's only used for 'status' and
* it can never reach its minimum value too early this doesn't
* matter.
*/
chunk_t next_free;
/*
@ -214,7 +231,7 @@ static int chunk_io(struct pstore *ps, void *area, chunk_t chunk, int rw,
int metadata)
{
struct dm_io_region where = {
.bdev = ps->store->cow->bdev,
.bdev = dm_snap_cow(ps->store->snap)->bdev,
.sector = ps->store->chunk_size * chunk,
.count = ps->store->chunk_size,
};
@ -294,7 +311,8 @@ static int read_header(struct pstore *ps, int *new_snapshot)
*/
if (!ps->store->chunk_size) {
ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
bdev_logical_block_size(ps->store->cow->bdev) >> 9);
bdev_logical_block_size(dm_snap_cow(ps->store->snap)->
bdev) >> 9);
ps->store->chunk_mask = ps->store->chunk_size - 1;
ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
chunk_size_supplied = 0;
@ -408,6 +426,15 @@ static void write_exception(struct pstore *ps,
e->new_chunk = cpu_to_le64(de->new_chunk);
}
/*
 * Zero both chunk fields of the disk_exception that get_exception()
 * returns for @index.
 */
static void clear_exception(struct pstore *ps, uint32_t index)
{
	struct disk_exception *slot = get_exception(ps, index);

	/* An entry with old_chunk == new_chunk == 0 is a cleared entry. */
	slot->old_chunk = 0;
	slot->new_chunk = 0;
}
/*
* Registers the exceptions that are present in the current area.
* 'full' is filled in to indicate if the area has been
@ -489,11 +516,23 @@ static struct pstore *get_info(struct dm_exception_store *store)
return (struct pstore *) store->context;
}
static void persistent_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
static void persistent_usage(struct dm_exception_store *store,
sector_t *total_sectors,
sector_t *sectors_allocated,
sector_t *metadata_sectors)
{
*numerator = get_info(store)->next_free * store->chunk_size;
*denominator = get_dev_size(store->cow->bdev);
struct pstore *ps = get_info(store);
*sectors_allocated = ps->next_free * store->chunk_size;
*total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
/*
* First chunk is the fixed header.
* Then there are (ps->current_area + 1) metadata chunks, each one
* separated from the next by ps->exceptions_per_area data chunks.
*/
*metadata_sectors = (ps->current_area + 1 + NUM_SNAPSHOT_HDR_CHUNKS) *
store->chunk_size;
}
static void persistent_dtr(struct dm_exception_store *store)
@ -552,44 +591,40 @@ static int persistent_read_metadata(struct dm_exception_store *store,
ps->current_area = 0;
zero_memory_area(ps);
r = zero_disk_area(ps, 0);
if (r) {
DMWARN("zero_disk_area(0) failed");
return r;
}
} else {
/*
* Sanity checks.
*/
if (ps->version != SNAPSHOT_DISK_VERSION) {
DMWARN("unable to handle snapshot disk version %d",
ps->version);
return -EINVAL;
}
/*
* Metadata are valid, but snapshot is invalidated
*/
if (!ps->valid)
return 1;
/*
* Read the metadata.
*/
r = read_exceptions(ps, callback, callback_context);
if (r)
return r;
DMWARN("zero_disk_area(0) failed");
return r;
}
/*
* Sanity checks.
*/
if (ps->version != SNAPSHOT_DISK_VERSION) {
DMWARN("unable to handle snapshot disk version %d",
ps->version);
return -EINVAL;
}
return 0;
/*
* Metadata are valid, but snapshot is invalidated
*/
if (!ps->valid)
return 1;
/*
* Read the metadata.
*/
r = read_exceptions(ps, callback, callback_context);
return r;
}
static int persistent_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
struct dm_exception *e)
{
struct pstore *ps = get_info(store);
uint32_t stride;
chunk_t next_free;
sector_t size = get_dev_size(store->cow->bdev);
sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
/* Is there enough room ? */
if (size < ((ps->next_free + 1) * store->chunk_size))
@ -611,7 +646,7 @@ static int persistent_prepare_exception(struct dm_exception_store *store,
}
static void persistent_commit_exception(struct dm_exception_store *store,
struct dm_snap_exception *e,
struct dm_exception *e,
void (*callback) (void *, int success),
void *callback_context)
{
@ -672,6 +707,85 @@ static void persistent_commit_exception(struct dm_exception_store *store,
ps->callback_count = 0;
}
/*
 * Report the most recently committed exception and how many of the
 * exceptions preceding it in the current area form a consecutive run.
 *
 * @last_old_chunk / @last_new_chunk receive the old and new chunk of
 * the last committed exception in the current area.
 *
 * Returns:
 *   0    the current area is empty and it is area 0 (nothing left);
 *        the output parameters are not written in this case,
 *   < 0  the error returned by area_io() while stepping back an area,
 *   > 0  the number of exceptions, counting backwards from the last
 *        one, whose old and new chunks each decrease by exactly one
 *        per step.
 */
static int persistent_prepare_merge(struct dm_exception_store *store,
				    chunk_t *last_old_chunk,
				    chunk_t *last_new_chunk)
{
	struct pstore *ps = get_info(store);
	struct disk_exception de;
	uint32_t newest;
	int run;
	int err;

	if (!ps->current_committed) {
		/* Current area is empty: finished if this is the first area. */
		if (!ps->current_area)
			return 0;

		/* Otherwise step back to the preceding area and load it. */
		ps->current_area--;
		err = area_io(ps, READ);
		if (err < 0)
			return err;

		ps->current_committed = ps->exceptions_per_area;
	}

	/* Index of the last committed exception in this area. */
	newest = ps->current_committed - 1;

	read_exception(ps, newest, &de);
	*last_old_chunk = de.old_chunk;
	*last_new_chunk = de.new_chunk;

	/*
	 * Walk backwards through the area for as long as both chunk
	 * numbers keep decreasing by one per entry.
	 */
	run = 1;
	while (run < ps->current_committed) {
		read_exception(ps, newest - run, &de);
		if (de.old_chunk != *last_old_chunk - run ||
		    de.new_chunk != *last_new_chunk - run)
			break;
		run++;
	}

	return run;
}
/*
 * Drop the last @nr_merged committed exceptions of the current area:
 * clear their entries, write the area out, then lower
 * ps->current_committed and recompute ps->next_free.
 *
 * BUGs if @nr_merged exceeds ps->current_committed.
 * Returns 0 on success, or the negative value returned by area_io();
 * in the failure case the counters are left untouched.
 */
static int persistent_commit_merge(struct dm_exception_store *store,
				   int nr_merged)
{
	struct pstore *ps = get_info(store);
	int cleared;
	int err;

	BUG_ON(nr_merged > ps->current_committed);

	/* Clear entries from the last committed one backwards. */
	for (cleared = 0; cleared < nr_merged; cleared++)
		clear_exception(ps, ps->current_committed - 1 - cleared);

	err = area_io(ps, WRITE);
	if (err < 0)
		return err;

	ps->current_committed -= nr_merged;

	/*
	 * Only persistent_usage() reads ps->next_free at this stage, so
	 * it is not kept strictly accurate: exceptions may originally
	 * have been committed out of order.  While merging it is set to
	 * the value it would have had if every exception had been
	 * committed in order.
	 *
	 * prepare_merge() does not reduce ps->current_area until after
	 * commit_merge() has removed the nr_merged previous exceptions.
	 */
	ps->next_free = (area_location(ps, ps->current_area) - 1) +
			(ps->current_committed + 1) + NUM_SNAPSHOT_HDR_CHUNKS;

	return 0;
}
static void persistent_drop_snapshot(struct dm_exception_store *store)
{
struct pstore *ps = get_info(store);
@ -697,7 +811,7 @@ static int persistent_ctr(struct dm_exception_store *store,
ps->area = NULL;
ps->zero_area = NULL;
ps->header_area = NULL;
ps->next_free = 2; /* skipping the header and first area */
ps->next_free = NUM_SNAPSHOT_HDR_CHUNKS + 1; /* header and 1st area */
ps->current_committed = 0;
ps->callback_count = 0;
@ -726,8 +840,7 @@ static unsigned persistent_status(struct dm_exception_store *store,
case STATUSTYPE_INFO:
break;
case STATUSTYPE_TABLE:
DMEMIT(" %s P %llu", store->cow->name,
(unsigned long long)store->chunk_size);
DMEMIT(" P %llu", (unsigned long long)store->chunk_size);
}
return sz;
@ -741,8 +854,10 @@ static struct dm_exception_store_type _persistent_type = {
.read_metadata = persistent_read_metadata,
.prepare_exception = persistent_prepare_exception,
.commit_exception = persistent_commit_exception,
.prepare_merge = persistent_prepare_merge,
.commit_merge = persistent_commit_merge,
.drop_snapshot = persistent_drop_snapshot,
.fraction_full = persistent_fraction_full,
.usage = persistent_usage,
.status = persistent_status,
};
@ -754,8 +869,10 @@ static struct dm_exception_store_type _persistent_compat_type = {
.read_metadata = persistent_read_metadata,
.prepare_exception = persistent_prepare_exception,
.commit_exception = persistent_commit_exception,
.prepare_merge = persistent_prepare_merge,
.commit_merge = persistent_commit_merge,
.drop_snapshot = persistent_drop_snapshot,
.fraction_full = persistent_fraction_full,
.usage = persistent_usage,
.status = persistent_status,
};

View file

@ -36,10 +36,10 @@ static int transient_read_metadata(struct dm_exception_store *store,
}
static int transient_prepare_exception(struct dm_exception_store *store,
struct dm_snap_exception *e)
struct dm_exception *e)
{
struct transient_c *tc = store->context;
sector_t size = get_dev_size(store->cow->bdev);
sector_t size = get_dev_size(dm_snap_cow(store->snap)->bdev);
if (size < (tc->next_free + store->chunk_size))
return -1;
@ -51,7 +51,7 @@ static int transient_prepare_exception(struct dm_exception_store *store,
}
static void transient_commit_exception(struct dm_exception_store *store,
struct dm_snap_exception *e,
struct dm_exception *e,
void (*callback) (void *, int success),
void *callback_context)
{
@ -59,11 +59,14 @@ static void transient_commit_exception(struct dm_exception_store *store,
callback(callback_context, 1);
}
static void transient_fraction_full(struct dm_exception_store *store,
sector_t *numerator, sector_t *denominator)
static void transient_usage(struct dm_exception_store *store,
sector_t *total_sectors,
sector_t *sectors_allocated,
sector_t *metadata_sectors)
{
*numerator = ((struct transient_c *) store->context)->next_free;
*denominator = get_dev_size(store->cow->bdev);
*sectors_allocated = ((struct transient_c *) store->context)->next_free;
*total_sectors = get_dev_size(dm_snap_cow(store->snap)->bdev);
*metadata_sectors = 0;
}
static int transient_ctr(struct dm_exception_store *store,
@ -91,8 +94,7 @@ static unsigned transient_status(struct dm_exception_store *store,
case STATUSTYPE_INFO:
break;
case STATUSTYPE_TABLE:
DMEMIT(" %s N %llu", store->cow->name,
(unsigned long long)store->chunk_size);
DMEMIT(" N %llu", (unsigned long long)store->chunk_size);
}
return sz;
@ -106,7 +108,7 @@ static struct dm_exception_store_type _transient_type = {
.read_metadata = transient_read_metadata,
.prepare_exception = transient_prepare_exception,
.commit_exception = transient_commit_exception,
.fraction_full = transient_fraction_full,
.usage = transient_usage,
.status = transient_status,
};
@ -118,7 +120,7 @@ static struct dm_exception_store_type _transient_compat_type = {
.read_metadata = transient_read_metadata,
.prepare_exception = transient_prepare_exception,
.commit_exception = transient_commit_exception,
.fraction_full = transient_fraction_full,
.usage = transient_usage,
.status = transient_status,
};

File diff suppressed because it is too large Load diff

View file

@ -59,7 +59,7 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
{
sprintf(buf, "%d\n", dm_suspended(md));
sprintf(buf, "%d\n", dm_suspended_md(md));
return strlen(buf);
}
@ -79,6 +79,13 @@ static struct sysfs_ops dm_sysfs_ops = {
.show = dm_attr_show,
};
/*
 * Release callback for the dm kobject (installed via dm_ktype.release).
 *
 * The sysfs structure is embedded in the md struct, so there is nothing
 * to free here and the body is intentionally empty.
 */
static void dm_sysfs_release(struct kobject *kobj)
{
}
/*
* dm kobject is embedded in mapped_device structure
* so the release function set below has nothing to free
@ -86,6 +93,7 @@ static struct sysfs_ops dm_sysfs_ops = {
static struct kobj_type dm_ktype = {
/* attribute access goes through dm_sysfs_ops */
.sysfs_ops = &dm_sysfs_ops,
.default_attrs = dm_attrs,
/* dm_sysfs_release() has an empty body: it frees nothing */
.release = dm_sysfs_release
};
/*

View file

@ -238,6 +238,9 @@ void dm_table_destroy(struct dm_table *t)
{
unsigned int i;
if (!t)
return;
while (atomic_read(&t->holders))
msleep(1);
smp_mb();

View file

@ -139,14 +139,13 @@ void dm_send_uevents(struct list_head *events, struct kobject *kobj)
list_del_init(&event->elist);
/*
* Need to call dm_copy_name_and_uuid from here for now.
* Context of previous var adds and locking used for
* hash_cell not compatible.
* When a device is being removed this copy fails and we
* discard these unsent events.
*/
if (dm_copy_name_and_uuid(event->md, event->name,
event->uuid)) {
DMERR("%s: dm_copy_name_and_uuid() failed",
__func__);
DMINFO("%s: skipping sending uevent for lost device",
__func__);
goto uevent_free;
}

File diff suppressed because it is too large Load diff

View file

@ -88,6 +88,16 @@ int dm_target_iterate(void (*iter_func)(struct target_type *tt,
int dm_split_args(int *argc, char ***argvp, char *input);
/*
* Is this mapped_device being deleted?
*/
int dm_deleting_md(struct mapped_device *md);
/*
* Is this mapped_device suspended?
*/
int dm_suspended_md(struct mapped_device *md);
/*
* The device-mapper can be driven through one of two interfaces;
* ioctl or filesystem, depending which patch you have applied.
@ -118,6 +128,9 @@ int dm_lock_for_deletion(struct mapped_device *md);
void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
unsigned cookie);
int dm_io_init(void);
void dm_io_exit(void);
int dm_kcopyd_init(void);
void dm_kcopyd_exit(void);

View file

@ -235,7 +235,7 @@ void dm_uevent_add(struct mapped_device *md, struct list_head *elist);
const char *dm_device_name(struct mapped_device *md);
int dm_copy_name_and_uuid(struct mapped_device *md, char *name, char *uuid);
struct gendisk *dm_disk(struct mapped_device *md);
int dm_suspended(struct mapped_device *md);
int dm_suspended(struct dm_target *ti);
int dm_noflush_suspending(struct dm_target *ti);
union map_info *dm_get_mapinfo(struct bio *bio);
union map_info *dm_get_rq_mapinfo(struct request *rq);
@ -276,7 +276,7 @@ void dm_table_unplug_all(struct dm_table *t);
/*
* Table reference counting.
*/
struct dm_table *dm_get_table(struct mapped_device *md);
struct dm_table *dm_get_live_table(struct mapped_device *md);
void dm_table_get(struct dm_table *t);
void dm_table_put(struct dm_table *t);
@ -295,8 +295,10 @@ void dm_table_event(struct dm_table *t);
/*
* The device must be suspended before calling this method.
* Returns the previous table, which the caller must destroy.
*/
int dm_swap_table(struct mapped_device *md, struct dm_table *t);
struct dm_table *dm_swap_table(struct mapped_device *md,
struct dm_table *t);
/*
* A wrapper around vmalloc.

View file

@ -21,6 +21,7 @@ struct dm_dirty_log_type;
/* One dirty log instance. */
struct dm_dirty_log {
/* type descriptor of this log */
struct dm_dirty_log_type *type;
/*
 * Callback taking a dm_target and returning int.
 * NOTE(review): presumably the flush_callback_fn handed to
 * dm_dirty_log_create() - confirm where it is stored and invoked.
 */
int (*flush_callback_fn)(struct dm_target *ti);
/* untyped pointer; NOTE(review): presumably private data of the log type - confirm */
void *context;
};
@ -136,8 +137,9 @@ int dm_dirty_log_type_unregister(struct dm_dirty_log_type *type);
* type->constructor/destructor() directly.
*/
struct dm_dirty_log *dm_dirty_log_create(const char *type_name,
struct dm_target *ti,
unsigned argc, char **argv);
struct dm_target *ti,
int (*flush_callback_fn)(struct dm_target *ti),
unsigned argc, char **argv);
void dm_dirty_log_destroy(struct dm_dirty_log *log);
#endif /* __KERNEL__ */

View file

@ -1,6 +1,6 @@
/*
* Copyright (C) 2001 - 2003 Sistina Software (UK) Limited.
* Copyright (C) 2004 - 2005 Red Hat, Inc. All rights reserved.
* Copyright (C) 2004 - 2009 Red Hat, Inc. All rights reserved.
*
* This file is released under the LGPL.
*/
@ -266,9 +266,9 @@ enum {
#define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
#define DM_VERSION_MAJOR 4
#define DM_VERSION_MINOR 15
#define DM_VERSION_MINOR 16
#define DM_VERSION_PATCHLEVEL 0
#define DM_VERSION_EXTRA "-ioctl (2009-04-01)"
#define DM_VERSION_EXTRA "-ioctl (2009-11-05)"
/* Status bits */
#define DM_READONLY_FLAG (1 << 0) /* In/Out */
@ -309,4 +309,11 @@ enum {
*/
#define DM_NOFLUSH_FLAG (1 << 11) /* In */
/*
* If set, any table information returned will relate to the inactive
* table instead of the live one. Always check DM_INACTIVE_PRESENT_FLAG
* is set before using the data returned.
*/
#define DM_QUERY_INACTIVE_TABLE_FLAG (1 << 12) /* In */
#endif /* _LINUX_DM_IOCTL_H */

View file

@ -78,8 +78,7 @@ void dm_rh_dec(struct dm_region_hash *rh, region_t region);
/* Delay bios on regions. */
void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio);
void dm_rh_mark_nosync(struct dm_region_hash *rh,
struct bio *bio, unsigned done, int error);
void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio);
/*
* Region recovery control.