From 1122a26f2abe4245ccdaed95ec23f63fe086b332 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:52:12 +0200 Subject: [PATCH 001/113] block: use normal I/O path for discard requests prepare_discard_fn() was being called in a place where memory allocation was effectively impossible. This makes it inappropriate for all but the most trivial translations of Linux's DISCARD operation to the block command set. Additionally adding a payload there makes the ownership of the bio backing unclear as it's now allocated by the device driver and not the submitter as usual. It is replaced with QUEUE_FLAG_DISCARD which is used to indicate whether the queue supports discard operations or not. blkdev_issue_discard now allocates a one-page, sector-length payload which is the right thing for the common ATA and SCSI implementations. The mtd implementation of prepare_discard_fn() is replaced with simply checking for the request being a discard. Largely based on a previous patch from Matthew Wilcox which did the prepare_discard_fn but not the different payload allocation yet. Signed-off-by: Christoph Hellwig --- block/blk-barrier.c | 35 ++++++++++++++++++++++++++++++----- block/blk-core.c | 3 +-- block/blk-settings.c | 17 ----------------- drivers/mtd/mtd_blkdevs.c | 19 +++++-------------- drivers/staging/dst/dcore.c | 2 +- include/linux/blkdev.h | 6 ++---- 6 files changed, 39 insertions(+), 43 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 6593ab39cfe..21f5025c394 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -350,6 +350,7 @@ static void blkdev_discard_end_io(struct bio *bio, int err) if (bio->bi_private) complete(bio->bi_private); + __free_page(bio_page(bio)); bio_put(bio); } @@ -372,26 +373,44 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, struct request_queue *q = bdev_get_queue(bdev); int type = flags & DISCARD_FL_BARRIER ? DISCARD_BARRIER : DISCARD_NOBARRIER; + struct bio *bio; + struct page *page; int ret = 0; if (!q) return -ENXIO; - if (!q->prepare_discard_fn) + if (!blk_queue_discard(q)) return -EOPNOTSUPP; while (nr_sects && !ret) { - struct bio *bio = bio_alloc(gfp_mask, 0); - if (!bio) - return -ENOMEM; + unsigned int sector_size = q->limits.logical_block_size; + bio = bio_alloc(gfp_mask, 1); + if (!bio) + goto out; + bio->bi_sector = sector; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; if (flags & DISCARD_FL_WAIT) bio->bi_private = &wait; - bio->bi_sector = sector; + /* + * Add a zeroed one-sector payload as that's what + * our current implementations need. If we'll ever need + * more the interface will need revisiting. + */ + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + goto out_free_bio; + if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) + goto out_free_page; + /* + * And override the bio size - the way discard works we + * touch many more blocks on disk than the actual payload + * length. + */ if (nr_sects > queue_max_hw_sectors(q)) { bio->bi_size = queue_max_hw_sectors(q) << 9; nr_sects -= queue_max_hw_sectors(q); @@ -414,5 +433,11 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio_put(bio); } return ret; +out_free_page: + __free_page(page); +out_free_bio: + bio_put(bio); +out: + return -ENOMEM; } EXPORT_SYMBOL(blkdev_issue_discard); diff --git a/block/blk-core.c b/block/blk-core.c index 8135228e4b2..80a020dd158 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1124,7 +1124,6 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_DISCARD; if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_SOFTBARRIER; - req->q->prepare_discard_fn(req->q, req); } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) req->cmd_flags |= REQ_HARDBARRIER; @@ -1470,7 +1469,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (bio_rw_flagged(bio, BIO_RW_DISCARD) && - !q->prepare_discard_fn) { + !blk_queue_discard(q)) { err = -EOPNOTSUPP; goto end_io; } diff --git a/block/blk-settings.c b/block/blk-settings.c index eaf122ff5f1..d29498ef1eb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -33,23 +33,6 @@ void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) } EXPORT_SYMBOL(blk_queue_prep_rq); -/** - * blk_queue_set_discard - set a discard_sectors function for queue - * @q: queue - * @dfn: prepare_discard function - * - * It's possible for a queue to register a discard callback which is used - * to transform a discard request into the appropriate type for the - * hardware. If none is registered, then discard requests are failed - * with %EOPNOTSUPP. - * - */ -void blk_queue_set_discard(struct request_queue *q, prepare_discard_fn *dfn) -{ - q->prepare_discard_fn = dfn; -} -EXPORT_SYMBOL(blk_queue_set_discard); - /** * blk_queue_merge_bvec - set a merge_bvec function for queue * @q: queue diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 0acbf4f5be5..8ca17a3e96e 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -32,14 +32,6 @@ struct mtd_blkcore_priv { spinlock_t queue_lock; }; -static int blktrans_discard_request(struct request_queue *q, - struct request *req) -{ - req->cmd_type = REQ_TYPE_LINUX_BLOCK; - req->cmd[0] = REQ_LB_OP_DISCARD; - return 0; -} - static int do_blktrans_request(struct mtd_blktrans_ops *tr, struct mtd_blktrans_dev *dev, struct request *req) @@ -52,10 +44,6 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, buf = req->buffer; - if (req->cmd_type == REQ_TYPE_LINUX_BLOCK && - req->cmd[0] == REQ_LB_OP_DISCARD) - return tr->discard(dev, block, nsect); - if (!blk_fs_request(req)) return -EIO; @@ -63,6 +51,9 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, get_capacity(req->rq_disk)) return -EIO; + if (blk_discard_rq(req)) + return tr->discard(dev, block, nsect); + switch(rq_data_dir(req)) { case READ: for (; nsect > 0; nsect--, block++, buf += tr->blksize) @@ -380,8 +371,8 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr) tr->blkcore_priv->rq->queuedata = tr; blk_queue_logical_block_size(tr->blkcore_priv->rq, tr->blksize); if (tr->discard) - blk_queue_set_discard(tr->blkcore_priv->rq, - blktrans_discard_request); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, + tr->blkcore_priv->rq); tr->blkshift = ffs(tr->blksize) - 1; diff --git a/drivers/staging/dst/dcore.c b/drivers/staging/dst/dcore.c index ac8577358ba..5e8db067758 100644 --- a/drivers/staging/dst/dcore.c +++ b/drivers/staging/dst/dcore.c @@ -102,7 +102,7 @@ static int dst_request(struct request_queue *q, struct bio *bio) struct dst_node *n = q->queuedata; int err = -EIO; - if (bio_empty_barrier(bio) && !q->prepare_discard_fn) { + if (bio_empty_barrier(bio) && !blk_queue_discard(q)) { /* * This is a dirty^Wnice hack, but if we complete this * operation with -EOPNOTSUPP like intended, XFS diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e23a86cae5a..f62d45e8761 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -82,7 +82,6 @@ enum rq_cmd_type_bits { enum { REQ_LB_OP_EJECT = 0x40, /* eject request */ REQ_LB_OP_FLUSH = 0x41, /* flush request */ - REQ_LB_OP_DISCARD = 0x42, /* discard sectors */ }; /* @@ -261,7 +260,6 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); -typedef int (prepare_discard_fn) (struct request_queue *, struct request *); struct bio_vec; struct bvec_merge_data { @@ -340,7 +338,6 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; - prepare_discard_fn *prepare_discard_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; @@ -460,6 +457,7 @@ struct request_queue #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ #define QUEUE_FLAG_CQ 16 /* hardware does queuing */ +#define QUEUE_FLAG_DISCARD 17 /* supports DISCARD */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_CLUSTER) | \ @@ -591,6 +589,7 @@ enum { #define blk_queue_flushing(q) ((q)->ordseq) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) +#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) @@ -955,7 +954,6 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); -extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *); extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); From ca80650cfbde5b17a5fa957a261c7973f84599a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:54:20 +0200 Subject: [PATCH 002/113] block: allow large discard requests Currently we set the bio size to the byte equivalent of the blocks to be trimmed when submitting the initial DISCARD ioctl. That means it is subject to the max_hw_sectors limitation of the HBA which is much lower than the size of a DISCARD request we can support. Add a separate max_discard_sectors tunable to limit the size for discard requests. We limit the max discard request size in bytes to 32bit as that is the limit for bio->bi_size. This could be much larger if we had a way to pass that information through the block layer. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-barrier.c | 10 ++++++---- block/blk-core.c | 3 ++- block/blk-settings.c | 13 +++++++++++++ include/linux/blkdev.h | 3 +++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 21f5025c394..8873b9b439f 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -385,6 +385,8 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, while (nr_sects && !ret) { unsigned int sector_size = q->limits.logical_block_size; + unsigned int max_discard_sectors = + min(q->limits.max_discard_sectors, UINT_MAX >> 9); bio = bio_alloc(gfp_mask, 1); if (!bio) @@ -411,10 +413,10 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, * touch many more blocks on disk than the actual payload * length. */ - if (nr_sects > queue_max_hw_sectors(q)) { - bio->bi_size = queue_max_hw_sectors(q) << 9; - nr_sects -= queue_max_hw_sectors(q); - sector += queue_max_hw_sectors(q); + if (nr_sects > max_discard_sectors) { + bio->bi_size = max_discard_sectors << 9; + nr_sects -= max_discard_sectors; + sector += max_discard_sectors; } else { bio->bi_size = nr_sects << 9; nr_sects = 0; diff --git a/block/blk-core.c b/block/blk-core.c index 80a020dd158..34504f30972 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1436,7 +1436,8 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - if (unlikely(nr_sectors > queue_max_hw_sectors(q))) { + if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) && + nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), diff --git a/block/blk-settings.c b/block/blk-settings.c index d29498ef1eb..e0695bca702 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,6 +96,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segment_size = MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; + lim->max_discard_sectors = SAFE_MAX_SECTORS; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -238,6 +239,18 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) } EXPORT_SYMBOL(blk_queue_max_hw_sectors); +/** + * blk_queue_max_discard_sectors - set max sectors for a single discard + * @q: the request queue for the device + * @max_discard: maximum number of sectors to discard + **/ +void blk_queue_max_discard_sectors(struct request_queue *q, + unsigned int max_discard_sectors) +{ + q->limits.max_discard_sectors = max_discard_sectors; +} +EXPORT_SYMBOL(blk_queue_max_discard_sectors); + /** * blk_queue_max_phys_segments - set max phys segments for a request for this queue * @q: the request queue for the device diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f62d45e8761..1a03b715dfa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -311,6 +311,7 @@ struct queue_limits { unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; + unsigned int max_discard_sectors; unsigned short logical_block_size; unsigned short max_hw_segments; @@ -928,6 +929,8 @@ extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); +extern void blk_queue_max_discard_sectors(struct request_queue *q, + unsigned int max_discard_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); extern void blk_queue_alignment_offset(struct request_queue *q, From 1a35e0f6443f4266dad4c569c55c57a9032596fa Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Thu, 1 Oct 2009 21:16:13 +0200 Subject: [PATCH 003/113] Add a tracepoint for block request remapping Since 2.6.31 now has request-based device-mapper, it's useful to have a tracepoint for request-remapping as well as bio-remapping. This patch adds a tracepoint for request-remapping, trace_block_rq_remap(). Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Cc: Li Zefan Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + include/linux/blktrace_api.h | 2 +- include/trace/events/block.h | 33 +++++++++++++++++++++++++++++++++ kernel/trace/blktrace.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 34504f30972..ddaaea4fdff 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include "blk.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); static int __make_request(struct request_queue *q, struct bio *bio); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 622939a2329..3b73b9992b2 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -212,7 +212,7 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) -# define blk_trace_remove_sysfs(struct device *dev) do { } while (0) +# define blk_trace_remove_sysfs(dev) do { } while (0) static inline int blk_trace_init_sysfs(struct device *dev) { return 0; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d86af94691c..00405b5f624 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -488,6 +488,39 @@ TRACE_EVENT(block_remap, (unsigned long long)__entry->old_sector) ); +TRACE_EVENT(block_rq_remap, + + TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, + sector_t from), + + TP_ARGS(q, rq, dev, from), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( dev_t, old_dev ) + __field( sector_t, old_sector ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = disk_devt(rq->rq_disk); + __entry->sector = blk_rq_pos(rq); + __entry->nr_sector = blk_rq_sectors(rq); + __entry->old_dev = dev; + __entry->old_sector = from; + blk_fill_rwbs_rq(__entry->rwbs, rq); + ), + + TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, + MAJOR(__entry->old_dev), MINOR(__entry->old_dev), + (unsigned long long)__entry->old_sector) +); + #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 60b5c5a3d4b..d9d6206e0b1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -855,6 +855,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, sizeof(r), &r); } +/** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + /** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for @@ -922,10 +953,13 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + WARN_ON(ret); } static void blk_unregister_tracepoints(void) { + unregister_trace_block_rq_remap(blk_add_trace_rq_remap); unregister_trace_block_remap(blk_add_trace_remap); unregister_trace_block_split(blk_add_trace_split); unregister_trace_block_unplug_io(blk_add_trace_unplug_io); From b411b3637fa71fce9cf2acf0639009500f5892fe Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 25 Sep 2009 16:07:19 -0700 Subject: [PATCH 004/113] The DRBD driver Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- .../blockdev/drbd/DRBD-8.3-data-packets.svg | 588 +++ .../blockdev/drbd/DRBD-data-packets.svg | 459 ++ Documentation/blockdev/drbd/README.txt | 16 + Documentation/blockdev/drbd/conn-states-8.dot | 18 + Documentation/blockdev/drbd/disk-states-8.dot | 16 + .../drbd/drbd-connection-state-overview.dot | 85 + Documentation/blockdev/drbd/node-states-8.dot | 14 + MAINTAINERS | 13 + drivers/block/Kconfig | 2 + drivers/block/Makefile | 1 + drivers/block/drbd/Kconfig | 82 + drivers/block/drbd/Makefile | 8 + drivers/block/drbd/drbd_actlog.c | 1484 ++++++ drivers/block/drbd/drbd_bitmap.c | 1327 +++++ drivers/block/drbd/drbd_int.h | 2258 +++++++++ drivers/block/drbd/drbd_main.c | 3735 ++++++++++++++ drivers/block/drbd/drbd_nl.c | 2365 +++++++++ drivers/block/drbd/drbd_proc.c | 266 + drivers/block/drbd/drbd_receiver.c | 4456 +++++++++++++++++ drivers/block/drbd/drbd_req.c | 1132 +++++ drivers/block/drbd/drbd_req.h | 327 ++ drivers/block/drbd/drbd_strings.c | 113 + drivers/block/drbd/drbd_tracing.c | 752 +++ drivers/block/drbd/drbd_tracing.h | 87 + drivers/block/drbd/drbd_vli.h | 351 ++ drivers/block/drbd/drbd_worker.c | 1529 ++++++ drivers/block/drbd/drbd_wrappers.h | 91 + include/linux/drbd.h | 349 ++ include/linux/drbd_limits.h | 137 + include/linux/drbd_nl.h | 137 + include/linux/drbd_tag_magic.h | 83 + include/linux/lru_cache.h | 294 ++ lib/Kconfig | 3 + lib/Makefile | 2 + lib/lru_cache.c | 560 +++ 35 files changed, 23140 insertions(+) create mode 100644 Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg create mode 100644 Documentation/blockdev/drbd/DRBD-data-packets.svg create mode 100644 Documentation/blockdev/drbd/README.txt create mode 100644 Documentation/blockdev/drbd/conn-states-8.dot create mode 100644 Documentation/blockdev/drbd/disk-states-8.dot create mode 100644 Documentation/blockdev/drbd/drbd-connection-state-overview.dot create mode 100644 Documentation/blockdev/drbd/node-states-8.dot create mode 100644 drivers/block/drbd/Kconfig create mode 100644 drivers/block/drbd/Makefile create mode 100644 drivers/block/drbd/drbd_actlog.c create mode 100644 drivers/block/drbd/drbd_bitmap.c create mode 100644 drivers/block/drbd/drbd_int.h create mode 100644 drivers/block/drbd/drbd_main.c create mode 100644 drivers/block/drbd/drbd_nl.c create mode 100644 drivers/block/drbd/drbd_proc.c create mode 100644 drivers/block/drbd/drbd_receiver.c create mode 100644 drivers/block/drbd/drbd_req.c create mode 100644 drivers/block/drbd/drbd_req.h create mode 100644 drivers/block/drbd/drbd_strings.c create mode 100644 drivers/block/drbd/drbd_tracing.c create mode 100644 drivers/block/drbd/drbd_tracing.h create mode 100644 drivers/block/drbd/drbd_vli.h create mode 100644 drivers/block/drbd/drbd_worker.c create mode 100644 drivers/block/drbd/drbd_wrappers.h create mode 100644 include/linux/drbd.h create mode 100644 include/linux/drbd_limits.h create mode 100644 include/linux/drbd_nl.h create mode 100644 include/linux/drbd_tag_magic.h create mode 100644 include/linux/lru_cache.h create mode 100644 lib/lru_cache.c diff --git a/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg new file mode 100644 index 00000000000..f87cfa0dc2f --- /dev/null +++ b/Documentation/blockdev/drbd/DRBD-8.3-data-packets.svg @@ -0,0 +1,588 @@ + + + + + + Master slide + + + + + + + + + + RSDataReply + + + + + + + CsumRSRequest + + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + + WriteAck + + + + got_BlockAck() + + + Checksum based Resync, case not in sync + + + DRBD-8.3 data flow + + + w_e_send_csum() + + + + + + + + RSIsInSync + + + + + + + CsumRSRequest + + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_csum_rs_req() + + + got_IsInSync() + + + Checksum based Resync, case in sync + + + + + + + + + + OVReply + + + + + + + OVRequest + + + + receive_OVRequest() + + + drbd_endio_read_sec() + + + w_e_end_ov_req() + + + receive_OVReply() + + + drbd_endio_read_sec() + + + w_e_end_ov_reply() + + + + + + OVResult + + + + got_OVResult() + + + Online verify + + + w_make_ov_request() + + + + + + + + drbd_endio_read_sec() + + + w_make_resync_request() + + + w_e_send_csum() + + + + + drbd_endio_read_sec() + + + + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_complete_io() + + + rs_complete_io() + + diff --git a/Documentation/blockdev/drbd/DRBD-data-packets.svg b/Documentation/blockdev/drbd/DRBD-data-packets.svg new file mode 100644 index 00000000000..48a1e2165fe --- /dev/null +++ b/Documentation/blockdev/drbd/DRBD-data-packets.svg @@ -0,0 +1,459 @@ + + + + + + Master slide + + + + + + + + + RSDataReply + + + + + RSDataRequest + + + w_make_resync_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_rsdata_req() + + + receive_RSDataReply() + + + drbd_endio_write_sec() + + + e_end_resync_block() + + + + + WriteAck + + + got_BlockAck() + + + Resync blocks, 4-32K + + + + + + + WriteAck + + + + + Data + + + drbd_make_request() + + + receive_Data() + + + drbd_endio_write_sec() + + + e_end_block() + + + got_BlockAck() + + + Regular mirrored write, 512-32K + + + w_send_dblock() + + + + + drbd_endio_write_pri() + + + + + + + DataReply + + + + + DataRequest + + + drbd_make_request() + + + receive_DataRequest() + + + drbd_endio_read_sec() + + + w_e_end_data_req() + + + Drawing + + receive_DataReply() + + + + Diskless read, 512-32K + + + w_send_read_req() + + + DRBD 8 data flow + + + + + + al_begin_io() + + + al_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + + rs_begin_io() + + + rs_complete_io() + + diff --git a/Documentation/blockdev/drbd/README.txt b/Documentation/blockdev/drbd/README.txt new file mode 100644 index 00000000000..627b0a1bf35 --- /dev/null +++ b/Documentation/blockdev/drbd/README.txt @@ -0,0 +1,16 @@ +Description + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Please visit http://www.drbd.org to find out more. + +The here included files are intended to help understand the implementation + +DRBD-8.3-data-packets.svg, DRBD-data-packets.svg + relates some functions, and write packets. + +conn-states-8.dot, disk-states-8.dot, node-states-8.dot + The sub graphs of DRBD's state transitions diff --git a/Documentation/blockdev/drbd/conn-states-8.dot b/Documentation/blockdev/drbd/conn-states-8.dot new file mode 100644 index 00000000000..025e8cf5e64 --- /dev/null +++ b/Documentation/blockdev/drbd/conn-states-8.dot @@ -0,0 +1,18 @@ +digraph conn_states { + StandAllone -> WFConnection [ label = "ioctl_set_net()" ] + WFConnection -> Unconnected [ label = "unable to bind()" ] + WFConnection -> WFReportParams [ label = "in connect() after accept" ] + WFReportParams -> StandAllone [ label = "checks in receive_param()" ] + WFReportParams -> Connected [ label = "in receive_param()" ] + WFReportParams -> WFBitMapS [ label = "sync_handshake()" ] + WFReportParams -> WFBitMapT [ label = "sync_handshake()" ] + WFBitMapS -> SyncSource [ label = "receive_bitmap()" ] + WFBitMapT -> SyncTarget [ label = "receive_bitmap()" ] + SyncSource -> Connected + SyncTarget -> Connected + SyncSource -> PausedSyncS + SyncTarget -> PausedSyncT + PausedSyncS -> SyncSource + PausedSyncT -> SyncTarget + Connected -> WFConnection [ label = "* on network error" ] +} diff --git a/Documentation/blockdev/drbd/disk-states-8.dot b/Documentation/blockdev/drbd/disk-states-8.dot new file mode 100644 index 00000000000..d06cfb46fb9 --- /dev/null +++ b/Documentation/blockdev/drbd/disk-states-8.dot @@ -0,0 +1,16 @@ +digraph disk_states { + Diskless -> Inconsistent [ label = "ioctl_set_disk()" ] + Diskless -> Consistent [ label = "ioctl_set_disk()" ] + Diskless -> Outdated [ label = "ioctl_set_disk()" ] + Consistent -> Outdated [ label = "receive_param()" ] + Consistent -> UpToDate [ label = "receive_param()" ] + Consistent -> Inconsistent [ label = "start resync" ] + Outdated -> Inconsistent [ label = "start resync" ] + UpToDate -> Inconsistent [ label = "ioctl_replicate" ] + Inconsistent -> UpToDate [ label = "resync completed" ] + Consistent -> Failed [ label = "io completion error" ] + Outdated -> Failed [ label = "io completion error" ] + UpToDate -> Failed [ label = "io completion error" ] + Inconsistent -> Failed [ label = "io completion error" ] + Failed -> Diskless [ label = "sending notify to peer" ] +} diff --git a/Documentation/blockdev/drbd/drbd-connection-state-overview.dot b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot new file mode 100644 index 00000000000..6d9cf0a7b11 --- /dev/null +++ b/Documentation/blockdev/drbd/drbd-connection-state-overview.dot @@ -0,0 +1,85 @@ +// vim: set sw=2 sts=2 : +digraph { + rankdir=BT + bgcolor=white + + node [shape=plaintext] + node [fontcolor=black] + + StandAlone [ style=filled,fillcolor=gray,label=StandAlone ] + + node [fontcolor=lightgray] + + Unconnected [ label=Unconnected ] + + CommTrouble [ shape=record, + label="{communication loss|{Timeout|BrokenPipe|NetworkFailure}}" ] + + node [fontcolor=gray] + + subgraph cluster_try_connect { + label="try to connect, handshake" + rank=max + WFConnection [ label=WFConnection ] + WFReportParams [ label=WFReportParams ] + } + + TearDown [ label=TearDown ] + + Connected [ label=Connected,style=filled,fillcolor=green,fontcolor=black ] + + node [fontcolor=lightblue] + + StartingSyncS [ label=StartingSyncS ] + StartingSyncT [ label=StartingSyncT ] + + subgraph cluster_bitmap_exchange { + node [fontcolor=red] + fontcolor=red + label="new application (WRITE?) requests blocked\lwhile bitmap is exchanged" + + WFBitMapT [ label=WFBitMapT ] + WFSyncUUID [ label=WFSyncUUID ] + WFBitMapS [ label=WFBitMapS ] + } + + node [fontcolor=blue] + + cluster_resync [ shape=record,label="{resynchronisation process running\l'concurrent' application requests allowed|{{PausedSyncT\nSyncTarget}|{PausedSyncS\nSyncSource}}}" ] + + node [shape=box,fontcolor=black] + + // drbdadm [label="drbdadm connect"] + // handshake [label="drbd_connect()\ndrbd_do_handshake\ndrbd_sync_handshake() etc."] + // comm_error [label="communication trouble"] + + // + // edges + // -------------------------------------- + + StandAlone -> Unconnected [ label="drbdadm connect" ] + Unconnected -> StandAlone [ label="drbdadm disconnect\lor serious communication trouble" ] + Unconnected -> WFConnection [ label="receiver thread is started" ] + WFConnection -> WFReportParams [ headlabel="accept()\land/or \lconnect()\l" ] + + WFReportParams -> StandAlone [ label="during handshake\lpeers do not agree\labout something essential" ] + WFReportParams -> Connected [ label="data identical\lno sync needed",color=green,fontcolor=green ] + + WFReportParams -> WFBitMapS + WFReportParams -> WFBitMapT + WFBitMapT -> WFSyncUUID [minlen=0.1,constraint=false] + + WFBitMapS -> cluster_resync:S + WFSyncUUID -> cluster_resync:T + + edge [color=green] + cluster_resync:any -> Connected [ label="resnyc done",fontcolor=green ] + + edge [color=red] + WFReportParams -> CommTrouble + Connected -> CommTrouble + cluster_resync:any -> CommTrouble + edge [color=black] + CommTrouble -> Unconnected [label="receiver thread is stopped" ] + +} diff --git a/Documentation/blockdev/drbd/node-states-8.dot b/Documentation/blockdev/drbd/node-states-8.dot new file mode 100644 index 00000000000..4a2b00c2354 --- /dev/null +++ b/Documentation/blockdev/drbd/node-states-8.dot @@ -0,0 +1,14 @@ +digraph node_states { + Secondary -> Primary [ label = "ioctl_set_state()" ] + Primary -> Secondary [ label = "ioctl_set_state()" ] +} + +digraph peer_states { + Secondary -> Primary [ label = "recv state packet" ] + Primary -> Secondary [ label = "recv state packet" ] + Primary -> Unknown [ label = "connection lost" ] + Secondary -> Unknown [ label = "connection lost" ] + Unknown -> Primary [ label = "connected" ] + Unknown -> Secondary [ label = "connected" ] +} + diff --git a/MAINTAINERS b/MAINTAINERS index c450f3abb8c..ea56bd7a6cb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1758,6 +1758,19 @@ S: Maintained F: drivers/scsi/dpt* F: drivers/scsi/dpt/ +DRBD DRIVER +P: Philipp Reisner +P: Lars Ellenberg +M: drbd-dev@lists.linbit.com +L: drbd-user@lists.linbit.com +W: http://www.drbd.org +T: git git://git.drbd.org/linux-2.6-drbd.git drbd +T: git git://git.drbd.org/drbd-8.3.git +S: Supported +F: drivers/block/drbd/ +F: lib/lru_cache.c +F: Documentation/blockdev/drbd/ + DRIVER CORE, KOBJECTS, AND SYSFS M: Greg Kroah-Hartman T: quilt kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/ diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 1d886e079c5..77bfce52e9c 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -271,6 +271,8 @@ config BLK_DEV_CRYPTOLOOP instead, which can be configured to be on-disk compatible with the cryptoloop device. +source "drivers/block/drbd/Kconfig" + config BLK_DEV_NBD tristate "Network block device support" depends on NET diff --git a/drivers/block/Makefile b/drivers/block/Makefile index cdaa3f8fddf..aff5ac925c3 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -36,5 +36,6 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o +obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ swim_mod-objs := swim.o swim_asm.o diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig new file mode 100644 index 00000000000..4e6f90f487c --- /dev/null +++ b/drivers/block/drbd/Kconfig @@ -0,0 +1,82 @@ +# +# DRBD device driver configuration +# + +comment "DRBD disabled because PROC_FS, INET or CONNECTOR not selected" + depends on !PROC_FS || !INET || !CONNECTOR + +config BLK_DEV_DRBD + tristate "DRBD Distributed Replicated Block Device support" + depends on PROC_FS && INET && CONNECTOR + select LRU_CACHE + default n + help + + NOTE: In order to authenticate connections you have to select + CRYPTO_HMAC and a hash function as well. + + DRBD is a shared-nothing, synchronously replicated block device. It + is designed to serve as a building block for high availability + clusters and in this context, is a "drop-in" replacement for shared + storage. Simplistically, you could see it as a network RAID 1. + + Each minor device has a role, which can be 'primary' or 'secondary'. + On the node with the primary device the application is supposed to + run and to access the device (/dev/drbdX). Every write is sent to + the local 'lower level block device' and, across the network, to the + node with the device in 'secondary' state. The secondary device + simply writes the data to its lower level block device. + + DRBD can also be used in dual-Primary mode (device writable on both + nodes), which means it can exhibit shared disk semantics in a + shared-nothing cluster. Needless to say, on top of dual-Primary + DRBD utilizing a cluster file system is necessary to maintain for + cache coherency. + + For automatic failover you need a cluster manager (e.g. heartbeat). + See also: http://www.drbd.org/, http://www.linux-ha.org + + If unsure, say N. + +config DRBD_TRACE + tristate "DRBD tracing" + depends on BLK_DEV_DRBD + select TRACEPOINTS + default n + help + + Say Y here if you want to be able to trace various events in DRBD. + + If unsure, say N. + +config DRBD_FAULT_INJECTION + bool "DRBD fault injection" + depends on BLK_DEV_DRBD + help + + Say Y here if you want to simulate IO errors, in order to test DRBD's + behavior. + + The actual simulation of IO errors is done by writing 3 values to + /sys/module/drbd/parameters/ + + enable_faults: bitmask of... + 1 meta data write + 2 read + 4 resync data write + 8 read + 16 data write + 32 data read + 64 read ahead + 128 kmalloc of bitmap + 256 allocation of EE (epoch_entries) + + fault_devs: bitmask of minor numbers + fault_rate: frequency in percent + + Example: Simulate data write errors on /dev/drbd0 with a probability of 5%. + echo 16 > /sys/module/drbd/parameters/enable_faults + echo 1 > /sys/module/drbd/parameters/fault_devs + echo 5 > /sys/module/drbd/parameters/fault_rate + + If unsure, say N. diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile new file mode 100644 index 00000000000..7d86ef8a8b4 --- /dev/null +++ b/drivers/block/drbd/Makefile @@ -0,0 +1,8 @@ +drbd-y := drbd_bitmap.o drbd_proc.o +drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o +drbd-y += drbd_main.o drbd_strings.o drbd_nl.o + +drbd_trace-y := drbd_tracing.o + +obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o +obj-$(CONFIG_DRBD_TRACE) += drbd_trace.o diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c new file mode 100644 index 00000000000..74b4835d310 --- /dev/null +++ b/drivers/block/drbd/drbd_actlog.c @@ -0,0 +1,1484 @@ +/* + drbd_actlog.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include "drbd_int.h" +#include "drbd_tracing.h" +#include "drbd_wrappers.h" + +/* We maintain a trivial check sum in our on disk activity log. + * With that we can ensure correct operation even when the storage + * device might do a partial (last) sector write while loosing power. + */ +struct __packed al_transaction { + u32 magic; + u32 tr_number; + struct __packed { + u32 pos; + u32 extent; } updates[1 + AL_EXTENTS_PT]; + u32 xor_sum; +}; + +struct update_odbm_work { + struct drbd_work w; + unsigned int enr; +}; + +struct update_al_work { + struct drbd_work w; + struct lc_element *al_ext; + struct completion event; + unsigned int enr; + /* if old_enr != LC_FREE, write corresponding bitmap sector, too */ + unsigned int old_enr; +}; + +struct drbd_atodb_wait { + atomic_t count; + struct completion io_done; + struct drbd_conf *mdev; + int error; +}; + + +int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); + +/* The actual tracepoint needs to have constant number of known arguments... + */ +void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + trace__drbd_resync(mdev, level, fmt, ap); + va_end(ap); +} + +static int _drbd_md_sync_page_io(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev, + struct page *page, sector_t sector, + int rw, int size) +{ + struct bio *bio; + struct drbd_md_io md_io; + int ok; + + md_io.mdev = mdev; + init_completion(&md_io.event); + md_io.error = 0; + + if ((rw & WRITE) && !test_bit(MD_NO_BARRIER, &mdev->flags)) + rw |= (1 << BIO_RW_BARRIER); + rw |= ((1<bi_bdev = bdev->md_bdev; + bio->bi_sector = sector; + ok = (bio_add_page(bio, page, size, 0) == size); + if (!ok) + goto out; + bio->bi_private = &md_io; + bio->bi_end_io = drbd_md_io_complete; + bio->bi_rw = rw; + + trace_drbd_bio(mdev, "Md", bio, 0, NULL); + + if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + bio_endio(bio, -EIO); + else + submit_bio(rw, bio); + wait_for_completion(&md_io.event); + ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + + /* check for unsupported barrier op. + * would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 */ + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && !ok)) { + /* Try again with no barrier */ + dev_warn(DEV, "Barriers not supported on meta data device - disabling\n"); + set_bit(MD_NO_BARRIER, &mdev->flags); + rw &= ~(1 << BIO_RW_BARRIER); + bio_put(bio); + goto retry; + } + out: + bio_put(bio); + return ok; +} + +int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + sector_t sector, int rw) +{ + int logical_block_size, mask, ok; + int offset = 0; + struct page *iop = mdev->md_io_page; + + D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + + BUG_ON(!bdev->md_bdev); + + logical_block_size = bdev_logical_block_size(bdev->md_bdev); + if (logical_block_size == 0) + logical_block_size = MD_SECTOR_SIZE; + + /* in case logical_block_size != 512 [ s390 only? ] */ + if (logical_block_size != MD_SECTOR_SIZE) { + mask = (logical_block_size / MD_SECTOR_SIZE) - 1; + D_ASSERT(mask == 1 || mask == 3 || mask == 7); + D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE); + offset = sector & mask; + sector = sector & ~mask; + iop = mdev->md_io_tmpp; + + if (rw & WRITE) { + /* these are GFP_KERNEL pages, pre-allocated + * on device initialization */ + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, + READ, logical_block_size); + + if (unlikely(!ok)) { + dev_err(DEV, "drbd_md_sync_page_io(,%llus," + "READ [logical_block_size!=512]) failed!\n", + (unsigned long long)sector); + return 0; + } + + memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE); + } + } + + if (sector < drbd_md_first_sector(bdev) || + sector > drbd_md_last_sector(bdev)) + dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n", + current->comm, current->pid, __func__, + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + + ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size); + if (unlikely(!ok)) { + dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n", + (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + return 0; + } + + if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) { + void *p = page_address(mdev->md_io_page); + void *hp = page_address(mdev->md_io_tmpp); + + memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE); + } + + return ok; +} + +static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + struct lc_element *tmp; + unsigned long al_flags = 0; + + spin_lock_irq(&mdev->al_lock); + tmp = lc_find(mdev->resync, enr/AL_EXT_PER_BM_SECT); + if (unlikely(tmp != NULL)) { + struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce); + if (test_bit(BME_NO_WRITES, &bm_ext->flags)) { + spin_unlock_irq(&mdev->al_lock); + return NULL; + } + } + al_ext = lc_get(mdev->act_log, enr); + al_flags = mdev->act_log->flags; + spin_unlock_irq(&mdev->al_lock); + + /* + if (!al_ext) { + if (al_flags & LC_STARVING) + dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n"); + if (al_flags & LC_DIRTY) + dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n"); + } + */ + + return al_ext; +} + +void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); + struct lc_element *al_ext; + struct update_al_work al_work; + + D_ASSERT(atomic_read(&mdev->local_cnt) > 0); + + trace_drbd_actlog(mdev, sector, "al_begin_io"); + + wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); + + if (al_ext->lc_number != enr) { + /* drbd_al_write_transaction(mdev,al_ext,enr); + * recurses into generic_make_request(), which + * disallows recursion, bios being serialized on the + * current->bio_tail list now. + * we have to delegate updates to the activity log + * to the worker thread. */ + init_completion(&al_work.event); + al_work.al_ext = al_ext; + al_work.enr = enr; + al_work.old_enr = al_ext->lc_number; + al_work.w.cb = w_al_write_transaction; + drbd_queue_work_front(&mdev->data.work, &al_work.w); + wait_for_completion(&al_work.event); + + mdev->al_writ_cnt++; + + spin_lock_irq(&mdev->al_lock); + lc_changed(mdev->act_log, al_ext); + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); + } +} + +void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); + struct lc_element *extent; + unsigned long flags; + + trace_drbd_actlog(mdev, sector, "al_complete_io"); + + spin_lock_irqsave(&mdev->al_lock, flags); + + extent = lc_find(mdev->act_log, enr); + + if (!extent) { + spin_unlock_irqrestore(&mdev->al_lock, flags); + dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr); + return; + } + + if (lc_put(mdev->act_log, extent) == 0) + wake_up(&mdev->al_wait); + + spin_unlock_irqrestore(&mdev->al_lock, flags); +} + +int +w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct update_al_work *aw = container_of(w, struct update_al_work, w); + struct lc_element *updated = aw->al_ext; + const unsigned int new_enr = aw->enr; + const unsigned int evicted = aw->old_enr; + struct al_transaction *buffer; + sector_t sector; + int i, n, mx; + unsigned int extent_nr; + u32 xor_sum = 0; + + if (!get_ldev(mdev)) { + dev_err(DEV, "get_ldev() failed in w_al_write_transaction\n"); + complete(&((struct update_al_work *)w)->event); + return 1; + } + /* do we have to do a bitmap write, first? + * TODO reduce maximum latency: + * submit both bios, then wait for both, + * instead of doing two synchronous sector writes. */ + if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE) + drbd_bm_write_sect(mdev, evicted/AL_EXT_PER_BM_SECT); + + mutex_lock(&mdev->md_io_mutex); /* protects md_io_page, al_tr_cycle, ... */ + buffer = (struct al_transaction *)page_address(mdev->md_io_page); + + buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); + buffer->tr_number = cpu_to_be32(mdev->al_tr_number); + + n = lc_index_of(mdev->act_log, updated); + + buffer->updates[0].pos = cpu_to_be32(n); + buffer->updates[0].extent = cpu_to_be32(new_enr); + + xor_sum ^= new_enr; + + mx = min_t(int, AL_EXTENTS_PT, + mdev->act_log->nr_elements - mdev->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = mdev->al_tr_cycle + i; + extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number; + buffer->updates[i+1].pos = cpu_to_be32(idx); + buffer->updates[i+1].extent = cpu_to_be32(extent_nr); + xor_sum ^= extent_nr; + } + for (; i < AL_EXTENTS_PT; i++) { + buffer->updates[i+1].pos = __constant_cpu_to_be32(-1); + buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE); + xor_sum ^= LC_FREE; + } + mdev->al_tr_cycle += AL_EXTENTS_PT; + if (mdev->al_tr_cycle >= mdev->act_log->nr_elements) + mdev->al_tr_cycle = 0; + + buffer->xor_sum = cpu_to_be32(xor_sum); + + sector = mdev->ldev->md.md_offset + + mdev->ldev->md.al_offset + mdev->al_tr_pos; + + if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) + drbd_chk_io_error(mdev, 1, TRUE); + + if (++mdev->al_tr_pos > + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) + mdev->al_tr_pos = 0; + + D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); + mdev->al_tr_number++; + + mutex_unlock(&mdev->md_io_mutex); + + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + + return 1; +} + +/** + * drbd_al_read_tr() - Read a single transaction from the on disk activity log + * @mdev: DRBD device. + * @bdev: Block device to read form. + * @b: pointer to an al_transaction. + * @index: On disk slot of the transaction to read. + * + * Returns -1 on IO error, 0 on checksum error and 1 upon success. + */ +static int drbd_al_read_tr(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev, + struct al_transaction *b, + int index) +{ + sector_t sector; + int rv, i; + u32 xor_sum = 0; + + sector = bdev->md.md_offset + bdev->md.al_offset + index; + + /* Dont process error normally, + * as this is done before disk is attached! */ + if (!drbd_md_sync_page_io(mdev, bdev, sector, READ)) + return -1; + + rv = (be32_to_cpu(b->magic) == DRBD_MAGIC); + + for (i = 0; i < AL_EXTENTS_PT + 1; i++) + xor_sum ^= be32_to_cpu(b->updates[i].extent); + rv &= (xor_sum == be32_to_cpu(b->xor_sum)); + + return rv; +} + +/** + * drbd_al_read_log() - Restores the activity log from its on disk representation. + * @mdev: DRBD device. + * @bdev: Block device to read form. + * + * Returns 1 on success, returns 0 when reading the log failed due to IO errors. + */ +int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + struct al_transaction *buffer; + int i; + int rv; + int mx; + int active_extents = 0; + int transactions = 0; + int found_valid = 0; + int from = 0; + int to = 0; + u32 from_tnr = 0; + u32 to_tnr = 0; + u32 cnr; + + mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT); + + /* lock out all other meta data io for now, + * and make sure the page is mapped. + */ + mutex_lock(&mdev->md_io_mutex); + buffer = page_address(mdev->md_io_page); + + /* Find the valid transaction in the log */ + for (i = 0; i <= mx; i++) { + rv = drbd_al_read_tr(mdev, bdev, buffer, i); + if (rv == 0) + continue; + if (rv == -1) { + mutex_unlock(&mdev->md_io_mutex); + return 0; + } + cnr = be32_to_cpu(buffer->tr_number); + + if (++found_valid == 1) { + from = i; + to = i; + from_tnr = cnr; + to_tnr = cnr; + continue; + } + if ((int)cnr - (int)from_tnr < 0) { + D_ASSERT(from_tnr - cnr + i - from == mx+1); + from = i; + from_tnr = cnr; + } + if ((int)cnr - (int)to_tnr > 0) { + D_ASSERT(cnr - to_tnr == i - to); + to = i; + to_tnr = cnr; + } + } + + if (!found_valid) { + dev_warn(DEV, "No usable activity log found.\n"); + mutex_unlock(&mdev->md_io_mutex); + return 1; + } + + /* Read the valid transactions. + * dev_info(DEV, "Reading from %d to %d.\n",from,to); */ + i = from; + while (1) { + int j, pos; + unsigned int extent_nr; + unsigned int trn; + + rv = drbd_al_read_tr(mdev, bdev, buffer, i); + ERR_IF(rv == 0) goto cancel; + if (rv == -1) { + mutex_unlock(&mdev->md_io_mutex); + return 0; + } + + trn = be32_to_cpu(buffer->tr_number); + + spin_lock_irq(&mdev->al_lock); + + /* This loop runs backwards because in the cyclic + elements there might be an old version of the + updated element (in slot 0). So the element in slot 0 + can overwrite old versions. */ + for (j = AL_EXTENTS_PT; j >= 0; j--) { + pos = be32_to_cpu(buffer->updates[j].pos); + extent_nr = be32_to_cpu(buffer->updates[j].extent); + + if (extent_nr == LC_FREE) + continue; + + lc_set(mdev->act_log, extent_nr, pos); + active_extents++; + } + spin_unlock_irq(&mdev->al_lock); + + transactions++; + +cancel: + if (i == to) + break; + i++; + if (i > mx) + i = 0; + } + + mdev->al_tr_number = to_tnr+1; + mdev->al_tr_pos = to; + if (++mdev->al_tr_pos > + div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) + mdev->al_tr_pos = 0; + + /* ok, we are done with it */ + mutex_unlock(&mdev->md_io_mutex); + + dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", + transactions, active_extents); + + return 1; +} + +static void atodb_endio(struct bio *bio, int error) +{ + struct drbd_atodb_wait *wc = bio->bi_private; + struct drbd_conf *mdev = wc->mdev; + struct page *page; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + if (!error && !uptodate) + error = -EIO; + + drbd_chk_io_error(mdev, error, TRUE); + if (error && wc->error == 0) + wc->error = error; + + if (atomic_dec_and_test(&wc->count)) + complete(&wc->io_done); + + page = bio->bi_io_vec[0].bv_page; + put_page(page); + bio_put(bio); + mdev->bm_writ_cnt++; + put_ldev(mdev); +} + +#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) +/* activity log to on disk bitmap -- prepare bio unless that sector + * is already covered by previously prepared bios */ +static int atodb_prepare_unless_covered(struct drbd_conf *mdev, + struct bio **bios, + unsigned int enr, + struct drbd_atodb_wait *wc) __must_hold(local) +{ + struct bio *bio; + struct page *page; + sector_t on_disk_sector = enr + mdev->ldev->md.md_offset + + mdev->ldev->md.bm_offset; + unsigned int page_offset = PAGE_SIZE; + int offset; + int i = 0; + int err = -ENOMEM; + + /* Check if that enr is already covered by an already created bio. + * Caution, bios[] is not NULL terminated, + * but only initialized to all NULL. + * For completely scattered activity log, + * the last invocation iterates over all bios, + * and finds the last NULL entry. + */ + while ((bio = bios[i])) { + if (bio->bi_sector == on_disk_sector) + return 0; + i++; + } + /* bios[i] == NULL, the next not yet used slot */ + + /* GFP_KERNEL, we are not in the write-out path */ + bio = bio_alloc(GFP_KERNEL, 1); + if (bio == NULL) + return -ENOMEM; + + if (i > 0) { + const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; + page_offset = prev_bv->bv_offset + prev_bv->bv_len; + page = prev_bv->bv_page; + } + if (page_offset == PAGE_SIZE) { + page = alloc_page(__GFP_HIGHMEM); + if (page == NULL) + goto out_bio_put; + page_offset = 0; + } else { + get_page(page); + } + + offset = S2W(enr); + drbd_bm_get_lel(mdev, offset, + min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), + kmap(page) + page_offset); + kunmap(page); + + bio->bi_private = wc; + bio->bi_end_io = atodb_endio; + bio->bi_bdev = mdev->ldev->md_bdev; + bio->bi_sector = on_disk_sector; + + if (bio_add_page(bio, page, MD_SECTOR_SIZE, page_offset) != MD_SECTOR_SIZE) + goto out_put_page; + + atomic_inc(&wc->count); + /* we already know that we may do this... + * get_ldev_if_state(mdev,D_ATTACHING); + * just get the extra reference, so that the local_cnt reflects + * the number of pending IO requests DRBD at its backing device. + */ + atomic_inc(&mdev->local_cnt); + + bios[i] = bio; + + return 0; + +out_put_page: + err = -EINVAL; + put_page(page); +out_bio_put: + bio_put(bio); + return err; +} + +/** + * drbd_al_to_on_disk_bm() - * Writes bitmap parts covered by active AL extents + * @mdev: DRBD device. + * + * Called when we detach (unconfigure) local storage, + * or when we go from R_PRIMARY to R_SECONDARY role. + */ +void drbd_al_to_on_disk_bm(struct drbd_conf *mdev) +{ + int i, nr_elements; + unsigned int enr; + struct bio **bios; + struct drbd_atodb_wait wc; + + ERR_IF (!get_ldev_if_state(mdev, D_ATTACHING)) + return; /* sorry, I don't have any act_log etc... */ + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + nr_elements = mdev->act_log->nr_elements; + + /* GFP_KERNEL, we are not in anyone's write-out path */ + bios = kzalloc(sizeof(struct bio *) * nr_elements, GFP_KERNEL); + if (!bios) + goto submit_one_by_one; + + atomic_set(&wc.count, 0); + init_completion(&wc.io_done); + wc.mdev = mdev; + wc.error = 0; + + for (i = 0; i < nr_elements; i++) { + enr = lc_element_by_index(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + /* next statement also does atomic_inc wc.count and local_cnt */ + if (atodb_prepare_unless_covered(mdev, bios, + enr/AL_EXT_PER_BM_SECT, + &wc)) + goto free_bios_submit_one_by_one; + } + + /* unnecessary optimization? */ + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + /* all prepared, submit them */ + for (i = 0; i < nr_elements; i++) { + if (bios[i] == NULL) + break; + if (FAULT_ACTIVE(mdev, DRBD_FAULT_MD_WR)) { + bios[i]->bi_rw = WRITE; + bio_endio(bios[i], -EIO); + } else { + submit_bio(WRITE, bios[i]); + } + } + + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); + + /* always (try to) flush bitmap to stable storage */ + drbd_md_flush(mdev); + + /* In case we did not submit a single IO do not wait for + * them to complete. ( Because we would wait forever here. ) + * + * In case we had IOs and they are already complete, there + * is not point in waiting anyways. + * Therefore this if () ... */ + if (atomic_read(&wc.count)) + wait_for_completion(&wc.io_done); + + put_ldev(mdev); + + kfree(bios); + return; + + free_bios_submit_one_by_one: + /* free everything by calling the endio callback directly. */ + for (i = 0; i < nr_elements && bios[i]; i++) + bio_endio(bios[i], 0); + + kfree(bios); + + submit_one_by_one: + dev_warn(DEV, "Using the slow drbd_al_to_on_disk_bm()\n"); + + for (i = 0; i < mdev->act_log->nr_elements; i++) { + enr = lc_element_by_index(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + /* Really slow: if we have al-extents 16..19 active, + * sector 4 will be written four times! Synchronous! */ + drbd_bm_write_sect(mdev, enr/AL_EXT_PER_BM_SECT); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + put_ldev(mdev); +} + +/** + * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents + * @mdev: DRBD device. + */ +void drbd_al_apply_to_bm(struct drbd_conf *mdev) +{ + unsigned int enr; + unsigned long add = 0; + char ppb[10]; + int i; + + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + for (i = 0; i < mdev->act_log->nr_elements; i++) { + enr = lc_element_by_index(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + add += drbd_bm_ALe_set_all(mdev, enr); + } + + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n", + ppsize(ppb, Bit2KB(add))); +} + +static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext) +{ + int rv; + + spin_lock_irq(&mdev->al_lock); + rv = (al_ext->refcnt == 0); + if (likely(rv)) + lc_del(mdev->act_log, al_ext); + spin_unlock_irq(&mdev->al_lock); + + return rv; +} + +/** + * drbd_al_shrink() - Removes all active extents form the activity log + * @mdev: DRBD device. + * + * Removes all active extents form the activity log, waiting until + * the reference count of each entry dropped to 0 first, of course. + * + * You need to lock mdev->act_log with lc_try_lock() / lc_unlock() + */ +void drbd_al_shrink(struct drbd_conf *mdev) +{ + struct lc_element *al_ext; + int i; + + D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags)); + + for (i = 0; i < mdev->act_log->nr_elements; i++) { + al_ext = lc_element_by_index(mdev->act_log, i); + if (al_ext->lc_number == LC_FREE) + continue; + wait_event(mdev->al_wait, _try_lc_del(mdev, al_ext)); + } + + wake_up(&mdev->al_wait); +} + +static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w); + + if (!get_ldev(mdev)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n"); + kfree(udw); + return 1; + } + + drbd_bm_write_sect(mdev, udw->enr); + put_ldev(mdev); + + kfree(udw); + + if (drbd_bm_total_weight(mdev) <= mdev->rs_failed) { + switch (mdev->state.conn) { + case C_SYNC_SOURCE: case C_SYNC_TARGET: + case C_PAUSED_SYNC_S: case C_PAUSED_SYNC_T: + drbd_resync_finished(mdev); + default: + /* nothing to do */ + break; + } + } + drbd_bcast_sync_progress(mdev); + + return 1; +} + + +/* ATTENTION. The AL's extents are 4MB each, while the extents in the + * resync LRU-cache are 16MB each. + * The caller of this function has to hold an get_ldev() reference. + * + * TODO will be obsoleted once we have a caching lru of the on disk bitmap + */ +static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, + int count, int success) +{ + struct lc_element *e; + struct update_odbm_work *udw; + + unsigned int enr; + + D_ASSERT(atomic_read(&mdev->local_cnt)); + + /* I simply assume that a sector/size pair never crosses + * a 16 MB extent border. (Currently this is true...) */ + enr = BM_SECT_TO_EXT(sector); + + e = lc_get(mdev->resync, enr); + if (e) { + struct bm_extent *ext = lc_entry(e, struct bm_extent, lce); + if (ext->lce.lc_number == enr) { + if (success) + ext->rs_left -= count; + else + ext->rs_failed += count; + if (ext->rs_left < ext->rs_failed) { + dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d\n", + (unsigned long long)sector, + ext->lce.lc_number, ext->rs_left, + ext->rs_failed, count); + dump_stack(); + + lc_put(mdev->resync, &ext->lce); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return; + } + } else { + /* Normally this element should be in the cache, + * since drbd_rs_begin_io() pulled it already in. + * + * But maybe an application write finished, and we set + * something outside the resync lru_cache in sync. + */ + int rs_left = drbd_bm_e_weight(mdev, enr); + if (ext->flags != 0) { + dev_warn(DEV, "changing resync lce: %d[%u;%02lx]" + " -> %d[%u;00]\n", + ext->lce.lc_number, ext->rs_left, + ext->flags, enr, rs_left); + ext->flags = 0; + } + if (ext->rs_failed) { + dev_warn(DEV, "Kicking resync_lru element enr=%u " + "out with rs_failed=%d\n", + ext->lce.lc_number, ext->rs_failed); + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); + } + ext->rs_left = rs_left; + ext->rs_failed = success ? 0 : count; + lc_changed(mdev->resync, &ext->lce); + } + lc_put(mdev->resync, &ext->lce); + /* no race, we are within the al_lock! */ + + if (ext->rs_left == ext->rs_failed) { + ext->rs_failed = 0; + + udw = kmalloc(sizeof(*udw), GFP_ATOMIC); + if (udw) { + udw->enr = ext->lce.lc_number; + udw->w.cb = w_update_odbm; + drbd_queue_work_front(&mdev->data.work, &udw->w); + } else { + dev_warn(DEV, "Could not kmalloc an udw\n"); + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); + } + } + } else { + dev_err(DEV, "lc_get() failed! locked=%d/%d flags=%lu\n", + mdev->resync_locked, + mdev->resync->nr_elements, + mdev->resync->flags); + } +} + +/* clear the bit corresponding to the piece of storage in question: + * size byte of data starting from sector. Only clear a bits of the affected + * one ore more _aligned_ BM_BLOCK_SIZE blocks. + * + * called by worker on C_SYNC_TARGET and receiver on SyncSource. + * + */ +void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, + const char *file, const unsigned int line) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count = 0; + sector_t esector, nr_sectors; + int wake_up = 0; + unsigned long flags; + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n", + (unsigned long long)sector, size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we clear it (in sync). + * round up start sector, round down end sector. we make sure we only + * clear full, aligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + return; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + + trace_drbd_resync(mdev, TRACE_LVL_METRICS, + "drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", + (unsigned long long)sector, size, sbnr, ebnr); + + if (sbnr > ebnr) + return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irqsave(&mdev->al_lock, flags); + count = drbd_bm_clear_bits(mdev, sbnr, ebnr); + if (count) { + /* we need the lock for drbd_try_clear_on_disk_bm */ + if (jiffies - mdev->rs_mark_time > HZ*10) { + /* should be rolling marks, + * but we estimate only anyways. */ + if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) && + mdev->state.conn != C_PAUSED_SYNC_T && + mdev->state.conn != C_PAUSED_SYNC_S) { + mdev->rs_mark_time = jiffies; + mdev->rs_mark_left = drbd_bm_total_weight(mdev); + } + } + if (get_ldev(mdev)) { + drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE); + put_ldev(mdev); + } + /* just wake_up unconditional now, various lc_chaged(), + * lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up = 1; + } + spin_unlock_irqrestore(&mdev->al_lock, flags); + if (wake_up) + wake_up(&mdev->al_wait); +} + +/* + * this is intended to set one request worth of data out of sync. + * affects at least 1 bit, + * and at most 1+DRBD_MAX_SEGMENT_SIZE/BM_BLOCK_SIZE bits. + * + * called by tl_clear and drbd_send_dblock (==drbd_make_request). + * so this can be _any_ process. + */ +void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, + const char *file, const unsigned int line) +{ + unsigned long sbnr, ebnr, lbnr, flags; + sector_t esector, nr_sectors; + unsigned int enr, count; + struct lc_element *e; + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "sector: %llus, size: %d\n", + (unsigned long long)sector, size); + return; + } + + if (!get_ldev(mdev)) + return; /* no disk, no metadata, no bitmap to set bits in */ + + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + ERR_IF(sector >= nr_sectors) + goto out; + ERR_IF(esector >= nr_sectors) + esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* we set it out of sync, + * we do not need to round anything here */ + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + trace_drbd_resync(mdev, TRACE_LVL_METRICS, + "drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", + (unsigned long long)sector, size, sbnr, ebnr); + + /* ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. */ + spin_lock_irqsave(&mdev->al_lock, flags); + count = drbd_bm_set_bits(mdev, sbnr, ebnr); + + enr = BM_SECT_TO_EXT(sector); + e = lc_find(mdev->resync, enr); + if (e) + lc_entry(e, struct bm_extent, lce)->rs_left += count; + spin_unlock_irqrestore(&mdev->al_lock, flags); + +out: + put_ldev(mdev); +} + +static +struct bm_extent *_bme_get(struct drbd_conf *mdev, unsigned int enr) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int wakeup = 0; + unsigned long rs_flags; + + spin_lock_irq(&mdev->al_lock); + if (mdev->resync_locked > mdev->resync->nr_elements/2) { + spin_unlock_irq(&mdev->al_lock); + return NULL; + } + e = lc_get(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); + bm_ext->rs_failed = 0; + lc_changed(mdev->resync, &bm_ext->lce); + wakeup = 1; + } + if (bm_ext->lce.refcnt == 1) + mdev->resync_locked++; + set_bit(BME_NO_WRITES, &bm_ext->flags); + } + rs_flags = mdev->resync->flags; + spin_unlock_irq(&mdev->al_lock); + if (wakeup) + wake_up(&mdev->al_wait); + + if (!bm_ext) { + if (rs_flags & LC_STARVING) + dev_warn(DEV, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_DIRTY); + } + + return bm_ext; +} + +static int _is_in_al(struct drbd_conf *mdev, unsigned int enr) +{ + struct lc_element *al_ext; + int rv = 0; + + spin_lock_irq(&mdev->al_lock); + if (unlikely(enr == mdev->act_log->new_number)) + rv = 1; + else { + al_ext = lc_find(mdev->act_log, enr); + if (al_ext) { + if (al_ext->refcnt) + rv = 1; + } + } + spin_unlock_irq(&mdev->al_lock); + + /* + if (unlikely(rv)) { + dev_info(DEV, "Delaying sync read until app's write is done\n"); + } + */ + return rv; +} + +/** + * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED + * @mdev: DRBD device. + * @sector: The sector number. + * + * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted. + */ +int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct bm_extent *bm_ext; + int i, sig; + + trace_drbd_resync(mdev, TRACE_LVL_ALL, + "drbd_rs_begin_io: sector=%llus (rs_end=%d)\n", + (unsigned long long)sector, enr); + + sig = wait_event_interruptible(mdev->al_wait, + (bm_ext = _bme_get(mdev, enr))); + if (sig) + return 0; + + if (test_bit(BME_LOCKED, &bm_ext->flags)) + return 1; + + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + sig = wait_event_interruptible(mdev->al_wait, + !_is_in_al(mdev, enr * AL_EXT_PER_BM_SECT + i)); + if (sig) { + spin_lock_irq(&mdev->al_lock); + if (lc_put(mdev->resync, &bm_ext->lce) == 0) { + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } + spin_unlock_irq(&mdev->al_lock); + return 0; + } + } + + set_bit(BME_LOCKED, &bm_ext->flags); + + return 1; +} + +/** + * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep + * @mdev: DRBD device. + * @sector: The sector number. + * + * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then + * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN + * if there is still application IO going on in this area. + */ +int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT; + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + + trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n", + (unsigned long long)sector); + + spin_lock_irq(&mdev->al_lock); + if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { + /* in case you have very heavy scattered io, it may + * stall the syncer undefined if we give up the ref count + * when we try again and requeue. + * + * if we don't give up the refcount, but the next time + * we are scheduled this extent has been "synced" by new + * application writes, we'd miss the lc_put on the + * extent we keep the refcount on. + * so we remembered which extent we had to try again, and + * if the next requested one is something else, we do + * the lc_put here... + * we also have to wake_up + */ + + trace_drbd_resync(mdev, TRACE_LVL_ALL, + "dropping %u, apparently got 'synced' by application io\n", + mdev->resync_wenr); + + e = lc_find(mdev->resync, mdev->resync_wenr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_wenr = LC_FREE; + if (lc_put(mdev->resync, &bm_ext->lce) == 0) + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } else { + dev_alert(DEV, "LOGIC BUG\n"); + } + } + /* TRY. */ + e = lc_try_get(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext) { + if (test_bit(BME_LOCKED, &bm_ext->flags)) + goto proceed; + if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) { + mdev->resync_locked++; + } else { + /* we did set the BME_NO_WRITES, + * but then could not set BME_LOCKED, + * so we tried again. + * drop the extra reference. */ + trace_drbd_resync(mdev, TRACE_LVL_ALL, + "dropping extra reference on %u\n", enr); + + bm_ext->lce.refcnt--; + D_ASSERT(bm_ext->lce.refcnt > 0); + } + goto check_al; + } else { + /* do we rather want to try later? */ + if (mdev->resync_locked > mdev->resync->nr_elements-3) { + trace_drbd_resync(mdev, TRACE_LVL_ALL, + "resync_locked = %u!\n", mdev->resync_locked); + + goto try_again; + } + /* Do or do not. There is no try. -- Yoda */ + e = lc_get(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + const unsigned long rs_flags = mdev->resync->flags; + if (rs_flags & LC_STARVING) + dev_warn(DEV, "Have to wait for element" + " (resync LRU too small?)\n"); + BUG_ON(rs_flags & LC_DIRTY); + goto try_again; + } + if (bm_ext->lce.lc_number != enr) { + bm_ext->rs_left = drbd_bm_e_weight(mdev, enr); + bm_ext->rs_failed = 0; + lc_changed(mdev->resync, &bm_ext->lce); + wake_up(&mdev->al_wait); + D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0); + } + set_bit(BME_NO_WRITES, &bm_ext->flags); + D_ASSERT(bm_ext->lce.refcnt == 1); + mdev->resync_locked++; + goto check_al; + } +check_al: + trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr); + + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { + if (unlikely(al_enr+i == mdev->act_log->new_number)) + goto try_again; + if (lc_is_used(mdev->act_log, al_enr+i)) + goto try_again; + } + set_bit(BME_LOCKED, &bm_ext->flags); +proceed: + mdev->resync_wenr = LC_FREE; + spin_unlock_irq(&mdev->al_lock); + return 0; + +try_again: + trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr); + if (bm_ext) + mdev->resync_wenr = enr; + spin_unlock_irq(&mdev->al_lock); + return -EAGAIN; +} + +void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) +{ + unsigned int enr = BM_SECT_TO_EXT(sector); + struct lc_element *e; + struct bm_extent *bm_ext; + unsigned long flags; + + trace_drbd_resync(mdev, TRACE_LVL_ALL, + "drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n", + (long long)sector, enr); + + spin_lock_irqsave(&mdev->al_lock, flags); + e = lc_find(mdev->resync, enr); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (!bm_ext) { + spin_unlock_irqrestore(&mdev->al_lock, flags); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "drbd_rs_complete_io() called, but extent not found\n"); + return; + } + + if (bm_ext->lce.refcnt == 0) { + spin_unlock_irqrestore(&mdev->al_lock, flags); + dev_err(DEV, "drbd_rs_complete_io(,%llu [=%u]) called, " + "but refcnt is 0!?\n", + (unsigned long long)sector, enr); + return; + } + + if (lc_put(mdev->resync, &bm_ext->lce) == 0) { + clear_bit(BME_LOCKED, &bm_ext->flags); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_locked--; + wake_up(&mdev->al_wait); + } + + spin_unlock_irqrestore(&mdev->al_lock, flags); +} + +/** + * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED) + * @mdev: DRBD device. + */ +void drbd_rs_cancel_all(struct drbd_conf *mdev) +{ + trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n"); + + spin_lock_irq(&mdev->al_lock); + + if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ + lc_reset(mdev->resync); + put_ldev(mdev); + } + mdev->resync_locked = 0; + mdev->resync_wenr = LC_FREE; + spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); +} + +/** + * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU + * @mdev: DRBD device. + * + * Returns 0 upon success, -EAGAIN if at least one reference count was + * not zero. + */ +int drbd_rs_del_all(struct drbd_conf *mdev) +{ + struct lc_element *e; + struct bm_extent *bm_ext; + int i; + + trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n"); + + spin_lock_irq(&mdev->al_lock); + + if (get_ldev_if_state(mdev, D_FAILED)) { + /* ok, ->resync is there. */ + for (i = 0; i < mdev->resync->nr_elements; i++) { + e = lc_element_by_index(mdev->resync, i); + bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; + if (bm_ext->lce.lc_number == LC_FREE) + continue; + if (bm_ext->lce.lc_number == mdev->resync_wenr) { + dev_info(DEV, "dropping %u in drbd_rs_del_all, apparently" + " got 'synced' by application io\n", + mdev->resync_wenr); + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(test_bit(BME_NO_WRITES, &bm_ext->flags)); + clear_bit(BME_NO_WRITES, &bm_ext->flags); + mdev->resync_wenr = LC_FREE; + lc_put(mdev->resync, &bm_ext->lce); + } + if (bm_ext->lce.refcnt != 0) { + dev_info(DEV, "Retrying drbd_rs_del_all() later. " + "refcnt=%d\n", bm_ext->lce.refcnt); + put_ldev(mdev); + spin_unlock_irq(&mdev->al_lock); + return -EAGAIN; + } + D_ASSERT(!test_bit(BME_LOCKED, &bm_ext->flags)); + D_ASSERT(!test_bit(BME_NO_WRITES, &bm_ext->flags)); + lc_del(mdev->resync, &bm_ext->lce); + } + D_ASSERT(mdev->resync->used == 0); + put_ldev(mdev); + } + spin_unlock_irq(&mdev->al_lock); + + return 0; +} + +/** + * drbd_rs_failed_io() - Record information on a failure to resync the specified blocks + * @mdev: DRBD device. + * @sector: The sector number. + * @size: Size of failed IO operation, in byte. + */ +void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) +{ + /* Is called from worker and receiver context _only_ */ + unsigned long sbnr, ebnr, lbnr; + unsigned long count; + sector_t esector, nr_sectors; + int wake_up = 0; + + trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, + "drbd_rs_failed_io: sector=%llus, size=%u\n", + (unsigned long long)sector, size); + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", + (unsigned long long)sector, size); + return; + } + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + ERR_IF(sector >= nr_sectors) return; + ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1); + + lbnr = BM_SECT_TO_BIT(nr_sectors-1); + + /* + * round up start sector, round down end sector. we make sure we only + * handle full, aligned, BM_BLOCK_SIZE (4K) blocks */ + if (unlikely(esector < BM_SECT_PER_BIT-1)) + return; + if (unlikely(esector == (nr_sectors-1))) + ebnr = lbnr; + else + ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); + sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); + + if (sbnr > ebnr) + return; + + /* + * ok, (capacity & 7) != 0 sometimes, but who cares... + * we count rs_{total,left} in bits, not sectors. + */ + spin_lock_irq(&mdev->al_lock); + count = drbd_bm_count_bits(mdev, sbnr, ebnr); + if (count) { + mdev->rs_failed += count; + + if (get_ldev(mdev)) { + drbd_try_clear_on_disk_bm(mdev, sector, count, FALSE); + put_ldev(mdev); + } + + /* just wake_up unconditional now, various lc_chaged(), + * lc_put() in drbd_try_clear_on_disk_bm(). */ + wake_up = 1; + } + spin_unlock_irq(&mdev->al_lock); + if (wake_up) + wake_up(&mdev->al_wait); +} diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c new file mode 100644 index 00000000000..b61057e7788 --- /dev/null +++ b/drivers/block/drbd/drbd_bitmap.c @@ -0,0 +1,1327 @@ +/* + drbd_bitmap.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2004-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2004-2008, Philipp Reisner . + Copyright (C) 2004-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include "drbd_int.h" + +/* OPAQUE outside this file! + * interface defined in drbd_int.h + + * convention: + * function name drbd_bm_... => used elsewhere, "public". + * function name bm_... => internal to implementation, "private". + + * Note that since find_first_bit returns int, at the current granularity of + * the bitmap (4KB per byte), this implementation "only" supports up to + * 1<<(32+12) == 16 TB... + */ + +/* + * NOTE + * Access to the *bm_pages is protected by bm_lock. + * It is safe to read the other members within the lock. + * + * drbd_bm_set_bits is called from bio_endio callbacks, + * We may be called with irq already disabled, + * so we need spin_lock_irqsave(). + * And we need the kmap_atomic. + */ +struct drbd_bitmap { + struct page **bm_pages; + spinlock_t bm_lock; + /* WARNING unsigned long bm_*: + * 32bit number of bit offset is just enough for 512 MB bitmap. + * it will blow up if we make the bitmap bigger... + * not that it makes much sense to have a bitmap that large, + * rather change the granularity to 16k or 64k or something. + * (that implies other problems, however...) + */ + unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ + unsigned long bm_bits; + size_t bm_words; + size_t bm_number_of_pages; + sector_t bm_dev_capacity; + struct semaphore bm_change; /* serializes resize operations */ + + atomic_t bm_async_io; + wait_queue_head_t bm_io_wait; + + unsigned long bm_flags; + + /* debugging aid, in case we are still racy somewhere */ + char *bm_why; + struct task_struct *bm_task; +}; + +/* definition of bits in bm_flags */ +#define BM_LOCKED 0 +#define BM_MD_IO_ERROR 1 +#define BM_P_VMALLOCED 2 + +static int bm_is_locked(struct drbd_bitmap *b) +{ + return test_bit(BM_LOCKED, &b->bm_flags); +} + +#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__) +static void __bm_print_lock_info(struct drbd_conf *mdev, const char *func) +{ + struct drbd_bitmap *b = mdev->bitmap; + if (!__ratelimit(&drbd_ratelimit_state)) + return; + dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n", + current == mdev->receiver.task ? "receiver" : + current == mdev->asender.task ? "asender" : + current == mdev->worker.task ? "worker" : current->comm, + func, b->bm_why ?: "?", + b->bm_task == mdev->receiver.task ? "receiver" : + b->bm_task == mdev->asender.task ? "asender" : + b->bm_task == mdev->worker.task ? "worker" : "?"); +} + +void drbd_bm_lock(struct drbd_conf *mdev, char *why) +{ + struct drbd_bitmap *b = mdev->bitmap; + int trylock_failed; + + if (!b) { + dev_err(DEV, "FIXME no bitmap in drbd_bm_lock!?\n"); + return; + } + + trylock_failed = down_trylock(&b->bm_change); + + if (trylock_failed) { + dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n", + current == mdev->receiver.task ? "receiver" : + current == mdev->asender.task ? "asender" : + current == mdev->worker.task ? "worker" : current->comm, + why, b->bm_why ?: "?", + b->bm_task == mdev->receiver.task ? "receiver" : + b->bm_task == mdev->asender.task ? "asender" : + b->bm_task == mdev->worker.task ? "worker" : "?"); + down(&b->bm_change); + } + if (__test_and_set_bit(BM_LOCKED, &b->bm_flags)) + dev_err(DEV, "FIXME bitmap already locked in bm_lock\n"); + + b->bm_why = why; + b->bm_task = current; +} + +void drbd_bm_unlock(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + if (!b) { + dev_err(DEV, "FIXME no bitmap in drbd_bm_unlock!?\n"); + return; + } + + if (!__test_and_clear_bit(BM_LOCKED, &mdev->bitmap->bm_flags)) + dev_err(DEV, "FIXME bitmap not locked in bm_unlock\n"); + + b->bm_why = NULL; + b->bm_task = NULL; + up(&b->bm_change); +} + +/* word offset to long pointer */ +static unsigned long *__bm_map_paddr(struct drbd_bitmap *b, unsigned long offset, const enum km_type km) +{ + struct page *page; + unsigned long page_nr; + + /* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */ + page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); + BUG_ON(page_nr >= b->bm_number_of_pages); + page = b->bm_pages[page_nr]; + + return (unsigned long *) kmap_atomic(page, km); +} + +static unsigned long * bm_map_paddr(struct drbd_bitmap *b, unsigned long offset) +{ + return __bm_map_paddr(b, offset, KM_IRQ1); +} + +static void __bm_unmap(unsigned long *p_addr, const enum km_type km) +{ + kunmap_atomic(p_addr, km); +}; + +static void bm_unmap(unsigned long *p_addr) +{ + return __bm_unmap(p_addr, KM_IRQ1); +} + +/* long word offset of _bitmap_ sector */ +#define S2W(s) ((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) +/* word offset from start of bitmap to word number _in_page_ + * modulo longs per page +#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) + hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) + so do it explicitly: + */ +#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) + +/* Long words per page */ +#define LWPP (PAGE_SIZE/sizeof(long)) + +/* + * actually most functions herein should take a struct drbd_bitmap*, not a + * struct drbd_conf*, but for the debug macros I like to have the mdev around + * to be able to report device specific. + */ + +static void bm_free_pages(struct page **pages, unsigned long number) +{ + unsigned long i; + if (!pages) + return; + + for (i = 0; i < number; i++) { + if (!pages[i]) { + printk(KERN_ALERT "drbd: bm_free_pages tried to free " + "a NULL pointer; i=%lu n=%lu\n", + i, number); + continue; + } + __free_page(pages[i]); + pages[i] = NULL; + } +} + +static void bm_vk_free(void *ptr, int v) +{ + if (v) + vfree(ptr); + else + kfree(ptr); +} + +/* + * "have" and "want" are NUMBER OF PAGES. + */ +static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) +{ + struct page **old_pages = b->bm_pages; + struct page **new_pages, *page; + unsigned int i, bytes, vmalloced = 0; + unsigned long have = b->bm_number_of_pages; + + BUG_ON(have == 0 && old_pages != NULL); + BUG_ON(have != 0 && old_pages == NULL); + + if (have == want) + return old_pages; + + /* Trying kmalloc first, falling back to vmalloc. + * GFP_KERNEL is ok, as this is done when a lower level disk is + * "attached" to the drbd. Context is receiver thread or cqueue + * thread. As we have no disk yet, we are not in the IO path, + * not even the IO path of the peer. */ + bytes = sizeof(struct page *)*want; + new_pages = kmalloc(bytes, GFP_KERNEL); + if (!new_pages) { + new_pages = vmalloc(bytes); + if (!new_pages) + return NULL; + vmalloced = 1; + } + + memset(new_pages, 0, bytes); + if (want >= have) { + for (i = 0; i < have; i++) + new_pages[i] = old_pages[i]; + for (; i < want; i++) { + page = alloc_page(GFP_HIGHUSER); + if (!page) { + bm_free_pages(new_pages + have, i - have); + bm_vk_free(new_pages, vmalloced); + return NULL; + } + new_pages[i] = page; + } + } else { + for (i = 0; i < want; i++) + new_pages[i] = old_pages[i]; + /* NOT HERE, we are outside the spinlock! + bm_free_pages(old_pages + want, have - want); + */ + } + + if (vmalloced) + set_bit(BM_P_VMALLOCED, &b->bm_flags); + else + clear_bit(BM_P_VMALLOCED, &b->bm_flags); + + return new_pages; +} + +/* + * called on driver init only. TODO call when a device is created. + * allocates the drbd_bitmap, and stores it in mdev->bitmap. + */ +int drbd_bm_init(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + WARN_ON(b != NULL); + b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL); + if (!b) + return -ENOMEM; + spin_lock_init(&b->bm_lock); + init_MUTEX(&b->bm_change); + init_waitqueue_head(&b->bm_io_wait); + + mdev->bitmap = b; + + return 0; +} + +sector_t drbd_bm_capacity(struct drbd_conf *mdev) +{ + ERR_IF(!mdev->bitmap) return 0; + return mdev->bitmap->bm_dev_capacity; +} + +/* called on driver unload. TODO: call when a device is destroyed. + */ +void drbd_bm_cleanup(struct drbd_conf *mdev) +{ + ERR_IF (!mdev->bitmap) return; + bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages); + bm_vk_free(mdev->bitmap->bm_pages, test_bit(BM_P_VMALLOCED, &mdev->bitmap->bm_flags)); + kfree(mdev->bitmap); + mdev->bitmap = NULL; +} + +/* + * since (b->bm_bits % BITS_PER_LONG) != 0, + * this masks out the remaining bits. + * Returns the number of bits cleared. + */ +static int bm_clear_surplus(struct drbd_bitmap *b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; + size_t w = b->bm_bits >> LN2_BPL; + int cleared = 0; + unsigned long *p_addr, *bm; + + p_addr = bm_map_paddr(b, w); + bm = p_addr + MLPP(w); + if (w < b->bm_words) { + cleared = hweight_long(*bm & ~mask); + *bm &= mask; + w++; bm++; + } + + if (w < b->bm_words) { + cleared += hweight_long(*bm); + *bm = 0; + } + bm_unmap(p_addr); + return cleared; +} + +static void bm_set_surplus(struct drbd_bitmap *b) +{ + const unsigned long mask = (1UL << (b->bm_bits & (BITS_PER_LONG-1))) - 1; + size_t w = b->bm_bits >> LN2_BPL; + unsigned long *p_addr, *bm; + + p_addr = bm_map_paddr(b, w); + bm = p_addr + MLPP(w); + if (w < b->bm_words) { + *bm |= ~mask; + bm++; w++; + } + + if (w < b->bm_words) { + *bm = ~(0UL); + } + bm_unmap(p_addr); +} + +static unsigned long __bm_count_bits(struct drbd_bitmap *b, const int swap_endian) +{ + unsigned long *p_addr, *bm, offset = 0; + unsigned long bits = 0; + unsigned long i, do_now; + + while (offset < b->bm_words) { + i = do_now = min_t(size_t, b->bm_words-offset, LWPP); + p_addr = __bm_map_paddr(b, offset, KM_USER0); + bm = p_addr + MLPP(offset); + while (i--) { +#ifndef __LITTLE_ENDIAN + if (swap_endian) + *bm = lel_to_cpu(*bm); +#endif + bits += hweight_long(*bm++); + } + __bm_unmap(p_addr, KM_USER0); + offset += do_now; + cond_resched(); + } + + return bits; +} + +static unsigned long bm_count_bits(struct drbd_bitmap *b) +{ + return __bm_count_bits(b, 0); +} + +static unsigned long bm_count_bits_swap_endian(struct drbd_bitmap *b) +{ + return __bm_count_bits(b, 1); +} + +/* offset and len in long words.*/ +static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len) +{ + unsigned long *p_addr, *bm; + size_t do_now, end; + +#define BM_SECTORS_PER_BIT (BM_BLOCK_SIZE/512) + + end = offset + len; + + if (end > b->bm_words) { + printk(KERN_ALERT "drbd: bm_memset end > bm_words\n"); + return; + } + + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset; + p_addr = bm_map_paddr(b, offset); + bm = p_addr + MLPP(offset); + if (bm+do_now > p_addr + LWPP) { + printk(KERN_ALERT "drbd: BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n", + p_addr, bm, (int)do_now); + break; /* breaks to after catch_oob_access_end() only! */ + } + memset(bm, c, do_now * sizeof(long)); + bm_unmap(p_addr); + offset += do_now; + } +} + +/* + * make sure the bitmap has enough room for the attached storage, + * if necessary, resize. + * called whenever we may have changed the device size. + * returns -ENOMEM if we could not allocate enough memory, 0 on success. + * In case this is actually a resize, we copy the old bitmap into the new one. + * Otherwise, the bitmap is initialized to all bits set. + */ +int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long bits, words, owords, obits, *p_addr, *bm; + unsigned long want, have, onpages; /* number of pages */ + struct page **npages, **opages = NULL; + int err = 0, growing; + int opages_vmalloced; + + ERR_IF(!b) return -ENOMEM; + + drbd_bm_lock(mdev, "resize"); + + dev_info(DEV, "drbd_bm_resize called with capacity == %llu\n", + (unsigned long long)capacity); + + if (capacity == b->bm_dev_capacity) + goto out; + + opages_vmalloced = test_bit(BM_P_VMALLOCED, &b->bm_flags); + + if (capacity == 0) { + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + onpages = b->bm_number_of_pages; + owords = b->bm_words; + b->bm_pages = NULL; + b->bm_number_of_pages = + b->bm_set = + b->bm_bits = + b->bm_words = + b->bm_dev_capacity = 0; + spin_unlock_irq(&b->bm_lock); + bm_free_pages(opages, onpages); + bm_vk_free(opages, opages_vmalloced); + goto out; + } + bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); + + /* if we would use + words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL; + a 32bit host could present the wrong number of words + to a 64bit host. + */ + words = ALIGN(bits, 64) >> LN2_BPL; + + if (get_ldev(mdev)) { + D_ASSERT((u64)bits <= (((u64)mdev->ldev->md.md_size_sect-MD_BM_OFFSET) << 12)); + put_ldev(mdev); + } + + /* one extra long to catch off by one errors */ + want = ALIGN((words+1)*sizeof(long), PAGE_SIZE) >> PAGE_SHIFT; + have = b->bm_number_of_pages; + if (want == have) { + D_ASSERT(b->bm_pages != NULL); + npages = b->bm_pages; + } else { + if (FAULT_ACTIVE(mdev, DRBD_FAULT_BM_ALLOC)) + npages = NULL; + else + npages = bm_realloc_pages(b, want); + } + + if (!npages) { + err = -ENOMEM; + goto out; + } + + spin_lock_irq(&b->bm_lock); + opages = b->bm_pages; + owords = b->bm_words; + obits = b->bm_bits; + + growing = bits > obits; + if (opages) + bm_set_surplus(b); + + b->bm_pages = npages; + b->bm_number_of_pages = want; + b->bm_bits = bits; + b->bm_words = words; + b->bm_dev_capacity = capacity; + + if (growing) { + bm_memset(b, owords, 0xff, words-owords); + b->bm_set += bits - obits; + } + + if (want < have) { + /* implicit: (opages != NULL) && (opages != npages) */ + bm_free_pages(opages + want, have - want); + } + + p_addr = bm_map_paddr(b, words); + bm = p_addr + MLPP(words); + *bm = DRBD_MAGIC; + bm_unmap(p_addr); + + (void)bm_clear_surplus(b); + + spin_unlock_irq(&b->bm_lock); + if (opages != npages) + bm_vk_free(opages, opages_vmalloced); + if (!growing) + b->bm_set = bm_count_bits(b); + dev_info(DEV, "resync bitmap: bits=%lu words=%lu\n", bits, words); + + out: + drbd_bm_unlock(mdev); + return err; +} + +/* inherently racy: + * if not protected by other means, return value may be out of date when + * leaving this function... + * we still need to lock it, since it is important that this returns + * bm_set == 0 precisely. + * + * maybe bm_set should be atomic_t ? + */ +static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long s; + unsigned long flags; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + s = b->bm_set; + spin_unlock_irqrestore(&b->bm_lock, flags); + + return s; +} + +unsigned long drbd_bm_total_weight(struct drbd_conf *mdev) +{ + unsigned long s; + /* if I don't have a disk, I don't know about out-of-sync status */ + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) + return 0; + s = _drbd_bm_total_weight(mdev); + put_ldev(mdev); + return s; +} + +size_t drbd_bm_words(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + return b->bm_words; +} + +unsigned long drbd_bm_bits(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return 0; + + return b->bm_bits; +} + +/* merge number words from buffer into the bitmap starting at offset. + * buffer[i] is expected to be little endian unsigned long. + * bitmap must be locked by drbd_bm_lock. + * currently only used from receive_bitmap. + */ +void drbd_bm_merge_lel(struct drbd_conf *mdev, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr, *bm; + unsigned long word, bits; + size_t end, do_now; + + end = offset + number; + + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + if (number == 0) + return; + WARN_ON(offset >= b->bm_words); + WARN_ON(end > b->bm_words); + + spin_lock_irq(&b->bm_lock); + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + p_addr = bm_map_paddr(b, offset); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) { + bits = hweight_long(*bm); + word = *bm | lel_to_cpu(*buffer++); + *bm++ = word; + b->bm_set += hweight_long(word) - bits; + } + bm_unmap(p_addr); + } + /* with 32bit <-> 64bit cross-platform connect + * this is only correct for current usage, + * where we _know_ that we are 64 bit aligned, + * and know that this function is used in this way, too... + */ + if (end == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + + spin_unlock_irq(&b->bm_lock); +} + +/* copy number words from the bitmap starting at offset into the buffer. + * buffer[i] will be little endian unsigned long. + */ +void drbd_bm_get_lel(struct drbd_conf *mdev, size_t offset, size_t number, + unsigned long *buffer) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr, *bm; + size_t end, do_now; + + end = offset + number; + + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + + spin_lock_irq(&b->bm_lock); + if ((offset >= b->bm_words) || + (end > b->bm_words) || + (number <= 0)) + dev_err(DEV, "offset=%lu number=%lu bm_words=%lu\n", + (unsigned long) offset, + (unsigned long) number, + (unsigned long) b->bm_words); + else { + while (offset < end) { + do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset; + p_addr = bm_map_paddr(b, offset); + bm = p_addr + MLPP(offset); + offset += do_now; + while (do_now--) + *buffer++ = cpu_to_lel(*bm++); + bm_unmap(p_addr); + } + } + spin_unlock_irq(&b->bm_lock); +} + +/* set all bits in the bitmap */ +void drbd_bm_set_all(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0xff, b->bm_words); + (void)bm_clear_surplus(b); + b->bm_set = b->bm_bits; + spin_unlock_irq(&b->bm_lock); +} + +/* clear all bits in the bitmap */ +void drbd_bm_clear_all(struct drbd_conf *mdev) +{ + struct drbd_bitmap *b = mdev->bitmap; + ERR_IF(!b) return; + ERR_IF(!b->bm_pages) return; + + spin_lock_irq(&b->bm_lock); + bm_memset(b, 0, 0, b->bm_words); + b->bm_set = 0; + spin_unlock_irq(&b->bm_lock); +} + +static void bm_async_io_complete(struct bio *bio, int error) +{ + struct drbd_bitmap *b = bio->bi_private; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! + * do we want to WARN() on this? */ + if (!error && !uptodate) + error = -EIO; + + if (error) { + /* doh. what now? + * for now, set all bits, and flag MD_IO_ERROR */ + __set_bit(BM_MD_IO_ERROR, &b->bm_flags); + } + if (atomic_dec_and_test(&b->bm_async_io)) + wake_up(&b->bm_io_wait); + + bio_put(bio); +} + +static void bm_page_io_async(struct drbd_conf *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) +{ + /* we are process context. we always get a bio */ + struct bio *bio = bio_alloc(GFP_KERNEL, 1); + unsigned int len; + sector_t on_disk_sector = + mdev->ldev->md.md_offset + mdev->ldev->md.bm_offset; + on_disk_sector += ((sector_t)page_nr) << (PAGE_SHIFT-9); + + /* this might happen with very small + * flexible external meta data device */ + len = min_t(unsigned int, PAGE_SIZE, + (drbd_md_last_sector(mdev->ldev) - on_disk_sector + 1)<<9); + + bio->bi_bdev = mdev->ldev->md_bdev; + bio->bi_sector = on_disk_sector; + bio_add_page(bio, b->bm_pages[page_nr], len, 0); + bio->bi_private = b; + bio->bi_end_io = bm_async_io_complete; + + if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { + bio->bi_rw |= rw; + bio_endio(bio, -EIO); + } else { + submit_bio(rw, bio); + } +} + +# if defined(__LITTLE_ENDIAN) + /* nothing to do, on disk == in memory */ +# define bm_cpu_to_lel(x) ((void)0) +# else +void bm_cpu_to_lel(struct drbd_bitmap *b) +{ + /* need to cpu_to_lel all the pages ... + * this may be optimized by using + * cpu_to_lel(-1) == -1 and cpu_to_lel(0) == 0; + * the following is still not optimal, but better than nothing */ + unsigned int i; + unsigned long *p_addr, *bm; + if (b->bm_set == 0) { + /* no page at all; avoid swap if all is 0 */ + i = b->bm_number_of_pages; + } else if (b->bm_set == b->bm_bits) { + /* only the last page */ + i = b->bm_number_of_pages - 1; + } else { + /* all pages */ + i = 0; + } + for (; i < b->bm_number_of_pages; i++) { + p_addr = kmap_atomic(b->bm_pages[i], KM_USER0); + for (bm = p_addr; bm < p_addr + PAGE_SIZE/sizeof(long); bm++) + *bm = cpu_to_lel(*bm); + kunmap_atomic(p_addr, KM_USER0); + } +} +# endif +/* lel_to_cpu == cpu_to_lel */ +# define bm_lel_to_cpu(x) bm_cpu_to_lel(x) + +/* + * bm_rw: read/write the whole bitmap from/to its on disk location. + */ +static int bm_rw(struct drbd_conf *mdev, int rw) __must_hold(local) +{ + struct drbd_bitmap *b = mdev->bitmap; + /* sector_t sector; */ + int bm_words, num_pages, i; + unsigned long now; + char ppb[10]; + int err = 0; + + WARN_ON(!bm_is_locked(b)); + + /* no spinlock here, the drbd_bm_lock should be enough! */ + + bm_words = drbd_bm_words(mdev); + num_pages = (bm_words*sizeof(long) + PAGE_SIZE-1) >> PAGE_SHIFT; + + /* on disk bitmap is little endian */ + if (rw == WRITE) + bm_cpu_to_lel(b); + + now = jiffies; + atomic_set(&b->bm_async_io, num_pages); + __clear_bit(BM_MD_IO_ERROR, &b->bm_flags); + + /* let the layers below us try to merge these bios... */ + for (i = 0; i < num_pages; i++) + bm_page_io_async(mdev, b, i, rw); + + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->md_bdev)); + wait_event(b->bm_io_wait, atomic_read(&b->bm_async_io) == 0); + + if (test_bit(BM_MD_IO_ERROR, &b->bm_flags)) { + dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); + drbd_chk_io_error(mdev, 1, TRUE); + err = -EIO; + } + + now = jiffies; + if (rw == WRITE) { + /* swap back endianness */ + bm_lel_to_cpu(b); + /* flush bitmap to stable storage */ + drbd_md_flush(mdev); + } else /* rw == READ */ { + /* just read, if necessary adjust endianness */ + b->bm_set = bm_count_bits_swap_endian(b); + dev_info(DEV, "recounting of set bits took additional %lu jiffies\n", + jiffies - now); + } + now = b->bm_set; + + dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", + ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + + return err; +} + +/** + * drbd_bm_read() - Read the whole bitmap from its on disk location. + * @mdev: DRBD device. + */ +int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, READ); +} + +/** + * drbd_bm_write() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + */ +int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE); +} + +/** + * drbd_bm_write_sect: Writes a 512 (MD_SECTOR_SIZE) byte piece of the bitmap + * @mdev: DRBD device. + * @enr: Extent number in the resync lru (happens to be sector offset) + * + * The BM_EXT_SIZE is on purpose exactly the amount of the bitmap covered + * by a single sector write. Therefore enr == sector offset from the + * start of the bitmap. + */ +int drbd_bm_write_sect(struct drbd_conf *mdev, unsigned long enr) __must_hold(local) +{ + sector_t on_disk_sector = enr + mdev->ldev->md.md_offset + + mdev->ldev->md.bm_offset; + int bm_words, num_words, offset; + int err = 0; + + mutex_lock(&mdev->md_io_mutex); + bm_words = drbd_bm_words(mdev); + offset = S2W(enr); /* word offset into bitmap */ + num_words = min(S2W(1), bm_words - offset); + if (num_words < S2W(1)) + memset(page_address(mdev->md_io_page), 0, MD_SECTOR_SIZE); + drbd_bm_get_lel(mdev, offset, num_words, + page_address(mdev->md_io_page)); + if (!drbd_md_sync_page_io(mdev, mdev->ldev, on_disk_sector, WRITE)) { + int i; + err = -EIO; + dev_err(DEV, "IO ERROR writing bitmap sector %lu " + "(meta-disk sector %llus)\n", + enr, (unsigned long long)on_disk_sector); + drbd_chk_io_error(mdev, 1, TRUE); + for (i = 0; i < AL_EXT_PER_BM_SECT; i++) + drbd_bm_ALe_set_all(mdev, enr*AL_EXT_PER_BM_SECT+i); + } + mdev->bm_writ_cnt++; + mutex_unlock(&mdev->md_io_mutex); + return err; +} + +/* NOTE + * find_first_bit returns int, we return unsigned long. + * should not make much difference anyways, but ... + * + * this returns a bit number, NOT a sector! + */ +#define BPP_MASK ((1UL << (PAGE_SHIFT+3)) - 1) +static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, + const int find_zero_bit, const enum km_type km) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + unsigned long *p_addr; + unsigned long bit_offset; /* bit offset of the mapped page. */ + + if (bm_fo > b->bm_bits) { + dev_err(DEV, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits); + } else { + while (bm_fo < b->bm_bits) { + unsigned long offset; + bit_offset = bm_fo & ~BPP_MASK; /* bit offset of the page */ + offset = bit_offset >> LN2_BPL; /* word offset of the page */ + p_addr = __bm_map_paddr(b, offset, km); + + if (find_zero_bit) + i = find_next_zero_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); + else + i = find_next_bit(p_addr, PAGE_SIZE*8, bm_fo & BPP_MASK); + + __bm_unmap(p_addr, km); + if (i < PAGE_SIZE*8) { + i = bit_offset + i; + if (i >= b->bm_bits) + break; + goto found; + } + bm_fo = bit_offset + PAGE_SIZE*8; + } + i = -1UL; + } + found: + return i; +} + +static unsigned long bm_find_next(struct drbd_conf *mdev, + unsigned long bm_fo, const int find_zero_bit) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long i = -1UL; + + ERR_IF(!b) return i; + ERR_IF(!b->bm_pages) return i; + + spin_lock_irq(&b->bm_lock); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + + i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); + + spin_unlock_irq(&b->bm_lock); + return i; +} + +unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) +{ + return bm_find_next(mdev, bm_fo, 0); +} + +#if 0 +/* not yet needed for anything. */ +unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) +{ + return bm_find_next(mdev, bm_fo, 1); +} +#endif + +/* does not spin_lock_irqsave. + * you must take drbd_bm_lock() first */ +unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) +{ + /* WARN_ON(!bm_is_locked(mdev)); */ + return __bm_find_next(mdev, bm_fo, 0, KM_USER1); +} + +unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) +{ + /* WARN_ON(!bm_is_locked(mdev)); */ + return __bm_find_next(mdev, bm_fo, 1, KM_USER1); +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector. + * expected to be called for only a few bits (e - s about BITS_PER_LONG). + * Must hold bitmap lock already. */ +int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, + unsigned long e, int val, const enum km_type km) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr = NULL; + unsigned long bitnr; + unsigned long last_page_nr = -1UL; + int c = 0; + + if (e >= b->bm_bits) { + dev_err(DEV, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n", + s, e, b->bm_bits); + e = b->bm_bits ? b->bm_bits -1 : 0; + } + for (bitnr = s; bitnr <= e; bitnr++) { + unsigned long offset = bitnr>>LN2_BPL; + unsigned long page_nr = offset >> (PAGE_SHIFT - LN2_BPL + 3); + if (page_nr != last_page_nr) { + if (p_addr) + __bm_unmap(p_addr, km); + p_addr = __bm_map_paddr(b, offset, km); + last_page_nr = page_nr; + } + if (val) + c += (0 == __test_and_set_bit(bitnr & BPP_MASK, p_addr)); + else + c -= (0 != __test_and_clear_bit(bitnr & BPP_MASK, p_addr)); + } + if (p_addr) + __bm_unmap(p_addr, km); + b->bm_set += c; + return c; +} + +/* returns number of bits actually changed. + * for val != 0, we change 0 -> 1, return code positive + * for val == 0, we change 1 -> 0, return code negative + * wants bitnr, not sector */ +int bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, + const unsigned long e, int val) +{ + unsigned long flags; + struct drbd_bitmap *b = mdev->bitmap; + int c = 0; + + ERR_IF(!b) return 1; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + + c = __bm_change_bits_to(mdev, s, e, val, KM_IRQ1); + + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + +/* returns number of bits changed 0 -> 1 */ +int drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + return bm_change_bits_to(mdev, s, e, 1); +} + +/* returns number of bits changed 1 -> 0 */ +int drbd_bm_clear_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + return -bm_change_bits_to(mdev, s, e, 0); +} + +/* sets all bits in full words, + * from first_word up to, but not including, last_word */ +static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, + int page_nr, int first_word, int last_word) +{ + int i; + int bits; + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_USER0); + for (i = first_word; i < last_word; i++) { + bits = hweight_long(paddr[i]); + paddr[i] = ~0UL; + b->bm_set += BITS_PER_LONG - bits; + } + kunmap_atomic(paddr, KM_USER0); +} + +/* Same thing as drbd_bm_set_bits, but without taking the spin_lock_irqsave. + * You must first drbd_bm_lock(). + * Can be called to set the whole bitmap in one go. + * Sets bits from s to e _inclusive_. */ +void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + /* First set_bit from the first bit (s) + * up to the next long boundary (sl), + * then assign full words up to the last long boundary (el), + * then set_bit up to and including the last bit (e). + * + * Do not use memset, because we must account for changes, + * so we need to loop over the words with hweight() anyways. + */ + unsigned long sl = ALIGN(s,BITS_PER_LONG); + unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1); + int first_page; + int last_page; + int page_nr; + int first_word; + int last_word; + + if (e - s <= 3*BITS_PER_LONG) { + /* don't bother; el and sl may even be wrong. */ + __bm_change_bits_to(mdev, s, e, 1, KM_USER0); + return; + } + + /* difference is large enough that we can trust sl and el */ + + /* bits filling the current long */ + if (sl) + __bm_change_bits_to(mdev, s, sl-1, 1, KM_USER0); + + first_page = sl >> (3 + PAGE_SHIFT); + last_page = el >> (3 + PAGE_SHIFT); + + /* MLPP: modulo longs per page */ + /* LWPP: long words per page */ + first_word = MLPP(sl >> LN2_BPL); + last_word = LWPP; + + /* first and full pages, unless first page == last page */ + for (page_nr = first_page; page_nr < last_page; page_nr++) { + bm_set_full_words_within_one_page(mdev->bitmap, page_nr, first_word, last_word); + cond_resched(); + first_word = 0; + } + + /* last page (respectively only page, for first page == last page) */ + last_word = MLPP(el >> LN2_BPL); + bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); + + /* possibly trailing bits. + * example: (e & 63) == 63, el will be e+1. + * if that even was the very last bit, + * it would trigger an assert in __bm_change_bits_to() + */ + if (el <= e) + __bm_change_bits_to(mdev, el, e, 1, KM_USER0); +} + +/* returns bit state + * wants bitnr, NOT sector. + * inherently racy... area needs to be locked by means of {al,rs}_lru + * 1 ... bit set + * 0 ... bit not set + * -1 ... first out of bounds access, stop testing for bits! + */ +int drbd_bm_test_bit(struct drbd_conf *mdev, const unsigned long bitnr) +{ + unsigned long flags; + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr; + int i; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + if (bitnr < b->bm_bits) { + unsigned long offset = bitnr>>LN2_BPL; + p_addr = bm_map_paddr(b, offset); + i = test_bit(bitnr & BPP_MASK, p_addr) ? 1 : 0; + bm_unmap(p_addr); + } else if (bitnr == b->bm_bits) { + i = -1; + } else { /* (bitnr > b->bm_bits) */ + dev_err(DEV, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits); + i = 0; + } + + spin_unlock_irqrestore(&b->bm_lock, flags); + return i; +} + +/* returns number of bits set in the range [s, e] */ +int drbd_bm_count_bits(struct drbd_conf *mdev, const unsigned long s, const unsigned long e) +{ + unsigned long flags; + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr = NULL, page_nr = -1; + unsigned long bitnr; + int c = 0; + size_t w; + + /* If this is called without a bitmap, that is a bug. But just to be + * robust in case we screwed up elsewhere, in that case pretend there + * was one dirty bit in the requested area, so we won't try to do a + * local read there (no bitmap probably implies no disk) */ + ERR_IF(!b) return 1; + ERR_IF(!b->bm_pages) return 1; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + for (bitnr = s; bitnr <= e; bitnr++) { + w = bitnr >> LN2_BPL; + if (page_nr != w >> (PAGE_SHIFT - LN2_BPL + 3)) { + page_nr = w >> (PAGE_SHIFT - LN2_BPL + 3); + if (p_addr) + bm_unmap(p_addr); + p_addr = bm_map_paddr(b, w); + } + ERR_IF (bitnr >= b->bm_bits) { + dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits); + } else { + c += (0 != test_bit(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr)); + } + } + if (p_addr) + bm_unmap(p_addr); + spin_unlock_irqrestore(&b->bm_lock, flags); + return c; +} + + +/* inherently racy... + * return value may be already out-of-date when this function returns. + * but the general usage is that this is only use during a cstate when bits are + * only cleared, not set, and typically only care for the case when the return + * value is zero, or we already "locked" this "bitmap extent" by other means. + * + * enr is bm-extent number, since we chose to name one sector (512 bytes) + * worth of the bitmap a "bitmap extent". + * + * TODO + * I think since we use it like a reference count, we should use the real + * reference count of some bitmap extent element from some lru instead... + * + */ +int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + int count, s, e; + unsigned long flags; + unsigned long *p_addr, *bm; + + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irqsave(&b->bm_lock, flags); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + + s = S2W(enr); + e = min((size_t)S2W(enr+1), b->bm_words); + count = 0; + if (s < b->bm_words) { + int n = e-s; + p_addr = bm_map_paddr(b, s); + bm = p_addr + MLPP(s); + while (n--) + count += hweight_long(*bm++); + bm_unmap(p_addr); + } else { + dev_err(DEV, "start offset (%d) too large in drbd_bm_e_weight\n", s); + } + spin_unlock_irqrestore(&b->bm_lock, flags); + return count; +} + +/* set all bits covered by the AL-extent al_enr */ +unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr) +{ + struct drbd_bitmap *b = mdev->bitmap; + unsigned long *p_addr, *bm; + unsigned long weight; + int count, s, e, i, do_now; + ERR_IF(!b) return 0; + ERR_IF(!b->bm_pages) return 0; + + spin_lock_irq(&b->bm_lock); + if (bm_is_locked(b)) + bm_print_lock_info(mdev); + weight = b->bm_set; + + s = al_enr * BM_WORDS_PER_AL_EXT; + e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words); + /* assert that s and e are on the same page */ + D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3) + == s >> (PAGE_SHIFT - LN2_BPL + 3)); + count = 0; + if (s < b->bm_words) { + i = do_now = e-s; + p_addr = bm_map_paddr(b, s); + bm = p_addr + MLPP(s); + while (i--) { + count += hweight_long(*bm); + *bm = -1UL; + bm++; + } + bm_unmap(p_addr); + b->bm_set += do_now*BITS_PER_LONG - count; + if (e == b->bm_words) + b->bm_set -= bm_clear_surplus(b); + } else { + dev_err(DEV, "start offset (%d) too large in drbd_bm_ALe_set_all\n", s); + } + weight = b->bm_set - weight; + spin_unlock_irq(&b->bm_lock); + return weight; +} diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h new file mode 100644 index 00000000000..8da602e010b --- /dev/null +++ b/drivers/block/drbd/drbd_int.h @@ -0,0 +1,2258 @@ +/* + drbd_int.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#ifndef _DRBD_INT_H +#define _DRBD_INT_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __CHECKER__ +# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) +# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) +# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) +# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) +#else +# define __protected_by(x) +# define __protected_read_by(x) +# define __protected_write_by(x) +# define __must_hold(x) +#endif + +#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) + +/* module parameter, defined in drbd_main.c */ +extern unsigned int minor_count; +extern int disable_sendpage; +extern int allow_oos; +extern unsigned int cn_idx; + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern int enable_faults; +extern int fault_rate; +extern int fault_devs; +#endif + +extern char usermode_helper[]; + + +#ifndef TRUE +#define TRUE 1 +#endif +#ifndef FALSE +#define FALSE 0 +#endif + +/* I don't remember why XCPU ... + * This is used to wake the asender, + * and to interrupt sending the sending task + * on disconnect. + */ +#define DRBD_SIG SIGXCPU + +/* This is used to stop/restart our threads. + * Cannot use SIGTERM nor SIGKILL, since these + * are sent out by init on runlevel changes + * I choose SIGHUP for now. + */ +#define DRBD_SIGKILL SIGHUP + +/* All EEs on the free list should have ID_VACANT (== 0) + * freshly allocated EEs get !ID_VACANT (== 1) + * so if it says "cannot dereference null pointer at adress 0x00000001", + * it is most likely one of these :( */ + +#define ID_IN_SYNC (4711ULL) +#define ID_OUT_OF_SYNC (4712ULL) + +#define ID_SYNCER (-1ULL) +#define ID_VACANT 0 +#define is_syncer_block_id(id) ((id) == ID_SYNCER) + +struct drbd_conf; + + +/* to shorten dev_warn(DEV, "msg"); and relatives statements */ +#define DEV (disk_to_dev(mdev->vdisk)) + +#define D_ASSERT(exp) if (!(exp)) \ + dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__) + +#define ERR_IF(exp) if (({ \ + int _b = (exp) != 0; \ + if (_b) dev_err(DEV, "%s: (%s) in %s:%d\n", \ + __func__, #exp, __FILE__, __LINE__); \ + _b; \ + })) + +/* Defines to control fault insertion */ +enum { + DRBD_FAULT_MD_WR = 0, /* meta data write */ + DRBD_FAULT_MD_RD = 1, /* read */ + DRBD_FAULT_RS_WR = 2, /* resync */ + DRBD_FAULT_RS_RD = 3, + DRBD_FAULT_DT_WR = 4, /* data */ + DRBD_FAULT_DT_RD = 5, + DRBD_FAULT_DT_RA = 6, /* data read ahead */ + DRBD_FAULT_BM_ALLOC = 7, /* bitmap allocation */ + DRBD_FAULT_AL_EE = 8, /* alloc ee */ + + DRBD_FAULT_MAX, +}; + +extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +extern unsigned int +_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); +static inline int +drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) { + return fault_rate && + (enable_faults & (1< P_MAY_IGNORE) ... */ + P_MAX_OPT_CMD = 0x101, + + /* special command ids for handshake */ + + P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */ + P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */ + + P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */ +}; + +static inline const char *cmdname(enum drbd_packets cmd) +{ + /* THINK may need to become several global tables + * when we want to support more than + * one PRO_VERSION */ + static const char *cmdnames[] = { + [P_DATA] = "Data", + [P_DATA_REPLY] = "DataReply", + [P_RS_DATA_REPLY] = "RSDataReply", + [P_BARRIER] = "Barrier", + [P_BITMAP] = "ReportBitMap", + [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget", + [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource", + [P_UNPLUG_REMOTE] = "UnplugRemote", + [P_DATA_REQUEST] = "DataRequest", + [P_RS_DATA_REQUEST] = "RSDataRequest", + [P_SYNC_PARAM] = "SyncParam", + [P_SYNC_PARAM89] = "SyncParam89", + [P_PROTOCOL] = "ReportProtocol", + [P_UUIDS] = "ReportUUIDs", + [P_SIZES] = "ReportSizes", + [P_STATE] = "ReportState", + [P_SYNC_UUID] = "ReportSyncUUID", + [P_AUTH_CHALLENGE] = "AuthChallenge", + [P_AUTH_RESPONSE] = "AuthResponse", + [P_PING] = "Ping", + [P_PING_ACK] = "PingAck", + [P_RECV_ACK] = "RecvAck", + [P_WRITE_ACK] = "WriteAck", + [P_RS_WRITE_ACK] = "RSWriteAck", + [P_DISCARD_ACK] = "DiscardAck", + [P_NEG_ACK] = "NegAck", + [P_NEG_DREPLY] = "NegDReply", + [P_NEG_RS_DREPLY] = "NegRSDReply", + [P_BARRIER_ACK] = "BarrierAck", + [P_STATE_CHG_REQ] = "StateChgRequest", + [P_STATE_CHG_REPLY] = "StateChgReply", + [P_OV_REQUEST] = "OVRequest", + [P_OV_REPLY] = "OVReply", + [P_OV_RESULT] = "OVResult", + [P_MAX_CMD] = NULL, + }; + + if (cmd == P_HAND_SHAKE_M) + return "HandShakeM"; + if (cmd == P_HAND_SHAKE_S) + return "HandShakeS"; + if (cmd == P_HAND_SHAKE) + return "HandShake"; + if (cmd >= P_MAX_CMD) + return "Unknown"; + return cmdnames[cmd]; +} + +/* for sending/receiving the bitmap, + * possibly in some encoding scheme */ +struct bm_xfer_ctx { + /* "const" + * stores total bits and long words + * of the bitmap, so we don't need to + * call the accessor functions over and again. */ + unsigned long bm_bits; + unsigned long bm_words; + /* during xfer, current position within the bitmap */ + unsigned long bit_offset; + unsigned long word_offset; + + /* statistics; index: (h->command == P_BITMAP) */ + unsigned packets[2]; + unsigned bytes[2]; +}; + +extern void INFO_bm_xfer_stats(struct drbd_conf *mdev, + const char *direction, struct bm_xfer_ctx *c); + +static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c) +{ + /* word_offset counts "native long words" (32 or 64 bit), + * aligned at 64 bit. + * Encoded packet may end at an unaligned bit offset. + * In case a fallback clear text packet is transmitted in + * between, we adjust this offset back to the last 64bit + * aligned "native long word", which makes coding and decoding + * the plain text bitmap much more convenient. */ +#if BITS_PER_LONG == 64 + c->word_offset = c->bit_offset >> 6; +#elif BITS_PER_LONG == 32 + c->word_offset = c->bit_offset >> 5; + c->word_offset &= ~(1UL); +#else +# error "unsupported BITS_PER_LONG" +#endif +} + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +/* This is the layout for a packet on the wire. + * The byteorder is the network byte order. + * (except block_id and barrier fields. + * these are pointers to local structs + * and have no relevance for the partner, + * which just echoes them as received.) + * + * NOTE that the payload starts at a long aligned offset, + * regardless of 32 or 64 bit arch! + */ +struct p_header { + u32 magic; + u16 command; + u16 length; /* bytes of data after this header */ + u8 payload[0]; +} __packed; +/* 8 bytes. packet FIXED for the next century! */ + +/* + * short commands, packets without payload, plain p_header: + * P_PING + * P_PING_ACK + * P_BECOME_SYNC_TARGET + * P_BECOME_SYNC_SOURCE + * P_UNPLUG_REMOTE + */ + +/* + * commands with out-of-struct payload: + * P_BITMAP (no additional fields) + * P_DATA, P_DATA_REPLY (see p_data) + * P_COMPRESSED_BITMAP (see receive_compressed_bitmap) + */ + +/* these defines must not be changed without changing the protocol version */ +#define DP_HARDBARRIER 1 +#define DP_RW_SYNC 2 +#define DP_MAY_SET_IN_SYNC 4 + +struct p_data { + struct p_header head; + u64 sector; /* 64 bits sector number */ + u64 block_id; /* to identify the request in protocol B&C */ + u32 seq_num; + u32 dp_flags; +} __packed; + +/* + * commands which share a struct: + * p_block_ack: + * P_RECV_ACK (proto B), P_WRITE_ACK (proto C), + * P_DISCARD_ACK (proto C, two-primaries conflict detection) + * p_block_req: + * P_DATA_REQUEST, P_RS_DATA_REQUEST + */ +struct p_block_ack { + struct p_header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 seq_num; +} __packed; + + +struct p_block_req { + struct p_header head; + u64 sector; + u64 block_id; + u32 blksize; + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +/* + * commands with their own struct for additional fields: + * P_HAND_SHAKE + * P_BARRIER + * P_BARRIER_ACK + * P_SYNC_PARAM + * ReportParams + */ + +struct p_handshake { + struct p_header head; /* 8 bytes */ + u32 protocol_min; + u32 feature_flags; + u32 protocol_max; + + /* should be more than enough for future enhancements + * for now, feature_flags and the reserverd array shall be zero. + */ + + u32 _pad; + u64 reserverd[7]; +} __packed; +/* 80 bytes, FIXED for the next century */ + +struct p_barrier { + struct p_header head; + u32 barrier; /* barrier number _handle_ only */ + u32 pad; /* to multiple of 8 Byte */ +} __packed; + +struct p_barrier_ack { + struct p_header head; + u32 barrier; + u32 set_size; +} __packed; + +struct p_rs_param { + struct p_header head; + u32 rate; + + /* Since protocol version 88 and higher. */ + char verify_alg[0]; +} __packed; + +struct p_rs_param_89 { + struct p_header head; + u32 rate; + /* protocol version 89: */ + char verify_alg[SHARED_SECRET_MAX]; + char csums_alg[SHARED_SECRET_MAX]; +} __packed; + +struct p_protocol { + struct p_header head; + u32 protocol; + u32 after_sb_0p; + u32 after_sb_1p; + u32 after_sb_2p; + u32 want_lose; + u32 two_primaries; + + /* Since protocol version 87 and higher. */ + char integrity_alg[0]; + +} __packed; + +struct p_uuids { + struct p_header head; + u64 uuid[UI_EXTENDED_SIZE]; +} __packed; + +struct p_rs_uuid { + struct p_header head; + u64 uuid; +} __packed; + +struct p_sizes { + struct p_header head; + u64 d_size; /* size of disk */ + u64 u_size; /* user requested size */ + u64 c_size; /* current exported size */ + u32 max_segment_size; /* Maximal size of a BIO */ + u32 queue_order_type; +} __packed; + +struct p_state { + struct p_header head; + u32 state; +} __packed; + +struct p_req_state { + struct p_header head; + u32 mask; + u32 val; +} __packed; + +struct p_req_state_reply { + struct p_header head; + u32 retcode; +} __packed; + +struct p_drbd06_param { + u64 size; + u32 state; + u32 blksize; + u32 protocol; + u32 version; + u32 gen_cnt[5]; + u32 bit_map_gen[5]; +} __packed; + +struct p_discard { + struct p_header head; + u64 block_id; + u32 seq_num; + u32 pad; +} __packed; + +/* Valid values for the encoding field. + * Bump proto version when changing this. */ +enum drbd_bitmap_code { + /* RLE_VLI_Bytes = 0, + * and other bit variants had been defined during + * algorithm evaluation. */ + RLE_VLI_Bits = 2, +}; + +struct p_compressed_bm { + struct p_header head; + /* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code + * (encoding & 0x80): polarity (set/unset) of first runlength + * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits + * used to pad up to head.length bytes + */ + u8 encoding; + + u8 code[0]; +} __packed; + +/* DCBP: Drbd Compressed Bitmap Packet ... */ +static inline enum drbd_bitmap_code +DCBP_get_code(struct p_compressed_bm *p) +{ + return (enum drbd_bitmap_code)(p->encoding & 0x0f); +} + +static inline void +DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code) +{ + BUG_ON(code & ~0xf); + p->encoding = (p->encoding & ~0xf) | code; +} + +static inline int +DCBP_get_start(struct p_compressed_bm *p) +{ + return (p->encoding & 0x80) != 0; +} + +static inline void +DCBP_set_start(struct p_compressed_bm *p, int set) +{ + p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0); +} + +static inline int +DCBP_get_pad_bits(struct p_compressed_bm *p) +{ + return (p->encoding >> 4) & 0x7; +} + +static inline void +DCBP_set_pad_bits(struct p_compressed_bm *p, int n) +{ + BUG_ON(n & ~0x7); + p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4); +} + +/* one bitmap packet, including the p_header, + * should fit within one _architecture independend_ page. + * so we need to use the fixed size 4KiB page size + * most architechtures have used for a long time. + */ +#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header)) +#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long)) +#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm)) +#if (PAGE_SIZE < 4096) +/* drbd_send_bitmap / receive_bitmap would break horribly */ +#error "PAGE_SIZE too small" +#endif + +union p_polymorph { + struct p_header header; + struct p_handshake handshake; + struct p_data data; + struct p_block_ack block_ack; + struct p_barrier barrier; + struct p_barrier_ack barrier_ack; + struct p_rs_param_89 rs_param_89; + struct p_protocol protocol; + struct p_sizes sizes; + struct p_uuids uuids; + struct p_state state; + struct p_req_state req_state; + struct p_req_state_reply req_state_reply; + struct p_block_req block_req; +} __packed; + +/**********************************************************************/ +enum drbd_thread_state { + None, + Running, + Exiting, + Restarting +}; + +struct drbd_thread { + spinlock_t t_lock; + struct task_struct *task; + struct completion stop; + enum drbd_thread_state t_state; + int (*function) (struct drbd_thread *); + struct drbd_conf *mdev; + int reset_cpu_mask; +}; + +static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi) +{ + /* THINK testing the t_state seems to be uncritical in all cases + * (but thread_{start,stop}), so we can read it *without* the lock. + * --lge */ + + smp_rmb(); + return thi->t_state; +} + + +/* + * Having this as the first member of a struct provides sort of "inheritance". + * "derived" structs can be "drbd_queue_work()"ed. + * The callback should know and cast back to the descendant struct. + * drbd_request and drbd_epoch_entry are descendants of drbd_work. + */ +struct drbd_work; +typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel); +struct drbd_work { + struct list_head list; + drbd_work_cb cb; +}; + +struct drbd_tl_epoch; +struct drbd_request { + struct drbd_work w; + struct drbd_conf *mdev; + + /* if local IO is not allowed, will be NULL. + * if local IO _is_ allowed, holds the locally submitted bio clone, + * or, after local IO completion, the ERR_PTR(error). + * see drbd_endio_pri(). */ + struct bio *private_bio; + + struct hlist_node colision; + sector_t sector; + unsigned int size; + unsigned int epoch; /* barrier_nr */ + + /* barrier_nr: used to check on "completion" whether this req was in + * the current epoch, and we therefore have to close it, + * starting a new epoch... + */ + + /* up to here, the struct layout is identical to drbd_epoch_entry; + * we might be able to use that to our advantage... */ + + struct list_head tl_requests; /* ring list in the transfer log */ + struct bio *master_bio; /* master bio pointer */ + unsigned long rq_state; /* see comments above _req_mod() */ + int seq_num; + unsigned long start_time; +}; + +struct drbd_tl_epoch { + struct drbd_work w; + struct list_head requests; /* requests before */ + struct drbd_tl_epoch *next; /* pointer to the next barrier */ + unsigned int br_number; /* the barriers identifier. */ + int n_req; /* number of requests attached before this barrier */ +}; + +struct drbd_request; + +/* These Tl_epoch_entries may be in one of 6 lists: + active_ee .. data packet being written + sync_ee .. syncer block being written + done_ee .. block written, need to send P_WRITE_ACK + read_ee .. [RS]P_DATA_REQUEST being read +*/ + +struct drbd_epoch { + struct list_head list; + unsigned int barrier_nr; + atomic_t epoch_size; /* increased on every request added. */ + atomic_t active; /* increased on every req. added, and dec on every finished. */ + unsigned long flags; +}; + +/* drbd_epoch flag bits */ +enum { + DE_BARRIER_IN_NEXT_EPOCH_ISSUED, + DE_BARRIER_IN_NEXT_EPOCH_DONE, + DE_CONTAINS_A_BARRIER, + DE_HAVE_BARRIER_NUMBER, + DE_IS_FINISHING, +}; + +enum epoch_event { + EV_PUT, + EV_GOT_BARRIER_NR, + EV_BARRIER_DONE, + EV_BECAME_LAST, + EV_TRACE_FLUSH, /* TRACE_ are not real events, only used for tracing */ + EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */ + EV_TRACE_SETTING_BI, /* Barrier is expressed with the first write of the next epoch */ + EV_TRACE_ALLOC, + EV_TRACE_FREE, + EV_CLEANUP = 32, /* used as flag */ +}; + +struct drbd_epoch_entry { + struct drbd_work w; + struct drbd_conf *mdev; + struct bio *private_bio; + struct hlist_node colision; + sector_t sector; + unsigned int size; + struct drbd_epoch *epoch; + + /* up to here, the struct layout is identical to drbd_request; + * we might be able to use that to our advantage... */ + + unsigned int flags; + u64 block_id; +}; + +struct drbd_wq_barrier { + struct drbd_work w; + struct completion done; +}; + +struct digest_info { + int digest_size; + void *digest; +}; + +/* ee flag bits */ +enum { + __EE_CALL_AL_COMPLETE_IO, + __EE_CONFLICT_PENDING, + __EE_MAY_SET_IN_SYNC, + __EE_IS_BARRIER, +}; +#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) +#define EE_CONFLICT_PENDING (1<<__EE_CONFLICT_PENDING) +#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) +#define EE_IS_BARRIER (1<<__EE_IS_BARRIER) + +/* global flag bits */ +enum { + CREATE_BARRIER, /* next P_DATA is preceeded by a P_BARRIER */ + SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, /* whether asender should send a ping asap */ + + STOP_SYNC_TIMER, /* tell timer to cancel itself */ + UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ + UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ + MD_DIRTY, /* current uuids and flags not yet on disk */ + DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ + USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */ + CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */ + CL_ST_CHG_SUCCESS, + CL_ST_CHG_FAIL, + CRASHED_PRIMARY, /* This node was a crashed primary. + * Gets cleared when the state.conn + * goes into C_CONNECTED state. */ + WRITE_BM_AFTER_RESYNC, /* A kmalloc() during resync failed */ + NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ + CONSIDER_RESYNC, + + MD_NO_BARRIER, /* meta data device does not support barriers, + so don't even try */ + SUSPEND_IO, /* suspend application io */ + BITMAP_IO, /* suspend application io; + once no more io in flight, start bitmap io */ + BITMAP_IO_QUEUED, /* Started bitmap IO */ + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ + NET_CONGESTED, /* The data socket is congested */ + + CONFIG_PENDING, /* serialization of (re)configuration requests. + * if set, also prevents the device from dying */ + DEVICE_DYING, /* device became unconfigured, + * but worker thread is still handling the cleanup. + * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed, + * while this is set. */ + RESIZE_PENDING, /* Size change detected locally, waiting for the response from + * the peer, if it changed there as well. */ +}; + +struct drbd_bitmap; /* opaque for drbd_conf */ + +/* TODO sort members for performance + * MAYBE group them further */ + +/* THINK maybe we actually want to use the default "event/%s" worker threads + * or similar in linux 2.6, which uses per cpu data and threads. + * + * To be general, this might need a spin_lock member. + * For now, please use the mdev->req_lock to protect list_head, + * see drbd_queue_work below. + */ +struct drbd_work_queue { + struct list_head q; + struct semaphore s; /* producers up it, worker down()s it */ + spinlock_t q_lock; /* to protect the list. */ +}; + +struct drbd_socket { + struct drbd_work_queue work; + struct mutex mutex; + struct socket *socket; + /* this way we get our + * send/receive buffers off the stack */ + union p_polymorph sbuf; + union p_polymorph rbuf; +}; + +struct drbd_md { + u64 md_offset; /* sector offset to 'super' block */ + + u64 la_size_sect; /* last agreed size, unit sectors */ + u64 uuid[UI_SIZE]; + u64 device_uuid; + u32 flags; + u32 md_size_sect; + + s32 al_offset; /* signed relative sector offset to al area */ + s32 bm_offset; /* signed relative sector offset to bitmap */ + + /* u32 al_nr_extents; important for restoring the AL + * is stored into sync_conf.al_extents, which in turn + * gets applied to act_log->nr_elements + */ +}; + +/* for sync_conf and other types... */ +#define NL_PACKET(name, number, fields) struct name { fields }; +#define NL_INTEGER(pn,pr,member) int member; +#define NL_INT64(pn,pr,member) __u64 member; +#define NL_BIT(pn,pr,member) unsigned member:1; +#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len; +#include "linux/drbd_nl.h" + +struct drbd_backing_dev { + struct block_device *backing_bdev; + struct block_device *md_bdev; + struct file *lo_file; + struct file *md_file; + struct drbd_md md; + struct disk_conf dc; /* The user provided config... */ + sector_t known_size; /* last known size of that backing device */ +}; + +struct drbd_md_io { + struct drbd_conf *mdev; + struct completion event; + int error; +}; + +struct bm_io_work { + struct drbd_work w; + char *why; + int (*io_fn)(struct drbd_conf *mdev); + void (*done)(struct drbd_conf *mdev, int rv); +}; + +enum write_ordering_e { + WO_none, + WO_drain_io, + WO_bdev_flush, + WO_bio_barrier +}; + +struct drbd_conf { + /* things that are stored as / read from meta data on disk */ + unsigned long flags; + + /* configured by drbdsetup */ + struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */ + struct syncer_conf sync_conf; + struct drbd_backing_dev *ldev __protected_by(local); + + sector_t p_size; /* partner's disk size */ + struct request_queue *rq_queue; + struct block_device *this_bdev; + struct gendisk *vdisk; + + struct drbd_socket data; /* data/barrier/cstate/parameter packets */ + struct drbd_socket meta; /* ping/ack (metadata) packets */ + int agreed_pro_version; /* actually used protocol version */ + unsigned long last_received; /* in jiffies, either socket */ + unsigned int ko_count; + struct drbd_work resync_work, + unplug_work, + md_sync_work; + struct timer_list resync_timer; + struct timer_list md_sync_timer; + + /* Used after attach while negotiating new disk state. */ + union drbd_state new_state_tmp; + + union drbd_state state; + wait_queue_head_t misc_wait; + wait_queue_head_t state_wait; /* upon each state change. */ + unsigned int send_cnt; + unsigned int recv_cnt; + unsigned int read_cnt; + unsigned int writ_cnt; + unsigned int al_writ_cnt; + unsigned int bm_writ_cnt; + atomic_t ap_bio_cnt; /* Requests we need to complete */ + atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */ + atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ + atomic_t unacked_cnt; /* Need to send replys for */ + atomic_t local_cnt; /* Waiting for local completion */ + atomic_t net_cnt; /* Users of net_conf */ + spinlock_t req_lock; + struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */ + struct drbd_tl_epoch *newest_tle; + struct drbd_tl_epoch *oldest_tle; + struct list_head out_of_sequence_requests; + struct hlist_head *tl_hash; + unsigned int tl_hash_s; + + /* blocks to sync in this run [unit BM_BLOCK_SIZE] */ + unsigned long rs_total; + /* number of sync IOs that failed in this run */ + unsigned long rs_failed; + /* Syncer's start time [unit jiffies] */ + unsigned long rs_start; + /* cumulated time in PausedSyncX state [unit jiffies] */ + unsigned long rs_paused; + /* block not up-to-date at mark [unit BM_BLOCK_SIZE] */ + unsigned long rs_mark_left; + /* marks's time [unit jiffies] */ + unsigned long rs_mark_time; + /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */ + unsigned long rs_same_csum; + + /* where does the admin want us to start? (sector) */ + sector_t ov_start_sector; + /* where are we now? (sector) */ + sector_t ov_position; + /* Start sector of out of sync range (to merge printk reporting). */ + sector_t ov_last_oos_start; + /* size of out-of-sync range in sectors. */ + sector_t ov_last_oos_size; + unsigned long ov_left; /* in bits */ + struct crypto_hash *csums_tfm; + struct crypto_hash *verify_tfm; + + struct drbd_thread receiver; + struct drbd_thread worker; + struct drbd_thread asender; + struct drbd_bitmap *bitmap; + unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */ + + /* Used to track operations of resync... */ + struct lru_cache *resync; + /* Number of locked elements in resync LRU */ + unsigned int resync_locked; + /* resync extent number waiting for application requests */ + unsigned int resync_wenr; + + int open_cnt; + u64 *p_uuid; + struct drbd_epoch *current_epoch; + spinlock_t epoch_lock; + unsigned int epochs; + enum write_ordering_e write_ordering; + struct list_head active_ee; /* IO in progress */ + struct list_head sync_ee; /* IO in progress */ + struct list_head done_ee; /* send ack */ + struct list_head read_ee; /* IO in progress */ + struct list_head net_ee; /* zero-copy network send in progress */ + struct hlist_head *ee_hash; /* is proteced by req_lock! */ + unsigned int ee_hash_s; + + /* this one is protected by ee_lock, single thread */ + struct drbd_epoch_entry *last_write_w_barrier; + + int next_barrier_nr; + struct hlist_head *app_reads_hash; /* is proteced by req_lock */ + struct list_head resync_reads; + atomic_t pp_in_use; + wait_queue_head_t ee_wait; + struct page *md_io_page; /* one page buffer for md_io */ + struct page *md_io_tmpp; /* for logical_block_size != 512 */ + struct mutex md_io_mutex; /* protects the md_io_buffer */ + spinlock_t al_lock; + wait_queue_head_t al_wait; + struct lru_cache *act_log; /* activity log */ + unsigned int al_tr_number; + int al_tr_cycle; + int al_tr_pos; /* position of the next transaction in the journal */ + struct crypto_hash *cram_hmac_tfm; + struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */ + struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */ + void *int_dig_out; + void *int_dig_in; + void *int_dig_vv; + wait_queue_head_t seq_wait; + atomic_t packet_seq; + unsigned int peer_seq; + spinlock_t peer_seq_lock; + unsigned int minor; + unsigned long comm_bm_set; /* communicated number of set bits. */ + cpumask_var_t cpu_mask; + struct bm_io_work bm_io_work; + u64 ed_uuid; /* UUID of the exposed data */ + struct mutex state_mutex; + char congestion_reason; /* Why we where congested... */ +}; + +static inline struct drbd_conf *minor_to_mdev(unsigned int minor) +{ + struct drbd_conf *mdev; + + mdev = minor < minor_count ? minor_table[minor] : NULL; + + return mdev; +} + +static inline unsigned int mdev_to_minor(struct drbd_conf *mdev) +{ + return mdev->minor; +} + +/* returns 1 if it was successfull, + * returns 0 if there was no data socket. + * so wherever you are going to use the data.socket, e.g. do + * if (!drbd_get_data_sock(mdev)) + * return 0; + * CODE(); + * drbd_put_data_sock(mdev); + */ +static inline int drbd_get_data_sock(struct drbd_conf *mdev) +{ + mutex_lock(&mdev->data.mutex); + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (unlikely(mdev->data.socket == NULL)) { + mutex_unlock(&mdev->data.mutex); + return 0; + } + return 1; +} + +static inline void drbd_put_data_sock(struct drbd_conf *mdev) +{ + mutex_unlock(&mdev->data.mutex); +} + +/* + * function declarations + *************************/ + +/* drbd_main.c */ + +enum chg_state_flags { + CS_HARD = 1, + CS_VERBOSE = 2, + CS_WAIT_COMPLETE = 4, + CS_SERIALIZE = 8, + CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE, +}; + +extern void drbd_init_set_defaults(struct drbd_conf *mdev); +extern int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, + union drbd_state mask, union drbd_state val); +extern void drbd_force_state(struct drbd_conf *, union drbd_state, + union drbd_state); +extern int _drbd_request_state(struct drbd_conf *, union drbd_state, + union drbd_state, enum chg_state_flags); +extern int __drbd_set_state(struct drbd_conf *, union drbd_state, + enum chg_state_flags, struct completion *done); +extern void print_st_err(struct drbd_conf *, union drbd_state, + union drbd_state, int); +extern int drbd_thread_start(struct drbd_thread *thi); +extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait); +#ifdef CONFIG_SMP +extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev); +extern void drbd_calc_cpu_mask(struct drbd_conf *mdev); +#else +#define drbd_thread_current_set_cpu(A) ({}) +#define drbd_calc_cpu_mask(A) ({}) +#endif +extern void drbd_free_resources(struct drbd_conf *mdev); +extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, + unsigned int set_size); +extern void tl_clear(struct drbd_conf *mdev); +extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); +extern void drbd_free_sock(struct drbd_conf *mdev); +extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, unsigned msg_flags); +extern int drbd_send_protocol(struct drbd_conf *mdev); +extern int drbd_send_uuids(struct drbd_conf *mdev); +extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); +extern int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val); +extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply); +extern int _drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev); +extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, + enum drbd_packets cmd, struct p_header *h, + size_t size, unsigned msg_flags); +#define USE_DATA_SOCKET 1 +#define USE_META_SOCKET 0 +extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, + enum drbd_packets cmd, struct p_header *h, + size_t size); +extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, + char *data, size_t size); +extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc); +extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, + u32 set_size); +extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e); +extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_block_req *rp); +extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_data *dp); +extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, + sector_t sector, int blksize, u64 block_id); +extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e); +extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req); +extern int _drbd_send_barrier(struct drbd_conf *mdev, + struct drbd_tl_epoch *barrier); +extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd, + sector_t sector, int size, u64 block_id); +extern int drbd_send_drequest_csum(struct drbd_conf *mdev, + sector_t sector,int size, + void *digest, int digest_size, + enum drbd_packets cmd); +extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size); + +extern int drbd_send_bitmap(struct drbd_conf *mdev); +extern int _drbd_send_bitmap(struct drbd_conf *mdev); +extern int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode); +extern void drbd_free_bc(struct drbd_backing_dev *ldev); +extern void drbd_mdev_cleanup(struct drbd_conf *mdev); + +/* drbd_meta-data.c (still in drbd_main.c) */ +extern void drbd_md_sync(struct drbd_conf *mdev); +extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev); +/* maybe define them below as inline? */ +extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); +extern void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local); +extern void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); +extern void _drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local); +extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local); +extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local); +extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local); +extern int drbd_md_test_flag(struct drbd_backing_dev *, int); +extern void drbd_md_mark_dirty(struct drbd_conf *mdev); +extern void drbd_queue_bitmap_io(struct drbd_conf *mdev, + int (*io_fn)(struct drbd_conf *), + void (*done)(struct drbd_conf *, int), + char *why); +extern int drbd_bmio_set_n_write(struct drbd_conf *mdev); +extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev); +extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why); + + +/* Meta data layout + We reserve a 128MB Block (4k aligned) + * either at the end of the backing device + * or on a seperate meta data device. */ + +#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */ +/* The following numbers are sectors */ +#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */ +#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */ +/* Allows up to about 3.8TB */ +#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE) + +/* Since the smalles IO unit is usually 512 byte */ +#define MD_SECTOR_SHIFT 9 +#define MD_SECTOR_SIZE (1< we need 32 KB bitmap. + * Bit 0 ==> local node thinks this block is binary identical on both nodes + * Bit 1 ==> local node thinks this block needs to be synced. + */ + +#define BM_BLOCK_SHIFT 12 /* 4k per bit */ +#define BM_BLOCK_SIZE (1<>(BM_BLOCK_SHIFT-9)) +#define BM_BIT_TO_SECT(x) ((sector_t)(x)<<(BM_BLOCK_SHIFT-9)) +#define BM_SECT_PER_BIT BM_BIT_TO_SECT(1) + +/* bit to represented kilo byte conversion */ +#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10)) + +/* in which _bitmap_ extent (resp. sector) the bit for a certain + * _storage_ sector is located in */ +#define BM_SECT_TO_EXT(x) ((x)>>(BM_EXT_SHIFT-9)) + +/* how much _storage_ sectors we have per bitmap sector */ +#define BM_EXT_TO_SECT(x) ((sector_t)(x) << (BM_EXT_SHIFT-9)) +#define BM_SECT_PER_EXT BM_EXT_TO_SECT(1) + +/* in one sector of the bitmap, we have this many activity_log extents. */ +#define AL_EXT_PER_BM_SECT (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT)) +#define BM_WORDS_PER_AL_EXT (1 << (AL_EXTENT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL)) + +#define BM_BLOCKS_PER_BM_EXT_B (BM_EXT_SHIFT - BM_BLOCK_SHIFT) +#define BM_BLOCKS_PER_BM_EXT_MASK ((1<ov_last_oos_size) { + dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n", + (unsigned long long)mdev->ov_last_oos_start, + (unsigned long)mdev->ov_last_oos_size); + } + mdev->ov_last_oos_size=0; +} + + +extern void drbd_csum(struct drbd_conf *, struct crypto_hash *, struct bio *, void *); +/* worker callbacks */ +extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int); +extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int); +extern int w_resync_inactive(struct drbd_conf *, struct drbd_work *, int); +extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int); +extern int w_io_error(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int); +extern int w_make_resync_request(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int); +extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int); +extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int); +extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int); + +extern void resync_timer_fn(unsigned long data); + +/* drbd_receiver.c */ +extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list); +extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + gfp_t gfp_mask) __must_hold(local); +extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e); +extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head); +extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, + struct list_head *head); +extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled); +extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed); +extern void drbd_flush_workqueue(struct drbd_conf *mdev); + +/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to + * mess with get_fs/set_fs, we know we are KERNEL_DS always. */ +static inline int drbd_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + int err; + if (level == SOL_SOCKET) + err = sock_setsockopt(sock, level, optname, optval, optlen); + else + err = sock->ops->setsockopt(sock, level, optname, optval, + optlen); + return err; +} + +static inline void drbd_tcp_cork(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (char __user *)&val, sizeof(val)); +} + +static inline void drbd_tcp_uncork(struct socket *sock) +{ + int __user val = 0; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (char __user *)&val, sizeof(val)); +} + +static inline void drbd_tcp_nodelay(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val)); +} + +static inline void drbd_tcp_quickack(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (char __user *)&val, sizeof(val)); +} + +void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo); + +/* drbd_proc.c */ +extern struct proc_dir_entry *drbd_proc; +extern struct file_operations drbd_proc_fops; +extern const char *drbd_conn_str(enum drbd_conns s); +extern const char *drbd_role_str(enum drbd_role s); + +/* drbd_actlog.c */ +extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector); +extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector); +extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector); +extern void drbd_rs_cancel_all(struct drbd_conf *mdev); +extern int drbd_rs_del_all(struct drbd_conf *mdev); +extern void drbd_rs_failed_io(struct drbd_conf *mdev, + sector_t sector, int size); +extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *); +extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, + int size, const char *file, const unsigned int line); +#define drbd_set_in_sync(mdev, sector, size) \ + __drbd_set_in_sync(mdev, sector, size, __FILE__, __LINE__) +extern void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, + int size, const char *file, const unsigned int line); +#define drbd_set_out_of_sync(mdev, sector, size) \ + __drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__) +extern void drbd_al_apply_to_bm(struct drbd_conf *mdev); +extern void drbd_al_to_on_disk_bm(struct drbd_conf *mdev); +extern void drbd_al_shrink(struct drbd_conf *mdev); + + +/* drbd_nl.c */ + +void drbd_nl_cleanup(void); +int __init drbd_nl_init(void); +void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state); +void drbd_bcast_sync_progress(struct drbd_conf *mdev); +void drbd_bcast_ee(struct drbd_conf *mdev, + const char *reason, const int dgs, + const char* seen_hash, const char* calc_hash, + const struct drbd_epoch_entry* e); + + +/** + * DOC: DRBD State macros + * + * These macros are used to express state changes in easily readable form. + * + * The NS macros expand to a mask and a value, that can be bit ored onto the + * current state as soon as the spinlock (req_lock) was taken. + * + * The _NS macros are used for state functions that get called with the + * spinlock. These macros expand directly to the new state value. + * + * Besides the basic forms NS() and _NS() additional _?NS[23] are defined + * to express state changes that affect more than one aspect of the state. + * + * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY) + * Means that the network connection was established and that the peer + * is in secondary role. + */ +#define role_MASK R_MASK +#define peer_MASK R_MASK +#define disk_MASK D_MASK +#define pdsk_MASK D_MASK +#define conn_MASK C_MASK +#define susp_MASK 1 +#define user_isp_MASK 1 +#define aftr_isp_MASK 1 + +#define NS(T, S) \ + ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T = (S); val; }) +#define NS2(T1, S1, T2, S2) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val; }) +#define NS3(T1, S1, T2, S2, T3, S3) \ + ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \ + mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \ + ({ union drbd_state val; val.i = 0; val.T1 = (S1); \ + val.T2 = (S2); val.T3 = (S3); val; }) + +#define _NS(D, T, S) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; }) +#define _NS2(D, T1, S1, T2, S2) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns; }) +#define _NS3(D, T1, S1, T2, S2, T3, S3) \ + D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns; }) + +/* + * inline helper functions + *************************/ + +static inline void drbd_state_lock(struct drbd_conf *mdev) +{ + wait_event(mdev->misc_wait, + !test_and_set_bit(CLUSTER_ST_CHANGE, &mdev->flags)); +} + +static inline void drbd_state_unlock(struct drbd_conf *mdev) +{ + clear_bit(CLUSTER_ST_CHANGE, &mdev->flags); + wake_up(&mdev->misc_wait); +} + +static inline int _drbd_set_state(struct drbd_conf *mdev, + union drbd_state ns, enum chg_state_flags flags, + struct completion *done) +{ + int rv; + + read_lock(&global_state_lock); + rv = __drbd_set_state(mdev, ns, flags, done); + read_unlock(&global_state_lock); + + return rv; +} + +/** + * drbd_request_state() - Reqest a state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * + * This is the most graceful way of requesting a state change. It is verbose + * quite verbose in case the state change is not possible, and all those + * state changes are globally serialized. + */ +static inline int drbd_request_state(struct drbd_conf *mdev, + union drbd_state mask, + union drbd_state val) +{ + return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); +} + +#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) +static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) +{ + switch (mdev->ldev->dc.on_io_error) { + case EP_PASS_ON: + if (!forcedetach) { + if (printk_ratelimit()) + dev_err(DEV, "Local IO failed in %s." + "Passing error on...\n", where); + break; + } + /* NOTE fall through to detach case if forcedetach set */ + case EP_DETACH: + case EP_CALL_HELPER: + if (mdev->state.disk > D_FAILED) { + _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); + dev_err(DEV, "Local IO failed in %s." + "Detaching...\n", where); + } + break; + } +} + +/** + * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers + * @mdev: DRBD device. + * @error: Error code passed to the IO completion callback + * @forcedetach: Force detach. I.e. the error happened while accessing the meta data + * + * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED) + */ +#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) +static inline void drbd_chk_io_error_(struct drbd_conf *mdev, + int error, int forcedetach, const char *where) +{ + if (error) { + unsigned long flags; + spin_lock_irqsave(&mdev->req_lock, flags); + __drbd_chk_io_error_(mdev, forcedetach, where); + spin_unlock_irqrestore(&mdev->req_lock, flags); + } +} + + +/** + * drbd_md_first_sector() - Returns the first sector number of the meta data area + * @bdev: Meta data block device. + * + * BTW, for internal meta data, this happens to be the maximum capacity + * we could agree upon with our peer node. + */ +static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + bdev->md.bm_offset; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset; + } +} + +/** + * drbd_md_last_sector() - Return the last sector number of the meta data area + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + return bdev->md.md_offset + MD_AL_OFFSET - 1; + case DRBD_MD_INDEX_FLEX_EXT: + default: + return bdev->md.md_offset + bdev->md.md_size_sect; + } +} + +/* Returns the number of 512 byte sectors of the device */ +static inline sector_t drbd_get_capacity(struct block_device *bdev) +{ + /* return bdev ? get_capacity(bdev->bd_disk) : 0; */ + return bdev ? bdev->bd_inode->i_size >> 9 : 0; +} + +/** + * drbd_get_max_capacity() - Returns the capacity we announce to out peer + * @bdev: Meta data block device. + * + * returns the capacity we announce to out peer. we clip ourselves at the + * various MAX_SECTORS, because if we don't, current implementation will + * oops sooner or later + */ +static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) +{ + sector_t s; + switch (bdev->dc.meta_dev_idx) { + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + s = drbd_get_capacity(bdev->backing_bdev) + ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_md_first_sector(bdev)) + : 0; + break; + case DRBD_MD_INDEX_FLEX_EXT: + s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_get_capacity(bdev->backing_bdev)); + /* clip at maximum size the meta device can support */ + s = min_t(sector_t, s, + BM_EXT_TO_SECT(bdev->md.md_size_sect + - bdev->md.bm_offset)); + break; + default: + s = min_t(sector_t, DRBD_MAX_SECTORS, + drbd_get_capacity(bdev->backing_bdev)); + } + return s; +} + +/** + * drbd_md_ss__() - Return the sector number of our meta data super block + * @mdev: DRBD device. + * @bdev: Meta data block device. + */ +static inline sector_t drbd_md_ss__(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev) +{ + switch (bdev->dc.meta_dev_idx) { + default: /* external, some index */ + return MD_RESERVED_SECT * bdev->dc.meta_dev_idx; + case DRBD_MD_INDEX_INTERNAL: + /* with drbd08, internal meta data is always "flexible" */ + case DRBD_MD_INDEX_FLEX_INT: + /* sizeof(struct md_on_disk_07) == 4k + * position: last 4k aligned block of 4k size */ + if (!bdev->backing_bdev) { + if (__ratelimit(&drbd_ratelimit_state)) { + dev_err(DEV, "bdev->backing_bdev==NULL\n"); + dump_stack(); + } + return 0; + } + return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) + - MD_AL_OFFSET; + case DRBD_MD_INDEX_FLEX_EXT: + return 0; + } +} + +static inline void +_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + list_add_tail(&w->list, &q->q); + up(&q->s); +} + +static inline void +drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + list_add(&w->list, &q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock, flags); +} + +static inline void +drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w) +{ + unsigned long flags; + spin_lock_irqsave(&q->q_lock, flags); + list_add_tail(&w->list, &q->q); + up(&q->s); /* within the spinlock, + see comment near end of drbd_worker() */ + spin_unlock_irqrestore(&q->q_lock, flags); +} + +static inline void wake_asender(struct drbd_conf *mdev) +{ + if (test_bit(SIGNAL_ASENDER, &mdev->flags)) + force_sig(DRBD_SIG, mdev->asender.task); +} + +static inline void request_ping(struct drbd_conf *mdev) +{ + set_bit(SEND_PING, &mdev->flags); + wake_asender(mdev); +} + +static inline int drbd_send_short_cmd(struct drbd_conf *mdev, + enum drbd_packets cmd) +{ + struct p_header h; + return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h)); +} + +static inline int drbd_send_ping(struct drbd_conf *mdev) +{ + struct p_header h; + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h)); +} + +static inline int drbd_send_ping_ack(struct drbd_conf *mdev) +{ + struct p_header h; + return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h)); +} + +static inline void drbd_thread_stop(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, FALSE, TRUE); +} + +static inline void drbd_thread_stop_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, FALSE, FALSE); +} + +static inline void drbd_thread_restart_nowait(struct drbd_thread *thi) +{ + _drbd_thread_stop(thi, TRUE, FALSE); +} + +/* counts how many answer packets packets we expect from our peer, + * for either explicit application requests, + * or implicit barrier packets as necessary. + * increased: + * w_send_barrier + * _req_mod(req, queue_for_net_write or queue_for_net_read); + * it is much easier and equally valid to count what we queue for the + * worker, even before it actually was queued or send. + * (drbd_make_request_common; recovery path on read io-error) + * decreased: + * got_BarrierAck (respective tl_clear, tl_clear_barrier) + * _req_mod(req, data_received) + * [from receive_DataReply] + * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked) + * [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)] + * for some reason it is NOT decreased in got_NegAck, + * but in the resulting cleanup code from report_params. + * we should try to remember the reason for that... + * _req_mod(req, send_failed or send_canceled) + * _req_mod(req, connection_lost_while_pending) + * [from tl_clear_barrier] + */ +static inline void inc_ap_pending(struct drbd_conf *mdev) +{ + atomic_inc(&mdev->ap_pending_cnt); +} + +#define ERR_IF_CNT_IS_NEGATIVE(which) \ + if (atomic_read(&mdev->which) < 0) \ + dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \ + __func__ , __LINE__ , \ + atomic_read(&mdev->which)) + +#define dec_ap_pending(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \ + wake_up(&mdev->misc_wait); \ + ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0) + +/* counts how many resync-related answers we still expect from the peer + * increase decrease + * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY) + * C_SYNC_SOURCE sends P_RS_DATA_REPLY (and expects P_WRITE_ACK whith ID_SYNCER) + * (or P_NEG_ACK with ID_SYNCER) + */ +static inline void inc_rs_pending(struct drbd_conf *mdev) +{ + atomic_inc(&mdev->rs_pending_cnt); +} + +#define dec_rs_pending(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_dec(&mdev->rs_pending_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0) + +/* counts how many answers we still need to send to the peer. + * increased on + * receive_Data unless protocol A; + * we need to send a P_RECV_ACK (proto B) + * or P_WRITE_ACK (proto C) + * receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK + * receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA + * receive_Barrier_* we need to send a P_BARRIER_ACK + */ +static inline void inc_unacked(struct drbd_conf *mdev) +{ + atomic_inc(&mdev->unacked_cnt); +} + +#define dec_unacked(mdev) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_dec(&mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + +#define sub_unacked(mdev, n) do { \ + typecheck(struct drbd_conf *, mdev); \ + atomic_sub(n, &mdev->unacked_cnt); \ + ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0) + + +static inline void put_net_conf(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->net_cnt)) + wake_up(&mdev->misc_wait); +} + +/** + * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there + * @mdev: DRBD device. + * + * You have to call put_net_conf() when finished working with mdev->net_conf. + */ +static inline int get_net_conf(struct drbd_conf *mdev) +{ + int have_net_conf; + + atomic_inc(&mdev->net_cnt); + have_net_conf = mdev->state.conn >= C_UNCONNECTED; + if (!have_net_conf) + put_net_conf(mdev); + return have_net_conf; +} + +/** + * get_ldev() - Increase the ref count on mdev->ldev. Returns 0 if there is no ldev + * @M: DRBD device. + * + * You have to call put_ldev() when finished working with mdev->ldev. + */ +#define get_ldev(M) __cond_lock(local, _get_ldev_if_state(M,D_INCONSISTENT)) +#define get_ldev_if_state(M,MINS) __cond_lock(local, _get_ldev_if_state(M,MINS)) + +static inline void put_ldev(struct drbd_conf *mdev) +{ + __release(local); + if (atomic_dec_and_test(&mdev->local_cnt)) + wake_up(&mdev->misc_wait); + D_ASSERT(atomic_read(&mdev->local_cnt) >= 0); +} + +#ifndef __CHECKER__ +static inline int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins); + if (!io_allowed) + put_ldev(mdev); + return io_allowed; +} +#else +extern int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins); +#endif + +/* you must have an "get_ldev" reference */ +static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, + unsigned long *bits_left, unsigned int *per_mil_done) +{ + /* + * this is to break it at compile time when we change that + * (we may feel 4TB maximum storage per drbd is not enough) + */ + typecheck(unsigned long, mdev->rs_total); + + /* note: both rs_total and rs_left are in bits, i.e. in + * units of BM_BLOCK_SIZE. + * for the percentage, we don't care. */ + + *bits_left = drbd_bm_total_weight(mdev) - mdev->rs_failed; + /* >> 10 to prevent overflow, + * +1 to prevent division by zero */ + if (*bits_left > mdev->rs_total) { + /* doh. maybe a logic bug somewhere. + * may also be just a race condition + * between this and a disconnect during sync. + * for now, just prevent in-kernel buffer overflow. + */ + smp_rmb(); + dev_warn(DEV, "cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", + drbd_conn_str(mdev->state.conn), + *bits_left, mdev->rs_total, mdev->rs_failed); + *per_mil_done = 0; + } else { + /* make sure the calculation happens in long context */ + unsigned long tmp = 1000UL - + (*bits_left >> 10)*1000UL + / ((mdev->rs_total >> 10) + 1UL); + *per_mil_done = tmp; + } +} + + +/* this throttles on-the-fly application requests + * according to max_buffers settings; + * maybe re-implement using semaphores? */ +static inline int drbd_get_max_buffers(struct drbd_conf *mdev) +{ + int mxb = 1000000; /* arbitrary limit on open requests */ + if (get_net_conf(mdev)) { + mxb = mdev->net_conf->max_buffers; + put_net_conf(mdev); + } + return mxb; +} + +static inline int drbd_state_is_stable(union drbd_state s) +{ + + /* DO NOT add a default clause, we want the compiler to warn us + * for any newly introduced state we may have forgotten to add here */ + + switch ((enum drbd_conns)s.conn) { + /* new io only accepted when there is no connection, ... */ + case C_STANDALONE: + case C_WF_CONNECTION: + /* ... or there is a well established connection. */ + case C_CONNECTED: + case C_SYNC_SOURCE: + case C_SYNC_TARGET: + case C_VERIFY_S: + case C_VERIFY_T: + case C_PAUSED_SYNC_S: + case C_PAUSED_SYNC_T: + /* maybe stable, look at the disk state */ + break; + + /* no new io accepted during tansitional states + * like handshake or teardown */ + case C_DISCONNECTING: + case C_UNCONNECTED: + case C_TIMEOUT: + case C_BROKEN_PIPE: + case C_NETWORK_FAILURE: + case C_PROTOCOL_ERROR: + case C_TEAR_DOWN: + case C_WF_REPORT_PARAMS: + case C_STARTING_SYNC_S: + case C_STARTING_SYNC_T: + case C_WF_BITMAP_S: + case C_WF_BITMAP_T: + case C_WF_SYNC_UUID: + case C_MASK: + /* not "stable" */ + return 0; + } + + switch ((enum drbd_disk_state)s.disk) { + case D_DISKLESS: + case D_INCONSISTENT: + case D_OUTDATED: + case D_CONSISTENT: + case D_UP_TO_DATE: + /* disk state is stable as well. */ + break; + + /* no new io accepted during tansitional states */ + case D_ATTACHING: + case D_FAILED: + case D_NEGOTIATING: + case D_UNKNOWN: + case D_MASK: + /* not "stable" */ + return 0; + } + + return 1; +} + +static inline int __inc_ap_bio_cond(struct drbd_conf *mdev) +{ + int mxb = drbd_get_max_buffers(mdev); + + if (mdev->state.susp) + return 0; + if (test_bit(SUSPEND_IO, &mdev->flags)) + return 0; + + /* to avoid potential deadlock or bitmap corruption, + * in various places, we only allow new application io + * to start during "stable" states. */ + + /* no new io accepted when attaching or detaching the disk */ + if (!drbd_state_is_stable(mdev->state)) + return 0; + + /* since some older kernels don't have atomic_add_unless, + * and we are within the spinlock anyways, we have this workaround. */ + if (atomic_read(&mdev->ap_bio_cnt) > mxb) + return 0; + if (test_bit(BITMAP_IO, &mdev->flags)) + return 0; + return 1; +} + +/* I'd like to use wait_event_lock_irq, + * but I'm not sure when it got introduced, + * and not sure when it has 3 or 4 arguments */ +static inline void inc_ap_bio(struct drbd_conf *mdev, int one_or_two) +{ + /* compare with after_state_ch, + * os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S */ + DEFINE_WAIT(wait); + + /* we wait here + * as long as the device is suspended + * until the bitmap is no longer on the fly during connection + * handshake as long as we would exeed the max_buffer limit. + * + * to avoid races with the reconnect code, + * we need to atomic_inc within the spinlock. */ + + spin_lock_irq(&mdev->req_lock); + while (!__inc_ap_bio_cond(mdev)) { + prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + schedule(); + finish_wait(&mdev->misc_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } + atomic_add(one_or_two, &mdev->ap_bio_cnt); + spin_unlock_irq(&mdev->req_lock); +} + +static inline void dec_ap_bio(struct drbd_conf *mdev) +{ + int mxb = drbd_get_max_buffers(mdev); + int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); + + D_ASSERT(ap_bio >= 0); + /* this currently does wake_up for every dec_ap_bio! + * maybe rather introduce some type of hysteresis? + * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ + if (ap_bio < mxb) + wake_up(&mdev->misc_wait); + if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } +} + +static inline void drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) +{ + mdev->ed_uuid = val; +} + +static inline int seq_cmp(u32 a, u32 b) +{ + /* we assume wrap around at 32bit. + * for wrap around at 24bit (old atomic_t), + * we'd have to + * a <<= 8; b <<= 8; + */ + return (s32)(a) - (s32)(b); +} +#define seq_lt(a, b) (seq_cmp((a), (b)) < 0) +#define seq_gt(a, b) (seq_cmp((a), (b)) > 0) +#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0) +#define seq_le(a, b) (seq_cmp((a), (b)) <= 0) +/* CAUTION: please no side effects in arguments! */ +#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b))) + +static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq) +{ + unsigned int m; + spin_lock(&mdev->peer_seq_lock); + m = seq_max(mdev->peer_seq, new_seq); + mdev->peer_seq = m; + spin_unlock(&mdev->peer_seq_lock); + if (m == new_seq) + wake_up(&mdev->seq_wait); +} + +static inline void drbd_update_congested(struct drbd_conf *mdev) +{ + struct sock *sk = mdev->data.socket->sk; + if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5) + set_bit(NET_CONGESTED, &mdev->flags); +} + +static inline int drbd_queue_order_type(struct drbd_conf *mdev) +{ + /* sorry, we currently have no working implementation + * of distributed TCQ stuff */ +#ifndef QUEUE_ORDERED_NONE +#define QUEUE_ORDERED_NONE 0 +#endif + return QUEUE_ORDERED_NONE; +} + +static inline void drbd_blk_run_queue(struct request_queue *q) +{ + if (q && q->unplug_fn) + q->unplug_fn(q); +} + +static inline void drbd_kick_lo(struct drbd_conf *mdev) +{ + if (get_ldev(mdev)) { + drbd_blk_run_queue(bdev_get_queue(mdev->ldev->backing_bdev)); + put_ldev(mdev); + } +} + +static inline void drbd_md_flush(struct drbd_conf *mdev) +{ + int r; + + if (test_bit(MD_NO_BARRIER, &mdev->flags)) + return; + + r = blkdev_issue_flush(mdev->ldev->md_bdev, NULL); + if (r) { + set_bit(MD_NO_BARRIER, &mdev->flags); + dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r); + } +} + +#endif diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c new file mode 100644 index 00000000000..edf0b8031e6 --- /dev/null +++ b/drivers/block/drbd/drbd_main.c @@ -0,0 +1,3735 @@ +/* + drbd.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev + from Logicworks, Inc. for making SDP replication support possible. + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define __KERNEL_SYSCALLS__ +#include +#include + +#include +#include "drbd_int.h" +#include "drbd_tracing.h" +#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ + +#include "drbd_vli.h" + +struct after_state_chg_work { + struct drbd_work w; + union drbd_state os; + union drbd_state ns; + enum chg_state_flags flags; + struct completion *done; +}; + +int drbdd_init(struct drbd_thread *); +int drbd_worker(struct drbd_thread *); +int drbd_asender(struct drbd_thread *); + +int drbd_init(void); +static int drbd_open(struct block_device *bdev, fmode_t mode); +static int drbd_release(struct gendisk *gd, fmode_t mode); +static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags); +static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void md_sync_timer_fn(unsigned long data); +static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); + +DEFINE_TRACE(drbd_unplug); +DEFINE_TRACE(drbd_uuid); +DEFINE_TRACE(drbd_ee); +DEFINE_TRACE(drbd_packet); +DEFINE_TRACE(drbd_md_io); +DEFINE_TRACE(drbd_epoch); +DEFINE_TRACE(drbd_netlink); +DEFINE_TRACE(drbd_actlog); +DEFINE_TRACE(drbd_bio); +DEFINE_TRACE(_drbd_resync); +DEFINE_TRACE(drbd_req); + +MODULE_AUTHOR("Philipp Reisner , " + "Lars Ellenberg "); +MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); +MODULE_VERSION(REL_VERSION); +MODULE_LICENSE("GPL"); +MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices (1-255)"); +MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR); + +#include +/* allow_open_on_secondary */ +MODULE_PARM_DESC(allow_oos, "DONT USE!"); +/* thanks to these macros, if compiled into the kernel (not-module), + * this becomes the boot parameter drbd.minor_count */ +module_param(minor_count, uint, 0444); +module_param(disable_sendpage, bool, 0644); +module_param(allow_oos, bool, 0); +module_param(cn_idx, uint, 0444); +module_param(proc_details, int, 0644); + +#ifdef CONFIG_DRBD_FAULT_INJECTION +int enable_faults; +int fault_rate; +static int fault_count; +int fault_devs; +/* bitmap of enabled faults */ +module_param(enable_faults, int, 0664); +/* fault rate % value - applies to all enabled faults */ +module_param(fault_rate, int, 0664); +/* count of faults inserted */ +module_param(fault_count, int, 0664); +/* bitmap of devices to insert faults on */ +module_param(fault_devs, int, 0644); +#endif + +/* module parameter, defined */ +unsigned int minor_count = 32; +int disable_sendpage; +int allow_oos; +unsigned int cn_idx = CN_IDX_DRBD; +int proc_details; /* Detail level in proc drbd*/ + +/* Module parameter for setting the user mode helper program + * to run. Default is /sbin/drbdadm */ +char usermode_helper[80] = "/sbin/drbdadm"; + +module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); + +/* in 2.6.x, our device mapping and config info contains our virtual gendisks + * as member "struct gendisk *vdisk;" + */ +struct drbd_conf **minor_table; + +struct kmem_cache *drbd_request_cache; +struct kmem_cache *drbd_ee_cache; /* epoch entries */ +struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ +struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ +mempool_t *drbd_request_mempool; +mempool_t *drbd_ee_mempool; + +/* I do not use a standard mempool, because: + 1) I want to hand out the pre-allocated objects first. + 2) I want to be able to interrupt sleeping allocation with a signal. + Note: This is a single linked list, the next pointer is the private + member of struct page. + */ +struct page *drbd_pp_pool; +spinlock_t drbd_pp_lock; +int drbd_pp_vacant; +wait_queue_head_t drbd_pp_wait; + +DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5); + +static struct block_device_operations drbd_ops = { + .owner = THIS_MODULE, + .open = drbd_open, + .release = drbd_release, +}; + +#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) + +#ifdef __CHECKER__ +/* When checking with sparse, and this is an inline function, sparse will + give tons of false positives. When this is a real functions sparse works. + */ +int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins); + if (!io_allowed) { + if (atomic_dec_and_test(&mdev->local_cnt)) + wake_up(&mdev->misc_wait); + } + return io_allowed; +} + +#endif + +/** + * DOC: The transfer log + * + * The transfer log is a single linked list of &struct drbd_tl_epoch objects. + * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail + * of the list. There is always at least one &struct drbd_tl_epoch object. + * + * Each &struct drbd_tl_epoch has a circular double linked list of requests + * attached. + */ +static int tl_init(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + + /* during device minor initialization, we may well use GFP_KERNEL */ + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL); + if (!b) + return 0; + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->next = NULL; + b->br_number = 4711; + b->n_req = 0; + b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ + + mdev->oldest_tle = b; + mdev->newest_tle = b; + INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + + return 1; +} + +static void tl_cleanup(struct drbd_conf *mdev) +{ + D_ASSERT(mdev->oldest_tle == mdev->newest_tle); + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); + kfree(mdev->oldest_tle); + mdev->oldest_tle = NULL; + kfree(mdev->unused_spare_tle); + mdev->unused_spare_tle = NULL; + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; +} + +/** + * _tl_add_barrier() - Adds a barrier to the transfer log + * @mdev: DRBD device. + * @new: Barrier to be added before the current head of the TL. + * + * The caller must hold the req_lock. + */ +void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) +{ + struct drbd_tl_epoch *newest_before; + + INIT_LIST_HEAD(&new->requests); + INIT_LIST_HEAD(&new->w.list); + new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ + new->next = NULL; + new->n_req = 0; + + newest_before = mdev->newest_tle; + /* never send a barrier number == 0, because that is special-cased + * when using TCQ for our write ordering code */ + new->br_number = (newest_before->br_number+1) ?: 1; + if (mdev->newest_tle != new) { + mdev->newest_tle->next = new; + mdev->newest_tle = new; + } +} + +/** + * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL + * @mdev: DRBD device. + * @barrier_nr: Expected identifier of the DRBD write barrier packet. + * @set_size: Expected number of requests before that barrier. + * + * In case the passed barrier_nr or set_size does not match the oldest + * &struct drbd_tl_epoch objects this function will cause a termination + * of the connection. + */ +void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, + unsigned int set_size) +{ + struct drbd_tl_epoch *b, *nob; /* next old barrier */ + struct list_head *le, *tle; + struct drbd_request *r; + + spin_lock_irq(&mdev->req_lock); + + b = mdev->oldest_tle; + + /* first some paranoia code */ + if (b == NULL) { + dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n", + barrier_nr); + goto bail; + } + if (b->br_number != barrier_nr) { + dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n", + barrier_nr, b->br_number); + goto bail; + } + if (b->n_req != set_size) { + dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n", + barrier_nr, set_size, b->n_req); + goto bail; + } + + /* Clean up list of requests processed during current epoch */ + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request, tl_requests); + _req_mod(r, barrier_acked); + } + /* There could be requests on the list waiting for completion + of the write to the local disk. To avoid corruptions of + slab's data structures we have to remove the lists head. + + Also there could have been a barrier ack out of sequence, overtaking + the write acks - which would be a bug and violating write ordering. + To not deadlock in case we lose connection while such requests are + still pending, we need some way to find them for the + _req_mode(connection_lost_while_pending). + + These have been list_move'd to the out_of_sequence_requests list in + _req_mod(, barrier_acked) above. + */ + list_del_init(&b->requests); + + nob = b->next; + if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { + _tl_add_barrier(mdev, b); + if (nob) + mdev->oldest_tle = nob; + /* if nob == NULL b was the only barrier, and becomes the new + barrier. Therefore mdev->oldest_tle points already to b */ + } else { + D_ASSERT(nob != NULL); + mdev->oldest_tle = nob; + kfree(b); + } + + spin_unlock_irq(&mdev->req_lock); + dec_ap_pending(mdev); + + return; + +bail: + spin_unlock_irq(&mdev->req_lock); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); +} + + +/** + * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL + * @mdev: DRBD device. + * + * This is called after the connection to the peer was lost. The storage covered + * by the requests on the transfer gets marked as our of sync. Called from the + * receiver thread and the worker thread. + */ +void tl_clear(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b, *tmp; + struct list_head *le, *tle; + struct drbd_request *r; + int new_initial_bnr = net_random(); + + spin_lock_irq(&mdev->req_lock); + + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + r = list_entry(le, struct drbd_request, tl_requests); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(r, connection_lost_while_pending); + } + tmp = b->next; + + /* there could still be requests on that ring list, + * in case local io is still pending */ + list_del(&b->requests); + + /* dec_ap_pending corresponding to queue_barrier. + * the newest barrier may not have been queued yet, + * in which case w.cb is still NULL. */ + if (b->w.cb != NULL) + dec_ap_pending(mdev); + + if (b == mdev->newest_tle) { + /* recycle, but reinit! */ + D_ASSERT(tmp == NULL); + INIT_LIST_HEAD(&b->requests); + INIT_LIST_HEAD(&b->w.list); + b->w.cb = NULL; + b->br_number = new_initial_bnr; + b->n_req = 0; + + mdev->oldest_tle = b; + break; + } + kfree(b); + b = tmp; + } + + /* we expect this list to be empty. */ + D_ASSERT(list_empty(&mdev->out_of_sequence_requests)); + + /* but just in case, clean it up anyways! */ + list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) { + r = list_entry(le, struct drbd_request, tl_requests); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(r, connection_lost_while_pending); + } + + /* ensure bit indicating barrier is required is clear */ + clear_bit(CREATE_BARRIER, &mdev->flags); + + spin_unlock_irq(&mdev->req_lock); +} + +/** + * cl_wide_st_chg() - TRUE if the state change is a cluster wide one + * @mdev: DRBD device. + * @os: old (current) state. + * @ns: new (wanted) state. + */ +static int cl_wide_st_chg(struct drbd_conf *mdev, + union drbd_state os, union drbd_state ns) +{ + return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED && + ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || + (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || + (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || + (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); +} + +int drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f, + union drbd_state mask, union drbd_state val) +{ + unsigned long flags; + union drbd_state os, ns; + int rv; + + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f, NULL); + ns = mdev->state; + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +/** + * drbd_force_state() - Impose a change which happens outside our control on our state + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + */ +void drbd_force_state(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + drbd_change_state(mdev, CS_HARD, mask, val); +} + +static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns); +static int is_valid_state_transition(struct drbd_conf *, + union drbd_state, union drbd_state); +static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, int *warn_sync_abort); +int drbd_send_state_req(struct drbd_conf *, + union drbd_state, union drbd_state); + +static enum drbd_state_ret_codes _req_st_cond(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + union drbd_state os, ns; + unsigned long flags; + int rv; + + if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags)) + return SS_CW_SUCCESS; + + if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags)) + return SS_CW_FAILED_BY_PEER; + + rv = 0; + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + ns = sanitize_state(mdev, os, ns, NULL); + + if (!cl_wide_st_chg(mdev, os, ns)) + rv = SS_CW_NO_NEED; + if (!rv) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) { + rv = is_valid_state_transition(mdev, ns, os); + if (rv == SS_SUCCESS) + rv = 0; /* cont waiting, otherwise fail. */ + } + } + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +/** + * drbd_req_state() - Perform an eventually cluster wide state change + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Should not be called directly, use drbd_request_state() or + * _drbd_request_state(). + */ +static int drbd_req_state(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val, + enum chg_state_flags f) +{ + struct completion done; + unsigned long flags; + union drbd_state os, ns; + int rv; + + init_completion(&done); + + if (f & CS_SERIALIZE) + mutex_lock(&mdev->state_mutex); + + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + ns = sanitize_state(mdev, os, ns, NULL); + + if (cl_wide_st_chg(mdev, os, ns)) { + rv = is_valid_state(mdev, ns); + if (rv == SS_SUCCESS) + rv = is_valid_state_transition(mdev, ns, os); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (rv < SS_SUCCESS) { + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + drbd_state_lock(mdev); + if (!drbd_send_state_req(mdev, mask, val)) { + drbd_state_unlock(mdev); + rv = SS_CW_FAILED_BY_PEER; + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + + wait_event(mdev->state_wait, + (rv = _req_st_cond(mdev, mask, val))); + + if (rv < SS_SUCCESS) { + drbd_state_unlock(mdev); + if (f & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + goto abort; + } + spin_lock_irqsave(&mdev->req_lock, flags); + os = mdev->state; + ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f, &done); + drbd_state_unlock(mdev); + } else { + rv = _drbd_set_state(mdev, ns, f, &done); + } + + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) { + D_ASSERT(current != mdev->worker.task); + wait_for_completion(&done); + } + +abort: + if (f & CS_SERIALIZE) + mutex_unlock(&mdev->state_mutex); + + return rv; +} + +/** + * _drbd_request_state() - Request a state change (with flags) + * @mdev: DRBD device. + * @mask: mask of state bits to change. + * @val: value of new state bits. + * @f: flags + * + * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE + * flag, or when logging of failed state change requests is not desired. + */ +int _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + int rv; + + wait_event(mdev->state_wait, + (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE); + + return rv; +} + +static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns) +{ + dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n", + name, + drbd_conn_str(ns.conn), + drbd_role_str(ns.role), + drbd_role_str(ns.peer), + drbd_disk_str(ns.disk), + drbd_disk_str(ns.pdsk), + ns.susp ? 's' : 'r', + ns.aftr_isp ? 'a' : '-', + ns.peer_isp ? 'p' : '-', + ns.user_isp ? 'u' : '-' + ); +} + +void print_st_err(struct drbd_conf *mdev, + union drbd_state os, union drbd_state ns, int err) +{ + if (err == SS_IN_TRANSIENT_STATE) + return; + dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err)); + print_st(mdev, " state", os); + print_st(mdev, "wanted", ns); +} + + +#define drbd_peer_str drbd_role_str +#define drbd_pdsk_str drbd_disk_str + +#define drbd_susp_str(A) ((A) ? "1" : "0") +#define drbd_aftr_isp_str(A) ((A) ? "1" : "0") +#define drbd_peer_isp_str(A) ((A) ? "1" : "0") +#define drbd_user_isp_str(A) ((A) ? "1" : "0") + +#define PSC(A) \ + ({ if (ns.A != os.A) { \ + pbp += sprintf(pbp, #A "( %s -> %s ) ", \ + drbd_##A##_str(os.A), \ + drbd_##A##_str(ns.A)); \ + } }) + +/** + * is_valid_state() - Returns an SS_ error code if ns is not valid + * @mdev: DRBD device. + * @ns: State to consider. + */ +static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns) +{ + /* See drbd_state_sw_errors in drbd_strings.c */ + + enum drbd_fencing_p fp; + int rv = SS_SUCCESS; + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + if (get_net_conf(mdev)) { + if (!mdev->net_conf->two_primaries && + ns.role == R_PRIMARY && ns.peer == R_PRIMARY) + rv = SS_TWO_PRIMARIES; + put_net_conf(mdev); + } + + if (rv <= 0) + /* already found a reason to abort */; + else if (ns.role == R_SECONDARY && mdev->open_cnt) + rv = SS_DEVICE_IN_USE; + + else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (fp >= FP_RESOURCE && + ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN) + rv = SS_PRIMARY_NOP; + + else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT) + rv = SS_NO_UP_TO_DATE_DISK; + + else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT) + rv = SS_NO_LOCAL_DISK; + + else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT) + rv = SS_NO_REMOTE_DISK; + + else if ((ns.conn == C_CONNECTED || + ns.conn == C_WF_BITMAP_S || + ns.conn == C_SYNC_SOURCE || + ns.conn == C_PAUSED_SYNC_S) && + ns.disk == D_OUTDATED) + rv = SS_CONNECTED_OUTDATES; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + (mdev->sync_conf.verify_alg[0] == 0)) + rv = SS_NO_VERIFY_ALG; + + else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + mdev->agreed_pro_version < 88) + rv = SS_NOT_SUPPORTED; + + return rv; +} + +/** + * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible + * @mdev: DRBD device. + * @ns: new state. + * @os: old state. + */ +static int is_valid_state_transition(struct drbd_conf *mdev, + union drbd_state ns, union drbd_state os) +{ + int rv = SS_SUCCESS; + + if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) && + os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE) + rv = SS_ALREADY_STANDALONE; + + if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS) + rv = SS_IS_DISKLESS; + + if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED) + rv = SS_NO_NET_CONFIG; + + if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING) + rv = SS_LOWER_THAN_OUTDATED; + + if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) + rv = SS_IN_TRANSIENT_STATE; + + if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) + rv = SS_IN_TRANSIENT_STATE; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && + ns.conn != os.conn && os.conn > C_CONNECTED) + rv = SS_RESYNC_RUNNING; + + if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) && + os.conn < C_CONNECTED) + rv = SS_NEED_CONNECTION; + + return rv; +} + +/** + * sanitize_state() - Resolves implicitly necessary additional changes to a state transition + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. + * @warn_sync_abort: + * + * When we loose connection, we have to set the state of the peers disk (pdsk) + * to D_UNKNOWN. This rule and many more along those lines are in this function. + */ +static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, int *warn_sync_abort) +{ + enum drbd_fencing_p fp; + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + /* Disallow Network errors to configure a device's network part */ + if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) && + os.conn <= C_DISCONNECTING) + ns.conn = os.conn; + + /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */ + if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING) + ns.conn = os.conn; + + /* After C_DISCONNECTING only C_STANDALONE may follow */ + if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) + ns.conn = os.conn; + + if (ns.conn < C_CONNECTED) { + ns.peer_isp = 0; + ns.peer = R_UNKNOWN; + if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT) + ns.pdsk = D_UNKNOWN; + } + + /* Clear the aftr_isp when becoming unconfigured */ + if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY) + ns.aftr_isp = 0; + + if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS) + ns.pdsk = D_UNKNOWN; + + /* Abort resync if a disk fails/detaches */ + if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && + (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { + if (warn_sync_abort) + *warn_sync_abort = 1; + ns.conn = C_CONNECTED; + } + + if (ns.conn >= C_CONNECTED && + ((ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED) || + (ns.disk == D_NEGOTIATING && ns.conn == C_WF_BITMAP_T))) { + switch (ns.conn) { + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + ns.disk = D_OUTDATED; + break; + case C_CONNECTED: + case C_WF_BITMAP_S: + case C_SYNC_SOURCE: + case C_PAUSED_SYNC_S: + ns.disk = D_UP_TO_DATE; + break; + case C_SYNC_TARGET: + ns.disk = D_INCONSISTENT; + dev_warn(DEV, "Implicitly set disk state Inconsistent!\n"); + break; + } + if (os.disk == D_OUTDATED && ns.disk == D_UP_TO_DATE) + dev_warn(DEV, "Implicitly set disk from Outdated to UpToDate\n"); + } + + if (ns.conn >= C_CONNECTED && + (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)) { + switch (ns.conn) { + case C_CONNECTED: + case C_WF_BITMAP_T: + case C_PAUSED_SYNC_T: + case C_SYNC_TARGET: + ns.pdsk = D_UP_TO_DATE; + break; + case C_WF_BITMAP_S: + case C_PAUSED_SYNC_S: + ns.pdsk = D_OUTDATED; + break; + case C_SYNC_SOURCE: + ns.pdsk = D_INCONSISTENT; + dev_warn(DEV, "Implicitly set pdsk Inconsistent!\n"); + break; + } + if (os.pdsk == D_OUTDATED && ns.pdsk == D_UP_TO_DATE) + dev_warn(DEV, "Implicitly set pdsk from Outdated to UpToDate\n"); + } + + /* Connection breaks down before we finished "Negotiating" */ + if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING && + get_ldev_if_state(mdev, D_NEGOTIATING)) { + if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) { + ns.disk = mdev->new_state_tmp.disk; + ns.pdsk = mdev->new_state_tmp.pdsk; + } else { + dev_alert(DEV, "Connection lost while negotiating, no data!\n"); + ns.disk = D_DISKLESS; + ns.pdsk = D_UNKNOWN; + } + put_ldev(mdev); + } + + if (fp == FP_STONITH && + (ns.role == R_PRIMARY && + ns.conn < C_CONNECTED && + ns.pdsk > D_OUTDATED)) + ns.susp = 1; + + if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { + if (ns.conn == C_SYNC_SOURCE) + ns.conn = C_PAUSED_SYNC_S; + if (ns.conn == C_SYNC_TARGET) + ns.conn = C_PAUSED_SYNC_T; + } else { + if (ns.conn == C_PAUSED_SYNC_S) + ns.conn = C_SYNC_SOURCE; + if (ns.conn == C_PAUSED_SYNC_T) + ns.conn = C_SYNC_TARGET; + } + + return ns; +} + +/* helper for __drbd_set_state */ +static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs) +{ + if (cs == C_VERIFY_T) { + /* starting online verify from an arbitrary position + * does not fit well into the existing protocol. + * on C_VERIFY_T, we initialize ov_left and friends + * implicitly in receive_DataRequest once the + * first P_OV_REQUEST is received */ + mdev->ov_start_sector = ~(sector_t)0; + } else { + unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector); + if (bit >= mdev->rs_total) + mdev->ov_start_sector = + BM_BIT_TO_SECT(mdev->rs_total - 1); + mdev->ov_position = mdev->ov_start_sector; + } +} + +/** + * __drbd_set_state() - Set a new DRBD state + * @mdev: DRBD device. + * @ns: new state. + * @flags: Flags + * @done: Optional completion, that will get completed after the after_state_ch() finished + * + * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + */ +int __drbd_set_state(struct drbd_conf *mdev, + union drbd_state ns, enum chg_state_flags flags, + struct completion *done) +{ + union drbd_state os; + int rv = SS_SUCCESS; + int warn_sync_abort = 0; + struct after_state_chg_work *ascw; + + os = mdev->state; + + ns = sanitize_state(mdev, os, ns, &warn_sync_abort); + + if (ns.i == os.i) + return SS_NOTHING_TO_DO; + + if (!(flags & CS_HARD)) { + /* pre-state-change checks ; only look at ns */ + /* See drbd_state_sw_errors in drbd_strings.c */ + + rv = is_valid_state(mdev, ns); + if (rv < SS_SUCCESS) { + /* If the old state was illegal as well, then let + this happen...*/ + + if (is_valid_state(mdev, os) == rv) { + dev_err(DEV, "Considering state change from bad state. " + "Error would be: '%s'\n", + drbd_set_st_err_str(rv)); + print_st(mdev, "old", os); + print_st(mdev, "new", ns); + rv = is_valid_state_transition(mdev, ns, os); + } + } else + rv = is_valid_state_transition(mdev, ns, os); + } + + if (rv < SS_SUCCESS) { + if (flags & CS_VERBOSE) + print_st_err(mdev, os, ns, rv); + return rv; + } + + if (warn_sync_abort) + dev_warn(DEV, "Resync aborted.\n"); + + { + char *pbp, pb[300]; + pbp = pb; + *pbp = 0; + PSC(role); + PSC(peer); + PSC(conn); + PSC(disk); + PSC(pdsk); + PSC(susp); + PSC(aftr_isp); + PSC(peer_isp); + PSC(user_isp); + dev_info(DEV, "%s\n", pb); + } + + /* solve the race between becoming unconfigured, + * worker doing the cleanup, and + * admin reconfiguring us: + * on (re)configure, first set CONFIG_PENDING, + * then wait for a potentially exiting worker, + * start the worker, and schedule one no_op. + * then proceed with configuration. + */ + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY && + !test_and_set_bit(CONFIG_PENDING, &mdev->flags)) + set_bit(DEVICE_DYING, &mdev->flags); + + mdev->state.i = ns.i; + wake_up(&mdev->misc_wait); + wake_up(&mdev->state_wait); + + /* post-state-change actions */ + if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) { + set_bit(STOP_SYNC_TIMER, &mdev->flags); + mod_timer(&mdev->resync_timer, jiffies); + } + + /* aborted verify run. log the last position */ + if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) && + ns.conn < C_CONNECTED) { + mdev->ov_start_sector = + BM_BIT_TO_SECT(mdev->rs_total - mdev->ov_left); + dev_info(DEV, "Online Verify reached sector %llu\n", + (unsigned long long)mdev->ov_start_sector); + } + + if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) && + (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) { + dev_info(DEV, "Syncer continues.\n"); + mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time; + if (ns.conn == C_SYNC_TARGET) { + if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags)) + mod_timer(&mdev->resync_timer, jiffies); + /* This if (!test_bit) is only needed for the case + that a device that has ceased to used its timer, + i.e. it is already in drbd_resync_finished() gets + paused and resumed. */ + } + } + + if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) && + (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) { + dev_info(DEV, "Resync suspended\n"); + mdev->rs_mark_time = jiffies; + if (ns.conn == C_PAUSED_SYNC_T) + set_bit(STOP_SYNC_TIMER, &mdev->flags); + } + + if (os.conn == C_CONNECTED && + (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) { + mdev->ov_position = 0; + mdev->rs_total = + mdev->rs_mark_left = drbd_bm_bits(mdev); + if (mdev->agreed_pro_version >= 90) + set_ov_position(mdev, ns.conn); + else + mdev->ov_start_sector = 0; + mdev->ov_left = mdev->rs_total + - BM_SECT_TO_BIT(mdev->ov_position); + mdev->rs_start = + mdev->rs_mark_time = jiffies; + mdev->ov_last_oos_size = 0; + mdev->ov_last_oos_start = 0; + + if (ns.conn == C_VERIFY_S) { + dev_info(DEV, "Starting Online Verify from sector %llu\n", + (unsigned long long)mdev->ov_position); + mod_timer(&mdev->resync_timer, jiffies); + } + } + + if (get_ldev(mdev)) { + u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND| + MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE| + MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY); + + if (test_bit(CRASHED_PRIMARY, &mdev->flags)) + mdf |= MDF_CRASHED_PRIMARY; + if (mdev->state.role == R_PRIMARY || + (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY)) + mdf |= MDF_PRIMARY_IND; + if (mdev->state.conn > C_WF_REPORT_PARAMS) + mdf |= MDF_CONNECTED_IND; + if (mdev->state.disk > D_INCONSISTENT) + mdf |= MDF_CONSISTENT; + if (mdev->state.disk > D_OUTDATED) + mdf |= MDF_WAS_UP_TO_DATE; + if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT) + mdf |= MDF_PEER_OUT_DATED; + if (mdf != mdev->ldev->md.flags) { + mdev->ldev->md.flags = mdf; + drbd_md_mark_dirty(mdev); + } + if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT) + drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]); + put_ldev(mdev); + } + + /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */ + if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT && + os.peer == R_SECONDARY && ns.peer == R_PRIMARY) + set_bit(CONSIDER_RESYNC, &mdev->flags); + + /* Receiver should clean up itself */ + if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING) + drbd_thread_stop_nowait(&mdev->receiver); + + /* Now the receiver finished cleaning up itself, it should die */ + if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE) + drbd_thread_stop_nowait(&mdev->receiver); + + /* Upon network failure, we need to restart the receiver. */ + if (os.conn > C_TEAR_DOWN && + ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) + drbd_thread_restart_nowait(&mdev->receiver); + + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); + if (ascw) { + ascw->os = os; + ascw->ns = ns; + ascw->flags = flags; + ascw->w.cb = w_after_state_ch; + ascw->done = done; + drbd_queue_work(&mdev->data.work, &ascw->w); + } else { + dev_warn(DEV, "Could not kmalloc an ascw\n"); + } + + return rv; +} + +static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct after_state_chg_work *ascw = + container_of(w, struct after_state_chg_work, w); + after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & CS_WAIT_COMPLETE) { + D_ASSERT(ascw->done != NULL); + complete(ascw->done); + } + kfree(ascw); + + return 1; +} + +static void abw_start_sync(struct drbd_conf *mdev, int rv) +{ + if (rv) { + dev_err(DEV, "Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE); + return; + } + + switch (mdev->state.conn) { + case C_STARTING_SYNC_T: + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + break; + case C_STARTING_SYNC_S: + drbd_start_resync(mdev, C_SYNC_SOURCE); + break; + } +} + +/** + * after_state_ch() - Perform after state change actions that may sleep + * @mdev: DRBD device. + * @os: old state. + * @ns: new state. + * @flags: Flags + */ +static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, + union drbd_state ns, enum chg_state_flags flags) +{ + enum drbd_fencing_p fp; + + if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) { + clear_bit(CRASHED_PRIMARY, &mdev->flags); + if (mdev->p_uuid) + mdev->p_uuid[UI_FLAGS] &= ~((u64)2); + } + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + /* Inform userspace about the change... */ + drbd_bcast_state(mdev, ns); + + if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) && + (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)) + drbd_khelper(mdev, "pri-on-incon-degr"); + + /* Here we have the actions that are performed after a + state change. This function might sleep */ + + if (fp == FP_STONITH && ns.susp) { + /* case1: The outdate peer handler is successful: + * case2: The connection was established again: */ + if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) || + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) { + tl_clear(mdev); + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + } + /* Do not change the order of the if above and the two below... */ + if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S) + drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL, "send_bitmap (WFBitMapS)"); + + /* Lost contact to peer's copy of the data */ + if ((os.pdsk >= D_INCONSISTENT && + os.pdsk != D_UNKNOWN && + os.pdsk != D_OUTDATED) + && (ns.pdsk < D_INCONSISTENT || + ns.pdsk == D_UNKNOWN || + ns.pdsk == D_OUTDATED)) { + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + if (get_ldev(mdev)) { + if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { + drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); + } + put_ldev(mdev); + } + } + + if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { + if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) + drbd_uuid_new_current(mdev); + + /* D_DISKLESS Peer becomes secondary */ + if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) + drbd_al_to_on_disk_bm(mdev); + put_ldev(mdev); + } + + /* Last part of the attaching process ... */ + if (ns.conn >= C_CONNECTED && + os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { + kfree(mdev->p_uuid); /* We expect to receive up-to-date UUIDs soon. */ + mdev->p_uuid = NULL; /* ...to not use the old ones in the mean time */ + drbd_send_sizes(mdev, 0); /* to start sync... */ + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + + /* We want to pause/continue resync, tell peer. */ + if (ns.conn >= C_CONNECTED && + ((os.aftr_isp != ns.aftr_isp) || + (os.user_isp != ns.user_isp))) + drbd_send_state(mdev); + + /* In case one of the isp bits got set, suspend other devices. */ + if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && + (ns.aftr_isp || ns.peer_isp || ns.user_isp)) + suspend_other_sg(mdev); + + /* Make sure the peer gets informed about eventual state + changes (ISP bits) while we were in WFReportParams. */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) + drbd_send_state(mdev); + + /* We are in the progress to start a full sync... */ + if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || + (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S)) + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync, "set_n_write from StartingSync"); + + /* We are invalidating our self... */ + if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED && + os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT) + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate"); + + if (os.disk > D_FAILED && ns.disk == D_FAILED) { + enum drbd_io_error_p eh; + + eh = EP_PASS_ON; + if (get_ldev_if_state(mdev, D_FAILED)) { + eh = mdev->ldev->dc.on_io_error; + put_ldev(mdev); + } + + drbd_rs_cancel_all(mdev); + /* since get_ldev() only works as long as disk>=D_INCONSISTENT, + and it is D_DISKLESS here, local_cnt can only go down, it can + not increase... It will reach zero */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL); + spin_unlock_irq(&mdev->req_lock); + + if (eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + } + + if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) { + + if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ { + if (drbd_send_state(mdev)) + dev_warn(DEV, "Notified peer that my disk is broken.\n"); + else + dev_err(DEV, "Sending state in drbd_io_error() failed\n"); + } + + lc_destroy(mdev->resync); + mdev->resync = NULL; + lc_destroy(mdev->act_log); + mdev->act_log = NULL; + __no_warn(local, + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL;); + + if (mdev->md_io_tmpp) + __free_page(mdev->md_io_tmpp); + } + + /* Disks got bigger while they were detached */ + if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && + test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { + if (ns.conn == C_CONNECTED) + resync_after_online_grow(mdev); + } + + /* A resync finished or aborted, wake paused devices... */ + if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) || + (os.peer_isp && !ns.peer_isp) || + (os.user_isp && !ns.user_isp)) + resume_next_sg(mdev); + + /* Upon network connection, we need to start the receiver */ + if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED) + drbd_thread_start(&mdev->receiver); + + /* Terminate worker thread if we are unconfigured - it will be + restarted as needed... */ + if (ns.disk == D_DISKLESS && + ns.conn == C_STANDALONE && + ns.role == R_SECONDARY) { + if (os.aftr_isp != ns.aftr_isp) + resume_next_sg(mdev); + /* set in __drbd_set_state, unless CONFIG_PENDING was set */ + if (test_bit(DEVICE_DYING, &mdev->flags)) + drbd_thread_stop_nowait(&mdev->worker); + } + + drbd_md_sync(mdev); +} + + +static int drbd_thread_setup(void *arg) +{ + struct drbd_thread *thi = (struct drbd_thread *) arg; + struct drbd_conf *mdev = thi->mdev; + unsigned long flags; + int retval; + +restart: + retval = thi->function(thi); + + spin_lock_irqsave(&thi->t_lock, flags); + + /* if the receiver has been "Exiting", the last thing it did + * was set the conn state to "StandAlone", + * if now a re-connect request comes in, conn state goes C_UNCONNECTED, + * and receiver thread will be "started". + * drbd_thread_start needs to set "Restarting" in that case. + * t_state check and assignment needs to be within the same spinlock, + * so either thread_start sees Exiting, and can remap to Restarting, + * or thread_start see None, and can proceed as normal. + */ + + if (thi->t_state == Restarting) { + dev_info(DEV, "Restarting %s\n", current->comm); + thi->t_state = Running; + spin_unlock_irqrestore(&thi->t_lock, flags); + goto restart; + } + + thi->task = NULL; + thi->t_state = None; + smp_mb(); + complete(&thi->stop); + spin_unlock_irqrestore(&thi->t_lock, flags); + + dev_info(DEV, "Terminating %s\n", current->comm); + + /* Release mod reference taken when thread was started */ + module_put(THIS_MODULE); + return retval; +} + +static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi, + int (*func) (struct drbd_thread *)) +{ + spin_lock_init(&thi->t_lock); + thi->task = NULL; + thi->t_state = None; + thi->function = func; + thi->mdev = mdev; +} + +int drbd_thread_start(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + struct task_struct *nt; + unsigned long flags; + + const char *me = + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? "worker" : "NONSENSE"; + + /* is used from state engine doing drbd_thread_stop_nowait, + * while holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + switch (thi->t_state) { + case None: + dev_info(DEV, "Starting %s thread (from %s [%d])\n", + me, current->comm, current->pid); + + /* Get ref on module for thread - this is released when thread exits */ + if (!try_module_get(THIS_MODULE)) { + dev_err(DEV, "Failed to get module reference in drbd_thread_start\n"); + spin_unlock_irqrestore(&thi->t_lock, flags); + return FALSE; + } + + init_completion(&thi->stop); + D_ASSERT(thi->task == NULL); + thi->reset_cpu_mask = 1; + thi->t_state = Running; + spin_unlock_irqrestore(&thi->t_lock, flags); + flush_signals(current); /* otherw. may get -ERESTARTNOINTR */ + + nt = kthread_create(drbd_thread_setup, (void *) thi, + "drbd%d_%s", mdev_to_minor(mdev), me); + + if (IS_ERR(nt)) { + dev_err(DEV, "Couldn't start thread\n"); + + module_put(THIS_MODULE); + return FALSE; + } + spin_lock_irqsave(&thi->t_lock, flags); + thi->task = nt; + thi->t_state = Running; + spin_unlock_irqrestore(&thi->t_lock, flags); + wake_up_process(nt); + break; + case Exiting: + thi->t_state = Restarting; + dev_info(DEV, "Restarting %s thread (from %s [%d])\n", + me, current->comm, current->pid); + /* fall through */ + case Running: + case Restarting: + default: + spin_unlock_irqrestore(&thi->t_lock, flags); + break; + } + + return TRUE; +} + + +void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait) +{ + unsigned long flags; + + enum drbd_thread_state ns = restart ? Restarting : Exiting; + + /* may be called from state engine, holding the req lock irqsave */ + spin_lock_irqsave(&thi->t_lock, flags); + + if (thi->t_state == None) { + spin_unlock_irqrestore(&thi->t_lock, flags); + if (restart) + drbd_thread_start(thi); + return; + } + + if (thi->t_state != ns) { + if (thi->task == NULL) { + spin_unlock_irqrestore(&thi->t_lock, flags); + return; + } + + thi->t_state = ns; + smp_mb(); + init_completion(&thi->stop); + if (thi->task != current) + force_sig(DRBD_SIGKILL, thi->task); + + } + + spin_unlock_irqrestore(&thi->t_lock, flags); + + if (wait) + wait_for_completion(&thi->stop); +} + +#ifdef CONFIG_SMP +/** + * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs + * @mdev: DRBD device. + * + * Forces all threads of a device onto the same CPU. This is beneficial for + * DRBD's performance. May be overwritten by user's configuration. + */ +void drbd_calc_cpu_mask(struct drbd_conf *mdev) +{ + int ord, cpu; + + /* user override. */ + if (cpumask_weight(mdev->cpu_mask)) + return; + + ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask); + for_each_online_cpu(cpu) { + if (ord-- == 0) { + cpumask_set_cpu(cpu, mdev->cpu_mask); + return; + } + } + /* should not be reached */ + cpumask_setall(mdev->cpu_mask); +} + +/** + * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread + * @mdev: DRBD device. + * + * call in the "main loop" of _all_ threads, no need for any mutex, current won't die + * prematurely. + */ +void drbd_thread_current_set_cpu(struct drbd_conf *mdev) +{ + struct task_struct *p = current; + struct drbd_thread *thi = + p == mdev->asender.task ? &mdev->asender : + p == mdev->receiver.task ? &mdev->receiver : + p == mdev->worker.task ? &mdev->worker : + NULL; + ERR_IF(thi == NULL) + return; + if (!thi->reset_cpu_mask) + return; + thi->reset_cpu_mask = 0; + set_cpus_allowed_ptr(p, mdev->cpu_mask); +} +#endif + +/* the appropriate socket mutex must be held already */ +int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, + enum drbd_packets cmd, struct p_header *h, + size_t size, unsigned msg_flags) +{ + int sent, ok; + + ERR_IF(!h) return FALSE; + ERR_IF(!size) return FALSE; + + h->magic = BE_DRBD_MAGIC; + h->command = cpu_to_be16(cmd); + h->length = cpu_to_be16(size-sizeof(struct p_header)); + + trace_drbd_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__); + sent = drbd_send(mdev, sock, h, size, msg_flags); + + ok = (sent == size); + if (!ok) + dev_err(DEV, "short sent %s size=%d sent=%d\n", + cmdname(cmd), (int)size, sent); + return ok; +} + +/* don't pass the socket. we may only look at it + * when we hold the appropriate socket mutex. + */ +int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket, + enum drbd_packets cmd, struct p_header *h, size_t size) +{ + int ok = 0; + struct socket *sock; + + if (use_data_socket) { + mutex_lock(&mdev->data.mutex); + sock = mdev->data.socket; + } else { + mutex_lock(&mdev->meta.mutex); + sock = mdev->meta.socket; + } + + /* drbd_disconnect() could have called drbd_free_sock() + * while we were waiting in down()... */ + if (likely(sock != NULL)) + ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0); + + if (use_data_socket) + mutex_unlock(&mdev->data.mutex); + else + mutex_unlock(&mdev->meta.mutex); + return ok; +} + +int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, + size_t size) +{ + struct p_header h; + int ok; + + h.magic = BE_DRBD_MAGIC; + h.command = cpu_to_be16(cmd); + h.length = cpu_to_be16(size); + + if (!drbd_get_data_sock(mdev)) + return 0; + + trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__); + + ok = (sizeof(h) == + drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); + ok = ok && (size == + drbd_send(mdev, mdev->data.socket, data, size, 0)); + + drbd_put_data_sock(mdev); + + return ok; +} + +int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc) +{ + struct p_rs_param_89 *p; + struct socket *sock; + int size, rv; + const int apv = mdev->agreed_pro_version; + + size = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + strlen(mdev->sync_conf.verify_alg) + 1 + : /* 89 */ sizeof(struct p_rs_param_89); + + /* used from admin command context and receiver/worker context. + * to avoid kmalloc, grab the socket right here, + * then use the pre-allocated sbuf there */ + mutex_lock(&mdev->data.mutex); + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM; + + p = &mdev->data.sbuf.rs_param_89; + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + p->rate = cpu_to_be32(sc->rate); + + if (apv >= 88) + strcpy(p->verify_alg, mdev->sync_conf.verify_alg); + if (apv >= 89) + strcpy(p->csums_alg, mdev->sync_conf.csums_alg); + + rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0); + } else + rv = 0; /* not ok */ + + mutex_unlock(&mdev->data.mutex); + + return rv; +} + +int drbd_send_protocol(struct drbd_conf *mdev) +{ + struct p_protocol *p; + int size, rv; + + size = sizeof(struct p_protocol); + + if (mdev->agreed_pro_version >= 87) + size += strlen(mdev->net_conf->integrity_alg) + 1; + + /* we must not recurse into our own queue, + * as that is blocked during handshake */ + p = kmalloc(size, GFP_NOIO); + if (p == NULL) + return 0; + + p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol); + p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p); + p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p); + p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p); + p->want_lose = cpu_to_be32(mdev->net_conf->want_lose); + p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries); + + if (mdev->agreed_pro_version >= 87) + strcpy(p->integrity_alg, mdev->net_conf->integrity_alg); + + rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL, + (struct p_header *)p, size); + kfree(p); + return rv; +} + +int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags) +{ + struct p_uuids p; + int i; + + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) + return 1; + + for (i = UI_CURRENT; i < UI_SIZE; i++) + p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0; + + mdev->comm_bm_set = drbd_bm_total_weight(mdev); + p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set); + uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; + uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; + uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0; + p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags); + + put_ldev(mdev); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS, + (struct p_header *)&p, sizeof(p)); +} + +int drbd_send_uuids(struct drbd_conf *mdev) +{ + return _drbd_send_uuids(mdev, 0); +} + +int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev) +{ + return _drbd_send_uuids(mdev, 8); +} + + +int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val) +{ + struct p_rs_uuid p; + + p.uuid = cpu_to_be64(val); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID, + (struct p_header *)&p, sizeof(p)); +} + +int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply) +{ + struct p_sizes p; + sector_t d_size, u_size; + int q_order_type; + int ok; + + if (get_ldev_if_state(mdev, D_NEGOTIATING)) { + D_ASSERT(mdev->ldev->backing_bdev); + d_size = drbd_get_max_capacity(mdev->ldev); + u_size = mdev->ldev->dc.disk_size; + q_order_type = drbd_queue_order_type(mdev); + p.queue_order_type = cpu_to_be32(drbd_queue_order_type(mdev)); + put_ldev(mdev); + } else { + d_size = 0; + u_size = 0; + q_order_type = QUEUE_ORDERED_NONE; + } + + p.d_size = cpu_to_be64(d_size); + p.u_size = cpu_to_be64(u_size); + p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); + p.max_segment_size = cpu_to_be32(queue_max_segment_size(mdev->rq_queue)); + p.queue_order_type = cpu_to_be32(q_order_type); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +/** + * drbd_send_state() - Sends the drbd state to the peer + * @mdev: DRBD device. + */ +int drbd_send_state(struct drbd_conf *mdev) +{ + struct socket *sock; + struct p_state p; + int ok = 0; + + /* Grab state lock so we wont send state if we're in the middle + * of a cluster wide state change on another thread */ + drbd_state_lock(mdev); + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + drbd_state_unlock(mdev); + return ok; +} + +int drbd_send_state_req(struct drbd_conf *mdev, + union drbd_state mask, union drbd_state val) +{ + struct p_req_state p; + + p.mask = cpu_to_be32(mask.i); + p.val = cpu_to_be32(val.i); + + return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ, + (struct p_header *)&p, sizeof(p)); +} + +int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode) +{ + struct p_req_state_reply p; + + p.retcode = cpu_to_be32(retcode); + + return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY, + (struct p_header *)&p, sizeof(p)); +} + +int fill_bitmap_rle_bits(struct drbd_conf *mdev, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c) +{ + struct bitstream bs; + unsigned long plain_bits; + unsigned long tmp; + unsigned long rl; + unsigned len; + unsigned toggle; + int bits; + + /* may we use this feature? */ + if ((mdev->sync_conf.use_rle == 0) || + (mdev->agreed_pro_version < 90)) + return 0; + + if (c->bit_offset >= c->bm_bits) + return 0; /* nothing to do. */ + + /* use at most thus many bytes */ + bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0); + memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX); + /* plain bits covered in this code string */ + plain_bits = 0; + + /* p->encoding & 0x80 stores whether the first run length is set. + * bit offset is implicit. + * start with toggle == 2 to be able to tell the first iteration */ + toggle = 2; + + /* see how much plain bits we can stuff into one packet + * using RLE and VLI. */ + do { + tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset) + : _drbd_bm_find_next(mdev, c->bit_offset); + if (tmp == -1UL) + tmp = c->bm_bits; + rl = tmp - c->bit_offset; + + if (toggle == 2) { /* first iteration */ + if (rl == 0) { + /* the first checked bit was set, + * store start value, */ + DCBP_set_start(p, 1); + /* but skip encoding of zero run length */ + toggle = !toggle; + continue; + } + DCBP_set_start(p, 0); + } + + /* paranoia: catch zero runlength. + * can only happen if bitmap is modified while we scan it. */ + if (rl == 0) { + dev_err(DEV, "unexpected zero runlength while encoding bitmap " + "t:%u bo:%lu\n", toggle, c->bit_offset); + return -1; + } + + bits = vli_encode_bits(&bs, rl); + if (bits == -ENOBUFS) /* buffer full */ + break; + if (bits <= 0) { + dev_err(DEV, "error while encoding bitmap: %d\n", bits); + return 0; + } + + toggle = !toggle; + plain_bits += rl; + c->bit_offset = tmp; + } while (c->bit_offset < c->bm_bits); + + len = bs.cur.b - p->code + !!bs.cur.bit; + + if (plain_bits < (len << 3)) { + /* incompressible with this method. + * we need to rewind both word and bit position. */ + c->bit_offset -= plain_bits; + bm_xfer_ctx_bit_to_word_offset(c); + c->bit_offset = c->word_offset * BITS_PER_LONG; + return 0; + } + + /* RLE + VLI was able to compress it just fine. + * update c->word_offset. */ + bm_xfer_ctx_bit_to_word_offset(c); + + /* store pad_bits */ + DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7); + + return len; +} + +enum { OK, FAILED, DONE } +send_bitmap_rle_or_plain(struct drbd_conf *mdev, + struct p_header *h, struct bm_xfer_ctx *c) +{ + struct p_compressed_bm *p = (void*)h; + unsigned long num_words; + int len; + int ok; + + len = fill_bitmap_rle_bits(mdev, p, c); + + if (len < 0) + return FAILED; + + if (len) { + DCBP_set_code(p, RLE_VLI_Bits); + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h, + sizeof(*p) + len, 0); + + c->packets[0]++; + c->bytes[0] += sizeof(*p) + len; + + if (c->bit_offset >= c->bm_bits) + len = 0; /* DONE */ + } else { + /* was not compressible. + * send a buffer full of plain text bits instead. */ + num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); + len = num_words * sizeof(long); + if (len) + drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload); + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP, + h, sizeof(struct p_header) + len, 0); + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + + c->packets[1]++; + c->bytes[1] += sizeof(struct p_header) + len; + + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + } + ok = ok ? ((len == 0) ? DONE : OK) : FAILED; + + if (ok == DONE) + INFO_bm_xfer_stats(mdev, "send", c); + return ok; +} + +/* See the comment at receive_bitmap() */ +int _drbd_send_bitmap(struct drbd_conf *mdev) +{ + struct bm_xfer_ctx c; + struct p_header *p; + int ret; + + ERR_IF(!mdev->bitmap) return FALSE; + + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? */ + p = (struct p_header *) __get_free_page(GFP_NOIO); + if (!p) { + dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); + return FALSE; + } + + if (get_ldev(mdev)) { + if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { + dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n"); + drbd_bm_set_all(mdev); + if (drbd_bm_write(mdev)) { + /* write_bm did fail! Leave full sync flag set in Meta P_DATA + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + dev_err(DEV, "Failed to write bitmap to disk!\n"); + } else { + drbd_md_clear_flag(mdev, MDF_FULL_SYNC); + drbd_md_sync(mdev); + } + } + put_ldev(mdev); + } + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(mdev), + .bm_words = drbd_bm_words(mdev), + }; + + do { + ret = send_bitmap_rle_or_plain(mdev, p, &c); + } while (ret == OK); + + free_page((unsigned long) p); + return (ret == DONE); +} + +int drbd_send_bitmap(struct drbd_conf *mdev) +{ + int err; + + if (!drbd_get_data_sock(mdev)) + return -1; + err = !_drbd_send_bitmap(mdev); + drbd_put_data_sock(mdev); + return err; +} + +int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size) +{ + int ok; + struct p_barrier_ack p; + + p.barrier = barrier_nr; + p.set_size = cpu_to_be32(set_size); + + if (mdev->state.conn < C_CONNECTED) + return FALSE; + ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +/** + * _drbd_send_ack() - Sends an ack packet + * @mdev: DRBD device. + * @cmd: Packet command code. + * @sector: sector, needs to be in big endian byte order + * @blksize: size in byte, needs to be in big endian byte order + * @block_id: Id, big endian byte order + */ +static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd, + u64 sector, + u32 blksize, + u64 block_id) +{ + int ok; + struct p_block_ack p; + + p.sector = sector; + p.block_id = block_id; + p.blksize = blksize; + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); + + if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED) + return FALSE; + ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_data *dp) +{ + const int header_size = sizeof(struct p_data) + - sizeof(struct p_header); + int data_size = ((struct p_header *)dp)->length - header_size; + + return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size), + dp->block_id); +} + +int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd, + struct p_block_req *rp) +{ + return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id); +} + +/** + * drbd_send_ack() - Sends an ack packet + * @mdev: DRBD device. + * @cmd: Packet command code. + * @e: Epoch entry. + */ +int drbd_send_ack(struct drbd_conf *mdev, + enum drbd_packets cmd, struct drbd_epoch_entry *e) +{ + return _drbd_send_ack(mdev, cmd, + cpu_to_be64(e->sector), + cpu_to_be32(e->size), + e->block_id); +} + +/* This function misuses the block_id field to signal if the blocks + * are is sync or not. */ +int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd, + sector_t sector, int blksize, u64 block_id) +{ + return _drbd_send_ack(mdev, cmd, + cpu_to_be64(sector), + cpu_to_be32(blksize), + cpu_to_be64(block_id)); +} + +int drbd_send_drequest(struct drbd_conf *mdev, int cmd, + sector_t sector, int size, u64 block_id) +{ + int ok; + struct p_block_req p; + + p.sector = cpu_to_be64(sector); + p.block_id = block_id; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +int drbd_send_drequest_csum(struct drbd_conf *mdev, + sector_t sector, int size, + void *digest, int digest_size, + enum drbd_packets cmd) +{ + int ok; + struct p_block_req p; + + p.sector = cpu_to_be64(sector); + p.block_id = BE_DRBD_MAGIC + 0xbeef; + p.blksize = cpu_to_be32(size); + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size); + + mutex_lock(&mdev->data.mutex); + + ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0)); + ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0)); + + mutex_unlock(&mdev->data.mutex); + + return ok; +} + +int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size) +{ + int ok; + struct p_block_req p; + + p.sector = cpu_to_be64(sector); + p.block_id = BE_DRBD_MAGIC + 0xbabe; + p.blksize = cpu_to_be32(size); + + ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST, + (struct p_header *)&p, sizeof(p)); + return ok; +} + +/* called on sndtimeo + * returns FALSE if we should retry, + * TRUE if we think connection is dead + */ +static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock) +{ + int drop_it; + /* long elapsed = (long)(jiffies - mdev->last_received); */ + + drop_it = mdev->meta.socket == sock + || !mdev->asender.task + || get_t_state(&mdev->asender) != Running + || mdev->state.conn < C_CONNECTED; + + if (drop_it) + return TRUE; + + drop_it = !--mdev->ko_count; + if (!drop_it) { + dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n", + current->comm, current->pid, mdev->ko_count); + request_ping(mdev); + } + + return drop_it; /* && (mdev->state == R_PRIMARY) */; +} + +/* The idea of sendpage seems to be to put some kind of reference + * to the page into the skb, and to hand it over to the NIC. In + * this process get_page() gets called. + * + * As soon as the page was really sent over the network put_page() + * gets called by some part of the network layer. [ NIC driver? ] + * + * [ get_page() / put_page() increment/decrement the count. If count + * reaches 0 the page will be freed. ] + * + * This works nicely with pages from FSs. + * But this means that in protocol A we might signal IO completion too early! + * + * In order not to corrupt data during a resync we must make sure + * that we do not reuse our own buffer pages (EEs) to early, therefore + * we have the net_ee list. + * + * XFS seems to have problems, still, it submits pages with page_count == 0! + * As a workaround, we disable sendpage on pages + * with page_count == 0 or PageSlab. + */ +static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page, + int offset, size_t size) +{ + int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, 0); + kunmap(page); + if (sent == size) + mdev->send_cnt += size>>9; + return sent == size; +} + +static int _drbd_send_page(struct drbd_conf *mdev, struct page *page, + int offset, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int sent, ok; + int len = size; + + /* e.g. XFS meta- & log-data is in slab pages, which have a + * page_count of 0 and/or have PageSlab() set. + * we cannot use send_page for those, as that does get_page(); + * put_page(); and would cause either a VM_BUG directly, or + * __page_cache_release a page that would actually still be referenced + * by someone, leading to some obscure delayed Oops somewhere else. */ + if (disable_sendpage || (page_count(page) < 1) || PageSlab(page)) + return _drbd_no_send_page(mdev, page, offset, size); + + drbd_update_congested(mdev); + set_fs(KERNEL_DS); + do { + sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page, + offset, len, + MSG_NOSIGNAL); + if (sent == -EAGAIN) { + if (we_should_drop_the_connection(mdev, + mdev->data.socket)) + break; + else + continue; + } + if (sent <= 0) { + dev_warn(DEV, "%s: size=%d len=%d sent=%d\n", + __func__, (int)size, len, sent); + break; + } + len -= sent; + offset += sent; + } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/); + set_fs(oldfs); + clear_bit(NET_CONGESTED, &mdev->flags); + + ok = (len == 0); + if (likely(ok)) + mdev->send_cnt += size>>9; + return ok; +} + +static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) +{ + struct bio_vec *bvec; + int i; + __bio_for_each_segment(bvec, bio, i, 0) { + if (!_drbd_no_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len)) + return 0; + } + return 1; +} + +static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) +{ + struct bio_vec *bvec; + int i; + __bio_for_each_segment(bvec, bio, i, 0) { + if (!_drbd_send_page(mdev, bvec->bv_page, + bvec->bv_offset, bvec->bv_len)) + return 0; + } + + return 1; +} + +/* Used to send write requests + * R_PRIMARY -> Peer (P_DATA) + */ +int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) +{ + int ok = 1; + struct p_data p; + unsigned int dp_flags = 0; + void *dgb; + int dgs; + + if (!drbd_get_data_sock(mdev)) + return 0; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? + crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(P_DATA); + p.head.length = + cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size); + + p.sector = cpu_to_be64(req->sector); + p.block_id = (unsigned long)req; + p.seq_num = cpu_to_be32(req->seq_num = + atomic_add_return(1, &mdev->packet_seq)); + dp_flags = 0; + + /* NOTE: no need to check if barriers supported here as we would + * not pass the test in make_request_common in that case + */ + if (bio_rw_flagged(req->master_bio, BIO_RW_BARRIER)) { + dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n"); + /* dp_flags |= DP_HARDBARRIER; */ + } + if (bio_rw_flagged(req->master_bio, BIO_RW_SYNCIO)) + dp_flags |= DP_RW_SYNC; + /* for now handle SYNCIO and UNPLUG + * as if they still were one and the same flag */ + if (bio_rw_flagged(req->master_bio, BIO_RW_UNPLUG)) + dp_flags |= DP_RW_SYNC; + if (mdev->state.conn >= C_SYNC_SOURCE && + mdev->state.conn <= C_PAUSED_SYNC_T) + dp_flags |= DP_MAY_SET_IN_SYNC; + + p.dp_flags = cpu_to_be32(dp_flags); + trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); + set_bit(UNPLUG_REMOTE, &mdev->flags); + ok = (sizeof(p) == + drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); + if (ok && dgs) { + dgb = mdev->int_dig_out; + drbd_csum(mdev, mdev->integrity_w_tfm, req->master_bio, dgb); + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); + } + if (ok) { + if (mdev->net_conf->wire_protocol == DRBD_PROT_A) + ok = _drbd_send_bio(mdev, req->master_bio); + else + ok = _drbd_send_zc_bio(mdev, req->master_bio); + } + + drbd_put_data_sock(mdev); + return ok; +} + +/* answer packet, used to send data back for read requests: + * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY) + * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY) + */ +int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, + struct drbd_epoch_entry *e) +{ + int ok; + struct p_data p; + void *dgb; + int dgs; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ? + crypto_hash_digestsize(mdev->integrity_w_tfm) : 0; + + p.head.magic = BE_DRBD_MAGIC; + p.head.command = cpu_to_be16(cmd); + p.head.length = + cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size); + + p.sector = cpu_to_be64(e->sector); + p.block_id = e->block_id; + /* p.seq_num = 0; No sequence numbers here.. */ + + /* Only called by our kernel thread. + * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL + * in response to admin command or module unload. + */ + if (!drbd_get_data_sock(mdev)) + return 0; + + trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); + ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, + sizeof(p), MSG_MORE); + if (ok && dgs) { + dgb = mdev->int_dig_out; + drbd_csum(mdev, mdev->integrity_w_tfm, e->private_bio, dgb); + ok = drbd_send(mdev, mdev->data.socket, dgb, dgs, MSG_MORE); + } + if (ok) + ok = _drbd_send_zc_bio(mdev, e->private_bio); + + drbd_put_data_sock(mdev); + return ok; +} + +/* + drbd_send distinguishes two cases: + + Packets sent via the data socket "sock" + and packets sent via the meta data socket "msock" + + sock msock + -----------------+-------------------------+------------------------------ + timeout conf.timeout / 2 conf.timeout / 2 + timeout action send a ping via msock Abort communication + and close all sockets +*/ + +/* + * you must have down()ed the appropriate [m]sock_mutex elsewhere! + */ +int drbd_send(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, unsigned msg_flags) +{ + struct kvec iov; + struct msghdr msg; + int rv, sent = 0; + + if (!sock) + return -1000; + + /* THINK if (signal_pending) return ... ? */ + + iov.iov_base = buf; + iov.iov_len = size; + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_flags = msg_flags | MSG_NOSIGNAL; + + if (sock == mdev->data.socket) { + mdev->ko_count = mdev->net_conf->ko_count; + drbd_update_congested(mdev); + } + do { + /* STRANGE + * tcp_sendmsg does _not_ use its size parameter at all ? + * + * -EAGAIN on timeout, -EINTR on signal. + */ +/* THINK + * do we need to block DRBD_SIG if sock == &meta.socket ?? + * otherwise wake_asender() might interrupt some send_*Ack ! + */ + rv = kernel_sendmsg(sock, &msg, &iov, 1, size); + if (rv == -EAGAIN) { + if (we_should_drop_the_connection(mdev, sock)) + break; + else + continue; + } + D_ASSERT(rv != 0); + if (rv == -EINTR) { + flush_signals(current); + rv = 0; + } + if (rv < 0) + break; + sent += rv; + iov.iov_base += rv; + iov.iov_len -= rv; + } while (sent < size); + + if (sock == mdev->data.socket) + clear_bit(NET_CONGESTED, &mdev->flags); + + if (rv <= 0) { + if (rv != -EAGAIN) { + dev_err(DEV, "%s_sendmsg returned %d\n", + sock == mdev->meta.socket ? "msock" : "sock", + rv); + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); + } else + drbd_force_state(mdev, NS(conn, C_TIMEOUT)); + } + + return sent; +} + +static int drbd_open(struct block_device *bdev, fmode_t mode) +{ + struct drbd_conf *mdev = bdev->bd_disk->private_data; + unsigned long flags; + int rv = 0; + + spin_lock_irqsave(&mdev->req_lock, flags); + /* to have a stable mdev->state.role + * and no race with updating open_cnt */ + + if (mdev->state.role != R_PRIMARY) { + if (mode & FMODE_WRITE) + rv = -EROFS; + else if (!allow_oos) + rv = -EMEDIUMTYPE; + } + + if (!rv) + mdev->open_cnt++; + spin_unlock_irqrestore(&mdev->req_lock, flags); + + return rv; +} + +static int drbd_release(struct gendisk *gd, fmode_t mode) +{ + struct drbd_conf *mdev = gd->private_data; + mdev->open_cnt--; + return 0; +} + +static void drbd_unplug_fn(struct request_queue *q) +{ + struct drbd_conf *mdev = q->queuedata; + + trace_drbd_unplug(mdev, "got unplugged"); + + /* unplug FIRST */ + spin_lock_irq(q->queue_lock); + blk_remove_plug(q); + spin_unlock_irq(q->queue_lock); + + /* only if connected */ + spin_lock_irq(&mdev->req_lock); + if (mdev->state.pdsk >= D_INCONSISTENT && mdev->state.conn >= C_CONNECTED) { + D_ASSERT(mdev->state.role == R_PRIMARY); + if (test_and_clear_bit(UNPLUG_REMOTE, &mdev->flags)) { + /* add to the data.work queue, + * unless already queued. + * XXX this might be a good addition to drbd_queue_work + * anyways, to detect "double queuing" ... */ + if (list_empty(&mdev->unplug_work.list)) + drbd_queue_work(&mdev->data.work, + &mdev->unplug_work); + } + } + spin_unlock_irq(&mdev->req_lock); + + if (mdev->state.disk >= D_INCONSISTENT) + drbd_kick_lo(mdev); +} + +static void drbd_set_defaults(struct drbd_conf *mdev) +{ + mdev->sync_conf.after = DRBD_AFTER_DEF; + mdev->sync_conf.rate = DRBD_RATE_DEF; + mdev->sync_conf.al_extents = DRBD_AL_EXTENTS_DEF; + mdev->state = (union drbd_state) { + { .role = R_SECONDARY, + .peer = R_UNKNOWN, + .conn = C_STANDALONE, + .disk = D_DISKLESS, + .pdsk = D_UNKNOWN, + .susp = 0 + } }; +} + +void drbd_init_set_defaults(struct drbd_conf *mdev) +{ + /* the memset(,0,) did most of this. + * note: only assignments, no allocation in here */ + + drbd_set_defaults(mdev); + + /* for now, we do NOT yet support it, + * even though we start some framework + * to eventually support barriers */ + set_bit(NO_BARRIER_SUPP, &mdev->flags); + + atomic_set(&mdev->ap_bio_cnt, 0); + atomic_set(&mdev->ap_pending_cnt, 0); + atomic_set(&mdev->rs_pending_cnt, 0); + atomic_set(&mdev->unacked_cnt, 0); + atomic_set(&mdev->local_cnt, 0); + atomic_set(&mdev->net_cnt, 0); + atomic_set(&mdev->packet_seq, 0); + atomic_set(&mdev->pp_in_use, 0); + + mutex_init(&mdev->md_io_mutex); + mutex_init(&mdev->data.mutex); + mutex_init(&mdev->meta.mutex); + sema_init(&mdev->data.work.s, 0); + sema_init(&mdev->meta.work.s, 0); + mutex_init(&mdev->state_mutex); + + spin_lock_init(&mdev->data.work.q_lock); + spin_lock_init(&mdev->meta.work.q_lock); + + spin_lock_init(&mdev->al_lock); + spin_lock_init(&mdev->req_lock); + spin_lock_init(&mdev->peer_seq_lock); + spin_lock_init(&mdev->epoch_lock); + + INIT_LIST_HEAD(&mdev->active_ee); + INIT_LIST_HEAD(&mdev->sync_ee); + INIT_LIST_HEAD(&mdev->done_ee); + INIT_LIST_HEAD(&mdev->read_ee); + INIT_LIST_HEAD(&mdev->net_ee); + INIT_LIST_HEAD(&mdev->resync_reads); + INIT_LIST_HEAD(&mdev->data.work.q); + INIT_LIST_HEAD(&mdev->meta.work.q); + INIT_LIST_HEAD(&mdev->resync_work.list); + INIT_LIST_HEAD(&mdev->unplug_work.list); + INIT_LIST_HEAD(&mdev->md_sync_work.list); + INIT_LIST_HEAD(&mdev->bm_io_work.w.list); + mdev->resync_work.cb = w_resync_inactive; + mdev->unplug_work.cb = w_send_write_hint; + mdev->md_sync_work.cb = w_md_sync; + mdev->bm_io_work.w.cb = w_bitmap_io; + init_timer(&mdev->resync_timer); + init_timer(&mdev->md_sync_timer); + mdev->resync_timer.function = resync_timer_fn; + mdev->resync_timer.data = (unsigned long) mdev; + mdev->md_sync_timer.function = md_sync_timer_fn; + mdev->md_sync_timer.data = (unsigned long) mdev; + + init_waitqueue_head(&mdev->misc_wait); + init_waitqueue_head(&mdev->state_wait); + init_waitqueue_head(&mdev->ee_wait); + init_waitqueue_head(&mdev->al_wait); + init_waitqueue_head(&mdev->seq_wait); + + drbd_thread_init(mdev, &mdev->receiver, drbdd_init); + drbd_thread_init(mdev, &mdev->worker, drbd_worker); + drbd_thread_init(mdev, &mdev->asender, drbd_asender); + + mdev->agreed_pro_version = PRO_VERSION_MAX; + mdev->write_ordering = WO_bio_barrier; + mdev->resync_wenr = LC_FREE; +} + +void drbd_mdev_cleanup(struct drbd_conf *mdev) +{ + if (mdev->receiver.t_state != None) + dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n", + mdev->receiver.t_state); + + /* no need to lock it, I'm the only thread alive */ + if (atomic_read(&mdev->current_epoch->epoch_size) != 0) + dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size)); + mdev->al_writ_cnt = + mdev->bm_writ_cnt = + mdev->read_cnt = + mdev->recv_cnt = + mdev->send_cnt = + mdev->writ_cnt = + mdev->p_size = + mdev->rs_start = + mdev->rs_total = + mdev->rs_failed = + mdev->rs_mark_left = + mdev->rs_mark_time = 0; + D_ASSERT(mdev->net_conf == NULL); + + drbd_set_my_capacity(mdev, 0); + if (mdev->bitmap) { + /* maybe never allocated. */ + drbd_bm_resize(mdev, 0); + drbd_bm_cleanup(mdev); + } + + drbd_free_resources(mdev); + + /* + * currently we drbd_init_ee only on module load, so + * we may do drbd_release_ee only on module unload! + */ + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->net_ee)); + D_ASSERT(list_empty(&mdev->resync_reads)); + D_ASSERT(list_empty(&mdev->data.work.q)); + D_ASSERT(list_empty(&mdev->meta.work.q)); + D_ASSERT(list_empty(&mdev->resync_work.list)); + D_ASSERT(list_empty(&mdev->unplug_work.list)); + +} + + +static void drbd_destroy_mempools(void) +{ + struct page *page; + + while (drbd_pp_pool) { + page = drbd_pp_pool; + drbd_pp_pool = (struct page *)page_private(page); + __free_page(page); + drbd_pp_vacant--; + } + + /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + + if (drbd_ee_mempool) + mempool_destroy(drbd_ee_mempool); + if (drbd_request_mempool) + mempool_destroy(drbd_request_mempool); + if (drbd_ee_cache) + kmem_cache_destroy(drbd_ee_cache); + if (drbd_request_cache) + kmem_cache_destroy(drbd_request_cache); + if (drbd_bm_ext_cache) + kmem_cache_destroy(drbd_bm_ext_cache); + if (drbd_al_ext_cache) + kmem_cache_destroy(drbd_al_ext_cache); + + drbd_ee_mempool = NULL; + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + + return; +} + +static int drbd_create_mempools(void) +{ + struct page *page; + const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; + int i; + + /* prepare our caches and mempools */ + drbd_request_mempool = NULL; + drbd_ee_cache = NULL; + drbd_request_cache = NULL; + drbd_bm_ext_cache = NULL; + drbd_al_ext_cache = NULL; + drbd_pp_pool = NULL; + + /* caches */ + drbd_request_cache = kmem_cache_create( + "drbd_req", sizeof(struct drbd_request), 0, 0, NULL); + if (drbd_request_cache == NULL) + goto Enomem; + + drbd_ee_cache = kmem_cache_create( + "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL); + if (drbd_ee_cache == NULL) + goto Enomem; + + drbd_bm_ext_cache = kmem_cache_create( + "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL); + if (drbd_bm_ext_cache == NULL) + goto Enomem; + + drbd_al_ext_cache = kmem_cache_create( + "drbd_al", sizeof(struct lc_element), 0, 0, NULL); + if (drbd_al_ext_cache == NULL) + goto Enomem; + + /* mempools */ + drbd_request_mempool = mempool_create(number, + mempool_alloc_slab, mempool_free_slab, drbd_request_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + drbd_ee_mempool = mempool_create(number, + mempool_alloc_slab, mempool_free_slab, drbd_ee_cache); + if (drbd_request_mempool == NULL) + goto Enomem; + + /* drbd's page pool */ + spin_lock_init(&drbd_pp_lock); + + for (i = 0; i < number; i++) { + page = alloc_page(GFP_HIGHUSER); + if (!page) + goto Enomem; + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + } + drbd_pp_vacant = number; + + return 0; + +Enomem: + drbd_destroy_mempools(); /* in case we allocated some */ + return -ENOMEM; +} + +static int drbd_notify_sys(struct notifier_block *this, unsigned long code, + void *unused) +{ + /* just so we have it. you never know what interesting things we + * might want to do here some day... + */ + + return NOTIFY_DONE; +} + +static struct notifier_block drbd_notifier = { + .notifier_call = drbd_notify_sys, +}; + +static void drbd_release_ee_lists(struct drbd_conf *mdev) +{ + int rr; + + rr = drbd_release_ee(mdev, &mdev->active_ee); + if (rr) + dev_err(DEV, "%d EEs in active list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->sync_ee); + if (rr) + dev_err(DEV, "%d EEs in sync list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->read_ee); + if (rr) + dev_err(DEV, "%d EEs in read list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->done_ee); + if (rr) + dev_err(DEV, "%d EEs in done list found!\n", rr); + + rr = drbd_release_ee(mdev, &mdev->net_ee); + if (rr) + dev_err(DEV, "%d EEs in net list found!\n", rr); +} + +/* caution. no locking. + * currently only used from module cleanup code. */ +static void drbd_delete_device(unsigned int minor) +{ + struct drbd_conf *mdev = minor_to_mdev(minor); + + if (!mdev) + return; + + /* paranoia asserts */ + if (mdev->open_cnt != 0) + dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, + __FILE__ , __LINE__); + + ERR_IF (!list_empty(&mdev->data.work.q)) { + struct list_head *lp; + list_for_each(lp, &mdev->data.work.q) { + dev_err(DEV, "lp = %p\n", lp); + } + }; + /* end paranoia asserts */ + + del_gendisk(mdev->vdisk); + + /* cleanup stuff that may have been allocated during + * device (re-)configuration or state changes */ + + if (mdev->this_bdev) + bdput(mdev->this_bdev); + + drbd_free_resources(mdev); + + drbd_release_ee_lists(mdev); + + /* should be free'd on disconnect? */ + kfree(mdev->ee_hash); + /* + mdev->ee_hash_s = 0; + mdev->ee_hash = NULL; + */ + + lc_destroy(mdev->act_log); + lc_destroy(mdev->resync); + + kfree(mdev->p_uuid); + /* mdev->p_uuid = NULL; */ + + kfree(mdev->int_dig_out); + kfree(mdev->int_dig_in); + kfree(mdev->int_dig_vv); + + /* cleanup the rest that has been + * allocated from drbd_new_device + * and actually free the mdev itself */ + drbd_free_mdev(mdev); +} + +static void drbd_cleanup(void) +{ + unsigned int i; + + unregister_reboot_notifier(&drbd_notifier); + + drbd_nl_cleanup(); + + if (minor_table) { + if (drbd_proc) + remove_proc_entry("drbd", NULL); + i = minor_count; + while (i--) + drbd_delete_device(i); + drbd_destroy_mempools(); + } + + kfree(minor_table); + + unregister_blkdev(DRBD_MAJOR, "drbd"); + + printk(KERN_INFO "drbd: module cleanup done.\n"); +} + +/** + * drbd_congested() - Callback for pdflush + * @congested_data: User data + * @bdi_bits: Bits pdflush is currently interested in + * + * Returns 1<ldev->backing_bdev); + r = bdi_congested(&q->backing_dev_info, bdi_bits); + put_ldev(mdev); + if (r) + reason = 'b'; + } + + if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->flags)) { + r |= (1 << BDI_async_congested); + reason = reason == 'b' ? 'a' : 'n'; + } + +out: + mdev->congestion_reason = reason; + return r; +} + +struct drbd_conf *drbd_new_device(unsigned int minor) +{ + struct drbd_conf *mdev; + struct gendisk *disk; + struct request_queue *q; + + /* GFP_KERNEL, we are outside of all write-out paths */ + mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL); + if (!mdev) + return NULL; + if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL)) + goto out_no_cpumask; + + mdev->minor = minor; + + drbd_init_set_defaults(mdev); + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + goto out_no_q; + mdev->rq_queue = q; + q->queuedata = mdev; + blk_queue_max_segment_size(q, DRBD_MAX_SEGMENT_SIZE); + + disk = alloc_disk(1); + if (!disk) + goto out_no_disk; + mdev->vdisk = disk; + + set_disk_ro(disk, TRUE); + + disk->queue = q; + disk->major = DRBD_MAJOR; + disk->first_minor = minor; + disk->fops = &drbd_ops; + sprintf(disk->disk_name, "drbd%d", minor); + disk->private_data = mdev; + + mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor)); + /* we have no partitions. we contain only ourselves. */ + mdev->this_bdev->bd_contains = mdev->this_bdev; + + q->backing_dev_info.congested_fn = drbd_congested; + q->backing_dev_info.congested_data = mdev; + + blk_queue_make_request(q, drbd_make_request_26); + blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); + blk_queue_merge_bvec(q, drbd_merge_bvec); + q->queue_lock = &mdev->req_lock; /* needed since we use */ + /* plugging on a queue, that actually has no requests! */ + q->unplug_fn = drbd_unplug_fn; + + mdev->md_io_page = alloc_page(GFP_KERNEL); + if (!mdev->md_io_page) + goto out_no_io_page; + + if (drbd_bm_init(mdev)) + goto out_no_bitmap; + /* no need to lock access, we are still initializing this minor device. */ + if (!tl_init(mdev)) + goto out_no_tl; + + mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL); + if (!mdev->app_reads_hash) + goto out_no_app_reads; + + mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL); + if (!mdev->current_epoch) + goto out_no_epoch; + + INIT_LIST_HEAD(&mdev->current_epoch->list); + mdev->epochs = 1; + + return mdev; + +/* out_whatever_else: + kfree(mdev->current_epoch); */ +out_no_epoch: + kfree(mdev->app_reads_hash); +out_no_app_reads: + tl_cleanup(mdev); +out_no_tl: + drbd_bm_cleanup(mdev); +out_no_bitmap: + __free_page(mdev->md_io_page); +out_no_io_page: + put_disk(disk); +out_no_disk: + blk_cleanup_queue(q); +out_no_q: + free_cpumask_var(mdev->cpu_mask); +out_no_cpumask: + kfree(mdev); + return NULL; +} + +/* counterpart of drbd_new_device. + * last part of drbd_delete_device. */ +void drbd_free_mdev(struct drbd_conf *mdev) +{ + kfree(mdev->current_epoch); + kfree(mdev->app_reads_hash); + tl_cleanup(mdev); + if (mdev->bitmap) /* should no longer be there. */ + drbd_bm_cleanup(mdev); + __free_page(mdev->md_io_page); + put_disk(mdev->vdisk); + blk_cleanup_queue(mdev->rq_queue); + free_cpumask_var(mdev->cpu_mask); + kfree(mdev); +} + + +int __init drbd_init(void) +{ + int err; + + if (sizeof(struct p_handshake) != 80) { + printk(KERN_ERR + "drbd: never change the size or layout " + "of the HandShake packet.\n"); + return -EINVAL; + } + + if (1 > minor_count || minor_count > 255) { + printk(KERN_ERR + "drbd: invalid minor_count (%d)\n", minor_count); +#ifdef MODULE + return -EINVAL; +#else + minor_count = 8; +#endif + } + + err = drbd_nl_init(); + if (err) + return err; + + err = register_blkdev(DRBD_MAJOR, "drbd"); + if (err) { + printk(KERN_ERR + "drbd: unable to register block device major %d\n", + DRBD_MAJOR); + return err; + } + + register_reboot_notifier(&drbd_notifier); + + /* + * allocate all necessary structs + */ + err = -ENOMEM; + + init_waitqueue_head(&drbd_pp_wait); + + drbd_proc = NULL; /* play safe for drbd_cleanup */ + minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count, + GFP_KERNEL); + if (!minor_table) + goto Enomem; + + err = drbd_create_mempools(); + if (err) + goto Enomem; + + drbd_proc = proc_create("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops); + if (!drbd_proc) { + printk(KERN_ERR "drbd: unable to register proc file\n"); + goto Enomem; + } + + rwlock_init(&global_state_lock); + + printk(KERN_INFO "drbd: initialized. " + "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX); + printk(KERN_INFO "drbd: %s\n", drbd_buildtag()); + printk(KERN_INFO "drbd: registered as block device major %d\n", + DRBD_MAJOR); + printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table); + + return 0; /* Success! */ + +Enomem: + drbd_cleanup(); + if (err == -ENOMEM) + /* currently always the case */ + printk(KERN_ERR "drbd: ran out of memory\n"); + else + printk(KERN_ERR "drbd: initialization failure\n"); + return err; +} + +void drbd_free_bc(struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + bd_release(ldev->backing_bdev); + bd_release(ldev->md_bdev); + + fput(ldev->lo_file); + fput(ldev->md_file); + + kfree(ldev); +} + +void drbd_free_sock(struct drbd_conf *mdev) +{ + if (mdev->data.socket) { + kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR); + sock_release(mdev->data.socket); + mdev->data.socket = NULL; + } + if (mdev->meta.socket) { + kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR); + sock_release(mdev->meta.socket); + mdev->meta.socket = NULL; + } +} + + +void drbd_free_resources(struct drbd_conf *mdev) +{ + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = NULL; + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = NULL; + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + crypto_free_hash(mdev->integrity_w_tfm); + mdev->integrity_w_tfm = NULL; + crypto_free_hash(mdev->integrity_r_tfm); + mdev->integrity_r_tfm = NULL; + + drbd_free_sock(mdev); + + __no_warn(local, + drbd_free_bc(mdev->ldev); + mdev->ldev = NULL;); +} + +/* meta data management */ + +struct meta_data_on_disk { + u64 la_size; /* last agreed size. */ + u64 uuid[UI_SIZE]; /* UUIDs. */ + u64 device_uuid; + u64 reserved_u64_1; + u32 flags; /* MDF */ + u32 magic; + u32 md_size_sect; + u32 al_offset; /* offset to this block */ + u32 al_nr_extents; /* important for restoring the AL */ + /* `-- act_log->nr_elements <-- sync_conf.al_extents */ + u32 bm_offset; /* offset to the bitmap, from here */ + u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ + u32 reserved_u32[4]; + +} __packed; + +/** + * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set + * @mdev: DRBD device. + */ +void drbd_md_sync(struct drbd_conf *mdev) +{ + struct meta_data_on_disk *buffer; + sector_t sector; + int i; + + if (!test_and_clear_bit(MD_DIRTY, &mdev->flags)) + return; + del_timer(&mdev->md_sync_timer); + + /* We use here D_FAILED and not D_ATTACHING because we try to write + * metadata even if we detach due to a disk failure! */ + if (!get_ldev_if_state(mdev, D_FAILED)) + return; + + trace_drbd_md_io(mdev, WRITE, mdev->ldev); + + mutex_lock(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + memset(buffer, 0, 512); + + buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); + for (i = UI_CURRENT; i < UI_SIZE; i++) + buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]); + buffer->flags = cpu_to_be32(mdev->ldev->md.flags); + buffer->magic = cpu_to_be32(DRBD_MD_MAGIC); + + buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect); + buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset); + buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements); + buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE); + buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); + + buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); + + D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); + sector = mdev->ldev->md.md_offset; + + if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { + clear_bit(MD_DIRTY, &mdev->flags); + } else { + /* this was a try anyways ... */ + dev_err(DEV, "meta data update failed!\n"); + + drbd_chk_io_error(mdev, 1, TRUE); + } + + /* Update mdev->ldev->md.la_size_sect, + * since we updated it on metadata. */ + mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); + + mutex_unlock(&mdev->md_io_mutex); + put_ldev(mdev); +} + +/** + * drbd_md_read() - Reads in the meta data super block + * @mdev: DRBD device. + * @bdev: Device from which the meta data should be read in. + * + * Return 0 (NO_ERROR) on success, and an enum drbd_ret_codes in case + * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID. + */ +int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + struct meta_data_on_disk *buffer; + int i, rv = NO_ERROR; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) + return ERR_IO_MD_DISK; + + trace_drbd_md_io(mdev, READ, bdev); + + mutex_lock(&mdev->md_io_mutex); + buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + + if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { + /* NOTE: cant do normal error processing here as this is + called BEFORE disk is attached */ + dev_err(DEV, "Error while reading metadata.\n"); + rv = ERR_IO_MD_DISK; + goto err; + } + + if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) { + dev_err(DEV, "Error while reading metadata, magic not found.\n"); + rv = ERR_MD_INVALID; + goto err; + } + if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) { + dev_err(DEV, "unexpected al_offset: %d (expected %d)\n", + be32_to_cpu(buffer->al_offset), bdev->md.al_offset); + rv = ERR_MD_INVALID; + goto err; + } + if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) { + dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n", + be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset); + rv = ERR_MD_INVALID; + goto err; + } + if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) { + dev_err(DEV, "unexpected md_size: %u (expected %u)\n", + be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect); + rv = ERR_MD_INVALID; + goto err; + } + + if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) { + dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n", + be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE); + rv = ERR_MD_INVALID; + goto err; + } + + bdev->md.la_size_sect = be64_to_cpu(buffer->la_size); + for (i = UI_CURRENT; i < UI_SIZE; i++) + bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]); + bdev->md.flags = be32_to_cpu(buffer->flags); + mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); + bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + + if (mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + err: + mutex_unlock(&mdev->md_io_mutex); + put_ldev(mdev); + + return rv; +} + +/** + * drbd_md_mark_dirty() - Mark meta data super block as dirty + * @mdev: DRBD device. + * + * Call this function if you change anything that should be written to + * the meta-data super block. This function sets MD_DIRTY, and starts a + * timer that ensures that within five seconds you have to call drbd_md_sync(). + */ +void drbd_md_mark_dirty(struct drbd_conf *mdev) +{ + set_bit(MD_DIRTY, &mdev->flags); + mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ); +} + + +static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) +{ + int i; + + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) { + mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; + + trace_drbd_uuid(mdev, i+1); + } +} + +void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) +{ + if (idx == UI_CURRENT) { + if (mdev->state.role == R_PRIMARY) + val |= 1; + else + val &= ~((u64)1); + + drbd_set_ed_uuid(mdev, val); + } + + mdev->ldev->md.uuid[idx] = val; + trace_drbd_uuid(mdev, idx); + drbd_md_mark_dirty(mdev); +} + + +void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) +{ + if (mdev->ldev->md.uuid[idx]) { + drbd_uuid_move_history(mdev); + mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; + trace_drbd_uuid(mdev, UI_HISTORY_START); + } + _drbd_uuid_set(mdev, idx, val); +} + +/** + * drbd_uuid_new_current() - Creates a new current UUID + * @mdev: DRBD device. + * + * Creates a new current UUID, and rotates the old current UUID into + * the bitmap slot. Causes an incremental resync upon next connect. + */ +void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) +{ + u64 val; + + dev_info(DEV, "Creating new current UUID\n"); + D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); + mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; + trace_drbd_uuid(mdev, UI_BITMAP); + + get_random_bytes(&val, sizeof(u64)); + _drbd_uuid_set(mdev, UI_CURRENT, val); +} + +void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) +{ + if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0) + return; + + if (val == 0) { + drbd_uuid_move_history(mdev); + mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; + mdev->ldev->md.uuid[UI_BITMAP] = 0; + trace_drbd_uuid(mdev, UI_HISTORY_START); + trace_drbd_uuid(mdev, UI_BITMAP); + } else { + if (mdev->ldev->md.uuid[UI_BITMAP]) + dev_warn(DEV, "bm UUID already set"); + + mdev->ldev->md.uuid[UI_BITMAP] = val; + mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); + + trace_drbd_uuid(mdev, UI_BITMAP); + } + drbd_md_mark_dirty(mdev); +} + +/** + * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @mdev: DRBD device. + * + * Sets all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_set_n_write(struct drbd_conf *mdev) +{ + int rv = -EIO; + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + drbd_md_set_flag(mdev, MDF_FULL_SYNC); + drbd_md_sync(mdev); + drbd_bm_set_all(mdev); + + rv = drbd_bm_write(mdev); + + if (!rv) { + drbd_md_clear_flag(mdev, MDF_FULL_SYNC); + drbd_md_sync(mdev); + } + + put_ldev(mdev); + } + + return rv; +} + +/** + * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() + * @mdev: DRBD device. + * + * Clears all bits in the bitmap and writes the whole bitmap to stable storage. + */ +int drbd_bmio_clear_n_write(struct drbd_conf *mdev) +{ + int rv = -EIO; + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + drbd_bm_clear_all(mdev); + rv = drbd_bm_write(mdev); + put_ldev(mdev); + } + + return rv; +} + +static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct bm_io_work *work = container_of(w, struct bm_io_work, w); + int rv; + + D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0); + + drbd_bm_lock(mdev, work->why); + rv = work->io_fn(mdev); + drbd_bm_unlock(mdev); + + clear_bit(BITMAP_IO, &mdev->flags); + wake_up(&mdev->misc_wait); + + if (work->done) + work->done(mdev, rv); + + clear_bit(BITMAP_IO_QUEUED, &mdev->flags); + work->why = NULL; + + return 1; +} + +/** + * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap + * @mdev: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @done: callback to be called after the bitmap IO was performed + * @why: Descriptive text of the reason for doing the IO + * + * While IO on the bitmap happens we freeze application IO thus we ensure + * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be + * called from worker context. It MUST NOT be used while a previous such + * work is still pending! + */ +void drbd_queue_bitmap_io(struct drbd_conf *mdev, + int (*io_fn)(struct drbd_conf *), + void (*done)(struct drbd_conf *, int), + char *why) +{ + D_ASSERT(current == mdev->worker.task); + + D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags)); + D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); + D_ASSERT(list_empty(&mdev->bm_io_work.w.list)); + if (mdev->bm_io_work.why) + dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n", + why, mdev->bm_io_work.why); + + mdev->bm_io_work.io_fn = io_fn; + mdev->bm_io_work.done = done; + mdev->bm_io_work.why = why; + + set_bit(BITMAP_IO, &mdev->flags); + if (atomic_read(&mdev->ap_bio_cnt) == 0) { + if (list_empty(&mdev->bm_io_work.w.list)) { + set_bit(BITMAP_IO_QUEUED, &mdev->flags); + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } else + dev_err(DEV, "FIXME avoided double queuing bm_io_work\n"); + } +} + +/** + * drbd_bitmap_io() - Does an IO operation on the whole bitmap + * @mdev: DRBD device. + * @io_fn: IO callback to be called when bitmap IO is possible + * @why: Descriptive text of the reason for doing the IO + * + * freezes application IO while that the actual IO operations runs. This + * functions MAY NOT be called from worker context. + */ +int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why) +{ + int rv; + + D_ASSERT(current != mdev->worker.task); + + drbd_suspend_io(mdev); + + drbd_bm_lock(mdev, why); + rv = io_fn(mdev); + drbd_bm_unlock(mdev); + + drbd_resume_io(mdev); + + return rv; +} + +void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local) +{ + if ((mdev->ldev->md.flags & flag) != flag) { + drbd_md_mark_dirty(mdev); + mdev->ldev->md.flags |= flag; + } +} + +void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local) +{ + if ((mdev->ldev->md.flags & flag) != 0) { + drbd_md_mark_dirty(mdev); + mdev->ldev->md.flags &= ~flag; + } +} +int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag) +{ + return (bdev->md.flags & flag) != 0; +} + +static void md_sync_timer_fn(unsigned long data) +{ + struct drbd_conf *mdev = (struct drbd_conf *) data; + + drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work); +} + +static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n"); + drbd_md_sync(mdev); + + return 1; +} + +#ifdef CONFIG_DRBD_FAULT_INJECTION +/* Fault insertion support including random number generator shamelessly + * stolen from kernel/rcutorture.c */ +struct fault_random_state { + unsigned long state; + unsigned long count; +}; + +#define FAULT_RANDOM_MULT 39916801 /* prime */ +#define FAULT_RANDOM_ADD 479001701 /* prime */ +#define FAULT_RANDOM_REFRESH 10000 + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from get_random_bytes(). + */ +static unsigned long +_drbd_fault_random(struct fault_random_state *rsp) +{ + long refresh; + + if (--rsp->count < 0) { + get_random_bytes(&refresh, sizeof(refresh)); + rsp->state += refresh; + rsp->count = FAULT_RANDOM_REFRESH; + } + rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD; + return swahw32(rsp->state); +} + +static char * +_drbd_fault_str(unsigned int type) { + static char *_faults[] = { + [DRBD_FAULT_MD_WR] = "Meta-data write", + [DRBD_FAULT_MD_RD] = "Meta-data read", + [DRBD_FAULT_RS_WR] = "Resync write", + [DRBD_FAULT_RS_RD] = "Resync read", + [DRBD_FAULT_DT_WR] = "Data write", + [DRBD_FAULT_DT_RD] = "Data read", + [DRBD_FAULT_DT_RA] = "Data read ahead", + [DRBD_FAULT_BM_ALLOC] = "BM allocation", + [DRBD_FAULT_AL_EE] = "EE allocation" + }; + + return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**"; +} + +unsigned int +_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type) +{ + static struct fault_random_state rrs = {0, 0}; + + unsigned int ret = ( + (fault_devs == 0 || + ((1 << mdev_to_minor(mdev)) & fault_devs) != 0) && + (((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate)); + + if (ret) { + fault_count++; + + if (printk_ratelimit()) + dev_warn(DEV, "***Simulating %s failure\n", + _drbd_fault_str(type)); + } + + return ret; +} +#endif + +const char *drbd_buildtag(void) +{ + /* DRBD built from external sources has here a reference to the + git hash of the source code. */ + + static char buildtag[38] = "\0uilt-in"; + + if (buildtag[0] == 0) { +#ifdef CONFIG_MODULES + if (THIS_MODULE != NULL) + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); + else +#endif + buildtag[0] = 'b'; + } + + return buildtag; +} + +module_init(drbd_init) +module_exit(drbd_cleanup) + +/* For drbd_tracing: */ +EXPORT_SYMBOL(drbd_conn_str); +EXPORT_SYMBOL(drbd_role_str); +EXPORT_SYMBOL(drbd_disk_str); +EXPORT_SYMBOL(drbd_set_st_err_str); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c new file mode 100644 index 00000000000..1927acefe23 --- /dev/null +++ b/drivers/block/drbd/drbd_nl.c @@ -0,0 +1,2365 @@ +/* + drbd_nl.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" +#include "drbd_tracing.h" +#include "drbd_wrappers.h" +#include +#include +#include + +static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int); +static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *); +static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *); + +/* see get_sb_bdev and bd_claim */ +static char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; + +/* Generate the tag_list to struct functions */ +#define NL_PACKET(name, number, fields) \ +static int name ## _from_tags(struct drbd_conf *mdev, \ + unsigned short *tags, struct name *arg) __attribute__ ((unused)); \ +static int name ## _from_tags(struct drbd_conf *mdev, \ + unsigned short *tags, struct name *arg) \ +{ \ + int tag; \ + int dlen; \ + \ + while ((tag = get_unaligned(tags++)) != TT_END) { \ + dlen = get_unaligned(tags++); \ + switch (tag_number(tag)) { \ + fields \ + default: \ + if (tag & T_MANDATORY) { \ + dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \ + return 0; \ + } \ + } \ + tags = (unsigned short *)((char *)tags + dlen); \ + } \ + return 1; \ +} +#define NL_INTEGER(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \ + arg->member = get_unaligned((int *)(tags)); \ + break; +#define NL_INT64(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \ + arg->member = get_unaligned((u64 *)(tags)); \ + break; +#define NL_BIT(pn, pr, member) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \ + arg->member = *(char *)(tags) ? 1 : 0; \ + break; +#define NL_STRING(pn, pr, member, len) \ + case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \ + if (dlen > len) { \ + dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \ + #member, dlen, (unsigned int)len); \ + return 0; \ + } \ + arg->member ## _len = dlen; \ + memcpy(arg->member, tags, min_t(size_t, dlen, len)); \ + break; +#include "linux/drbd_nl.h" + +/* Generate the struct to tag_list functions */ +#define NL_PACKET(name, number, fields) \ +static unsigned short* \ +name ## _to_tags(struct drbd_conf *mdev, \ + struct name *arg, unsigned short *tags) __attribute__ ((unused)); \ +static unsigned short* \ +name ## _to_tags(struct drbd_conf *mdev, \ + struct name *arg, unsigned short *tags) \ +{ \ + fields \ + return tags; \ +} + +#define NL_INTEGER(pn, pr, member) \ + put_unaligned(pn | pr | TT_INTEGER, tags++); \ + put_unaligned(sizeof(int), tags++); \ + put_unaligned(arg->member, (int *)tags); \ + tags = (unsigned short *)((char *)tags+sizeof(int)); +#define NL_INT64(pn, pr, member) \ + put_unaligned(pn | pr | TT_INT64, tags++); \ + put_unaligned(sizeof(u64), tags++); \ + put_unaligned(arg->member, (u64 *)tags); \ + tags = (unsigned short *)((char *)tags+sizeof(u64)); +#define NL_BIT(pn, pr, member) \ + put_unaligned(pn | pr | TT_BIT, tags++); \ + put_unaligned(sizeof(char), tags++); \ + *(char *)tags = arg->member; \ + tags = (unsigned short *)((char *)tags+sizeof(char)); +#define NL_STRING(pn, pr, member, len) \ + put_unaligned(pn | pr | TT_STRING, tags++); \ + put_unaligned(arg->member ## _len, tags++); \ + memcpy(tags, arg->member, arg->member ## _len); \ + tags = (unsigned short *)((char *)tags + arg->member ## _len); +#include "linux/drbd_nl.h" + +void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name); +void drbd_nl_send_reply(struct cn_msg *, int); + +int drbd_khelper(struct drbd_conf *mdev, char *cmd) +{ + char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL, /* Will be set to address family */ + NULL, /* Will be set to address */ + NULL }; + + char mb[12], af[20], ad[60], *afs; + char *argv[] = {usermode_helper, cmd, mb, NULL }; + int ret; + + snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); + + if (get_net_conf(mdev)) { + switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) { + case AF_INET6: + afs = "ipv6"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6", + &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr); + break; + case AF_INET: + afs = "ipv4"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); + break; + default: + afs = "ssocks"; + snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4", + &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr); + } + snprintf(af, 20, "DRBD_PEER_AF=%s", afs); + envp[3]=af; + envp[4]=ad; + put_net_conf(mdev); + } + + dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); + + drbd_bcast_ev_helper(mdev, cmd); + ret = call_usermodehelper(usermode_helper, argv, envp, 1); + if (ret) + dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + else + dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", + usermode_helper, cmd, mb, + (ret >> 8) & 0xff, ret); + + if (ret < 0) /* Ignore any ERRNOs we got. */ + ret = 0; + + return ret; +} + +enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev) +{ + char *ex_to_string; + int r; + enum drbd_disk_state nps; + enum drbd_fencing_p fp; + + D_ASSERT(mdev->state.pdsk == D_UNKNOWN); + + if (get_ldev_if_state(mdev, D_CONSISTENT)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } else { + dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n"); + return mdev->state.pdsk; + } + + if (fp == FP_STONITH) + _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE); + + r = drbd_khelper(mdev, "fence-peer"); + + switch ((r>>8) & 0xff) { + case 3: /* peer is inconsistent */ + ex_to_string = "peer is inconsistent or worse"; + nps = D_INCONSISTENT; + break; + case 4: /* peer got outdated, or was already outdated */ + ex_to_string = "peer was fenced"; + nps = D_OUTDATED; + break; + case 5: /* peer was down */ + if (mdev->state.disk == D_UP_TO_DATE) { + /* we will(have) create(d) a new UUID anyways... */ + ex_to_string = "peer is unreachable, assumed to be dead"; + nps = D_OUTDATED; + } else { + ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; + nps = mdev->state.pdsk; + } + break; + case 6: /* Peer is primary, voluntarily outdate myself. + * This is useful when an unconnected R_SECONDARY is asked to + * become R_PRIMARY, but finds the other peer being active. */ + ex_to_string = "peer is active"; + dev_warn(DEV, "Peer is primary, outdating myself.\n"); + nps = D_UNKNOWN; + _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE); + break; + case 7: + if (fp != FP_STONITH) + dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n"); + ex_to_string = "peer was stonithed"; + nps = D_OUTDATED; + break; + default: + /* The script is broken ... */ + nps = D_UNKNOWN; + dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff); + return nps; + } + + dev_info(DEV, "fence-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); + return nps; +} + + +int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) +{ + const int max_tries = 4; + int r = 0; + int try = 0; + int forced = 0; + union drbd_state mask, val; + enum drbd_disk_state nps; + + if (new_role == R_PRIMARY) + request_ping(mdev); /* Detect a dead peer ASAP */ + + mutex_lock(&mdev->state_mutex); + + mask.i = 0; mask.role = R_MASK; + val.i = 0; val.role = new_role; + + while (try++ < max_tries) { + r = _drbd_request_state(mdev, mask, val, CS_WAIT_COMPLETE); + + /* in case we first succeeded to outdate, + * but now suddenly could establish a connection */ + if (r == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) { + val.pdsk = 0; + mask.pdsk = 0; + continue; + } + + if (r == SS_NO_UP_TO_DATE_DISK && force && + (mdev->state.disk == D_INCONSISTENT || + mdev->state.disk == D_OUTDATED)) { + mask.disk = D_MASK; + val.disk = D_UP_TO_DATE; + forced = 1; + continue; + } + + if (r == SS_NO_UP_TO_DATE_DISK && + mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) { + D_ASSERT(mdev->state.pdsk == D_UNKNOWN); + nps = drbd_try_outdate_peer(mdev); + + if (nps == D_OUTDATED || nps == D_INCONSISTENT) { + val.disk = D_UP_TO_DATE; + mask.disk = D_MASK; + } + + val.pdsk = nps; + mask.pdsk = D_MASK; + + continue; + } + + if (r == SS_NOTHING_TO_DO) + goto fail; + if (r == SS_PRIMARY_NOP && mask.pdsk == 0) { + nps = drbd_try_outdate_peer(mdev); + + if (force && nps > D_OUTDATED) { + dev_warn(DEV, "Forced into split brain situation!\n"); + nps = D_OUTDATED; + } + + mask.pdsk = D_MASK; + val.pdsk = nps; + + continue; + } + if (r == SS_TWO_PRIMARIES) { + /* Maybe the peer is detected as dead very soon... + retry at most once more in this case. */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); + if (try < max_tries) + try = max_tries - 1; + continue; + } + if (r < SS_SUCCESS) { + r = _drbd_request_state(mdev, mask, val, + CS_VERBOSE + CS_WAIT_COMPLETE); + if (r < SS_SUCCESS) + goto fail; + } + break; + } + + if (r < SS_SUCCESS) + goto fail; + + if (forced) + dev_warn(DEV, "Forced to consider local data as UpToDate!\n"); + + /* Wait until nothing is on the fly :) */ + wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); + + if (new_role == R_SECONDARY) { + set_disk_ro(mdev->vdisk, TRUE); + if (get_ldev(mdev)) { + mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + put_ldev(mdev); + } + } else { + if (get_net_conf(mdev)) { + mdev->net_conf->want_lose = 0; + put_net_conf(mdev); + } + set_disk_ro(mdev->vdisk, FALSE); + if (get_ldev(mdev)) { + if (((mdev->state.conn < C_CONNECTED || + mdev->state.pdsk <= D_FAILED) + && mdev->ldev->md.uuid[UI_BITMAP] == 0) || forced) + drbd_uuid_new_current(mdev); + + mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; + put_ldev(mdev); + } + } + + if ((new_role == R_SECONDARY) && get_ldev(mdev)) { + drbd_al_to_on_disk_bm(mdev); + put_ldev(mdev); + } + + if (mdev->state.conn >= C_WF_REPORT_PARAMS) { + /* if this was forced, we should consider sync */ + if (forced) + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + + drbd_md_sync(mdev); + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); + fail: + mutex_unlock(&mdev->state_mutex); + return r; +} + + +static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct primary primary_args; + + memset(&primary_args, 0, sizeof(struct primary)); + if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; + } + + reply->ret_code = + drbd_set_role(mdev, R_PRIMARY, primary_args.overwrite_peer); + + return 0; +} + +static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0); + + return 0; +} + +/* initializes the md.*_offset members, so we are able to find + * the on disk meta data */ +static void drbd_md_set_sector_offsets(struct drbd_conf *mdev, + struct drbd_backing_dev *bdev) +{ + sector_t md_size_sect = 0; + switch (bdev->dc.meta_dev_idx) { + default: + /* v07 style fixed size indexed meta data */ + bdev->md.md_size_sect = MD_RESERVED_SECT; + bdev->md.md_offset = drbd_md_ss__(mdev, bdev); + bdev->md.al_offset = MD_AL_OFFSET; + bdev->md.bm_offset = MD_BM_OFFSET; + break; + case DRBD_MD_INDEX_FLEX_EXT: + /* just occupy the full device; unit: sectors */ + bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev); + bdev->md.md_offset = 0; + bdev->md.al_offset = MD_AL_OFFSET; + bdev->md.bm_offset = MD_BM_OFFSET; + break; + case DRBD_MD_INDEX_INTERNAL: + case DRBD_MD_INDEX_FLEX_INT: + bdev->md.md_offset = drbd_md_ss__(mdev, bdev); + /* al size is still fixed */ + bdev->md.al_offset = -MD_AL_MAX_SIZE; + /* we need (slightly less than) ~ this much bitmap sectors: */ + md_size_sect = drbd_get_capacity(bdev->backing_bdev); + md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT); + md_size_sect = BM_SECT_TO_EXT(md_size_sect); + md_size_sect = ALIGN(md_size_sect, 8); + + /* plus the "drbd meta data super block", + * and the activity log; */ + md_size_sect += MD_BM_OFFSET; + + bdev->md.md_size_sect = md_size_sect; + /* bitmap offset is adjusted by 'super' block size */ + bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET; + break; + } +} + +char *ppsize(char *buf, unsigned long long size) +{ + /* Needs 9 bytes at max. */ + static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' }; + int base = 0; + while (size >= 10000) { + /* shift + round */ + size = (size >> 10) + !!(size & (1<<9)); + base++; + } + sprintf(buf, "%lu %cB", (long)size, units[base]); + + return buf; +} + +/* there is still a theoretical deadlock when called from receiver + * on an D_INCONSISTENT R_PRIMARY: + * remote READ does inc_ap_bio, receiver would need to receive answer + * packet from remote to dec_ap_bio again. + * receiver receive_sizes(), comes here, + * waits for ap_bio_cnt == 0. -> deadlock. + * but this cannot happen, actually, because: + * R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable + * (not connected, or bad/no disk on peer): + * see drbd_fail_request_early, ap_bio_cnt is zero. + * R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET: + * peer may not initiate a resize. + */ +void drbd_suspend_io(struct drbd_conf *mdev) +{ + set_bit(SUSPEND_IO, &mdev->flags); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); +} + +void drbd_resume_io(struct drbd_conf *mdev) +{ + clear_bit(SUSPEND_IO, &mdev->flags); + wake_up(&mdev->misc_wait); +} + +/** + * drbd_determine_dev_size() - Sets the right device size obeying all constraints + * @mdev: DRBD device. + * + * Returns 0 on success, negative return values indicate errors. + * You should call drbd_md_sync() after calling this function. + */ +enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev) __must_hold(local) +{ + sector_t prev_first_sect, prev_size; /* previous meta location */ + sector_t la_size; + sector_t size; + char ppb[10]; + + int md_moved, la_size_changed; + enum determine_dev_size rv = unchanged; + + /* race: + * application request passes inc_ap_bio, + * but then cannot get an AL-reference. + * this function later may wait on ap_bio_cnt == 0. -> deadlock. + * + * to avoid that: + * Suspend IO right here. + * still lock the act_log to not trigger ASSERTs there. + */ + drbd_suspend_io(mdev); + + /* no wait necessary anymore, actually we could assert that */ + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + + prev_first_sect = drbd_md_first_sector(mdev->ldev); + prev_size = mdev->ldev->md.md_size_sect; + la_size = mdev->ldev->md.la_size_sect; + + /* TODO: should only be some assert here, not (re)init... */ + drbd_md_set_sector_offsets(mdev, mdev->ldev); + + size = drbd_new_dev_size(mdev, mdev->ldev); + + if (drbd_get_capacity(mdev->this_bdev) != size || + drbd_bm_capacity(mdev) != size) { + int err; + err = drbd_bm_resize(mdev, size); + if (unlikely(err)) { + /* currently there is only one error: ENOMEM! */ + size = drbd_bm_capacity(mdev)>>1; + if (size == 0) { + dev_err(DEV, "OUT OF MEMORY! " + "Could not allocate bitmap!\n"); + } else { + dev_err(DEV, "BM resizing failed. " + "Leaving size unchanged at size = %lu KB\n", + (unsigned long)size); + } + rv = dev_size_error; + } + /* racy, see comments above. */ + drbd_set_my_capacity(mdev, size); + mdev->ldev->md.la_size_sect = size; + dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), + (unsigned long long)size>>1); + } + if (rv == dev_size_error) + goto out; + + la_size_changed = (la_size != mdev->ldev->md.la_size_sect); + + md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev) + || prev_size != mdev->ldev->md.md_size_sect; + + if (la_size_changed || md_moved) { + drbd_al_shrink(mdev); /* All extents inactive. */ + dev_info(DEV, "Writing the whole bitmap, %s\n", + la_size_changed && md_moved ? "size changed and md moved" : + la_size_changed ? "size changed" : "md moved"); + rv = drbd_bitmap_io(mdev, &drbd_bm_write, "size changed"); /* does drbd_resume_io() ! */ + drbd_md_mark_dirty(mdev); + } + + if (size > la_size) + rv = grew; + if (size < la_size) + rv = shrunk; +out: + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + drbd_resume_io(mdev); + + return rv; +} + +sector_t +drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) +{ + sector_t p_size = mdev->p_size; /* partner's disk size. */ + sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */ + sector_t m_size; /* my size */ + sector_t u_size = bdev->dc.disk_size; /* size requested by user. */ + sector_t size = 0; + + m_size = drbd_get_max_capacity(bdev); + + if (p_size && m_size) { + size = min_t(sector_t, p_size, m_size); + } else { + if (la_size) { + size = la_size; + if (m_size && m_size < size) + size = m_size; + if (p_size && p_size < size) + size = p_size; + } else { + if (m_size) + size = m_size; + if (p_size) + size = p_size; + } + } + + if (size == 0) + dev_err(DEV, "Both nodes diskless!\n"); + + if (u_size) { + if (u_size > size) + dev_err(DEV, "Requested disk size is too big (%lu > %lu)\n", + (unsigned long)u_size>>1, (unsigned long)size>>1); + else + size = u_size; + } + + return size; +} + +/** + * drbd_check_al_size() - Ensures that the AL is of the right size + * @mdev: DRBD device. + * + * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation + * failed, and 0 on success. You should call drbd_md_sync() after you called + * this function. + */ +static int drbd_check_al_size(struct drbd_conf *mdev) +{ + struct lru_cache *n, *t; + struct lc_element *e; + unsigned int in_use; + int i; + + ERR_IF(mdev->sync_conf.al_extents < 7) + mdev->sync_conf.al_extents = 127; + + if (mdev->act_log && + mdev->act_log->nr_elements == mdev->sync_conf.al_extents) + return 0; + + in_use = 0; + t = mdev->act_log; + n = lc_create("act_log", drbd_al_ext_cache, + mdev->sync_conf.al_extents, sizeof(struct lc_element), 0); + + if (n == NULL) { + dev_err(DEV, "Cannot allocate act_log lru!\n"); + return -ENOMEM; + } + spin_lock_irq(&mdev->al_lock); + if (t) { + for (i = 0; i < t->nr_elements; i++) { + e = lc_element_by_index(t, i); + if (e->refcnt) + dev_err(DEV, "refcnt(%d)==%d\n", + e->lc_number, e->refcnt); + in_use += e->refcnt; + } + } + if (!in_use) + mdev->act_log = n; + spin_unlock_irq(&mdev->al_lock); + if (in_use) { + dev_err(DEV, "Activity log still in use!\n"); + lc_destroy(n); + return -EBUSY; + } else { + if (t) + lc_destroy(t); + } + drbd_md_mark_dirty(mdev); /* we changed mdev->act_log->nr_elemens */ + return 0; +} + +void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __must_hold(local) +{ + struct request_queue * const q = mdev->rq_queue; + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; + int max_segments = mdev->ldev->dc.max_bio_bvecs; + + if (b->merge_bvec_fn && !mdev->ldev->dc.use_bmbv) + max_seg_s = PAGE_SIZE; + + max_seg_s = min(queue_max_sectors(b) * queue_logical_block_size(b), max_seg_s); + + blk_queue_max_sectors(q, max_seg_s >> 9); + blk_queue_max_phys_segments(q, max_segments ? max_segments : MAX_PHYS_SEGMENTS); + blk_queue_max_hw_segments(q, max_segments ? max_segments : MAX_HW_SEGMENTS); + blk_queue_max_segment_size(q, max_seg_s); + blk_queue_logical_block_size(q, 512); + blk_queue_segment_boundary(q, PAGE_SIZE-1); + blk_stack_limits(&q->limits, &b->limits, 0); + + if (b->merge_bvec_fn) + dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n", + b->merge_bvec_fn); + dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q)); + + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } +} + +/* serialize deconfig (worker exiting, doing cleanup) + * and reconfig (drbdsetup disk, drbdsetup net) + * + * wait for a potentially exiting worker, then restart it, + * or start a new one. + */ +static void drbd_reconfig_start(struct drbd_conf *mdev) +{ + wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags)); + wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); + drbd_thread_start(&mdev->worker); +} + +/* if still unconfigured, stops worker again. + * if configured now, clears CONFIG_PENDING. + * wakes potential waiters */ +static void drbd_reconfig_done(struct drbd_conf *mdev) +{ + spin_lock_irq(&mdev->req_lock); + if (mdev->state.disk == D_DISKLESS && + mdev->state.conn == C_STANDALONE && + mdev->state.role == R_SECONDARY) { + set_bit(DEVICE_DYING, &mdev->flags); + drbd_thread_stop_nowait(&mdev->worker); + } else + clear_bit(CONFIG_PENDING, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + wake_up(&mdev->state_wait); +} + +/* does always return 0; + * interesting return code is in reply->ret_code */ +static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + enum drbd_ret_codes retcode; + enum determine_dev_size dd; + sector_t max_possible_sectors; + sector_t min_md_device_sectors; + struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ + struct inode *inode, *inode2; + struct lru_cache *resync_lru = NULL; + union drbd_state ns, os; + int rv; + int cp_discovered = 0; + int logical_block_size; + + drbd_reconfig_start(mdev); + + /* if you want to reconfigure, please tear down first */ + if (mdev->state.disk > D_DISKLESS) { + retcode = ERR_DISK_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, cqueue thread context */ + nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); + if (!nbc) { + retcode = ERR_NOMEM; + goto fail; + } + + nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF; + nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF; + nbc->dc.fencing = DRBD_FENCING_DEF; + nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF; + + if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) { + retcode = ERR_MD_IDX_INVALID; + goto fail; + } + + nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0); + if (IS_ERR(nbc->lo_file)) { + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev, + PTR_ERR(nbc->lo_file)); + nbc->lo_file = NULL; + retcode = ERR_OPEN_DISK; + goto fail; + } + + inode = nbc->lo_file->f_dentry->d_inode; + + if (!S_ISBLK(inode->i_mode)) { + retcode = ERR_DISK_NOT_BDEV; + goto fail; + } + + nbc->md_file = filp_open(nbc->dc.meta_dev, O_RDWR, 0); + if (IS_ERR(nbc->md_file)) { + dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev, + PTR_ERR(nbc->md_file)); + nbc->md_file = NULL; + retcode = ERR_OPEN_MD_DISK; + goto fail; + } + + inode2 = nbc->md_file->f_dentry->d_inode; + + if (!S_ISBLK(inode2->i_mode)) { + retcode = ERR_MD_NOT_BDEV; + goto fail; + } + + nbc->backing_bdev = inode->i_bdev; + if (bd_claim(nbc->backing_bdev, mdev)) { + printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", + nbc->backing_bdev, mdev, + nbc->backing_bdev->bd_holder, + nbc->backing_bdev->bd_contains->bd_holder, + nbc->backing_bdev->bd_holders); + retcode = ERR_BDCLAIM_DISK; + goto fail; + } + + resync_lru = lc_create("resync", drbd_bm_ext_cache, + 61, sizeof(struct bm_extent), + offsetof(struct bm_extent, lce)); + if (!resync_lru) { + retcode = ERR_NOMEM; + goto release_bdev_fail; + } + + /* meta_dev_idx >= 0: external fixed size, + * possibly multiple drbd sharing one meta device. + * TODO in that case, paranoia check that [md_bdev, meta_dev_idx] is + * not yet used by some other drbd minor! + * (if you use drbd.conf + drbdadm, + * that should check it for you already; but if you don't, or someone + * fooled it, we need to double check here) */ + nbc->md_bdev = inode2->i_bdev; + if (bd_claim(nbc->md_bdev, (nbc->dc.meta_dev_idx < 0) ? (void *)mdev + : (void *) drbd_m_holder)) { + retcode = ERR_BDCLAIM_MD_DISK; + goto release_bdev_fail; + } + + if ((nbc->backing_bdev == nbc->md_bdev) != + (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL || + nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) { + retcode = ERR_MD_IDX_INVALID; + goto release_bdev2_fail; + } + + /* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */ + drbd_md_set_sector_offsets(mdev, nbc); + + if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { + dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", + (unsigned long long) drbd_get_max_capacity(nbc), + (unsigned long long) nbc->dc.disk_size); + retcode = ERR_DISK_TO_SMALL; + goto release_bdev2_fail; + } + + if (nbc->dc.meta_dev_idx < 0) { + max_possible_sectors = DRBD_MAX_SECTORS_FLEX; + /* at least one MB, otherwise it does not make sense */ + min_md_device_sectors = (2<<10); + } else { + max_possible_sectors = DRBD_MAX_SECTORS; + min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); + } + + if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors) + dev_warn(DEV, "truncating very big lower level device " + "to currently maximum possible %llu sectors\n", + (unsigned long long) max_possible_sectors); + + if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { + retcode = ERR_MD_DISK_TO_SMALL; + dev_warn(DEV, "refusing attach: md-device too small, " + "at least %llu sectors needed for this meta-disk type\n", + (unsigned long long) min_md_device_sectors); + goto release_bdev2_fail; + } + + /* Make sure the new disk is big enough + * (we may currently be R_PRIMARY with no local disk...) */ + if (drbd_get_max_capacity(nbc) < + drbd_get_capacity(mdev->this_bdev)) { + retcode = ERR_DISK_TO_SMALL; + goto release_bdev2_fail; + } + + nbc->known_size = drbd_get_capacity(nbc->backing_bdev); + + drbd_suspend_io(mdev); + /* also wait for the last barrier ack. */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); + /* and for any other previously queued work */ + drbd_flush_workqueue(mdev); + + retcode = _drbd_request_state(mdev, NS(disk, D_ATTACHING), CS_VERBOSE); + drbd_resume_io(mdev); + if (retcode < SS_SUCCESS) + goto release_bdev2_fail; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) + goto force_diskless; + + drbd_md_set_sector_offsets(mdev, nbc); + + if (!mdev->bitmap) { + if (drbd_bm_init(mdev)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + } + + retcode = drbd_md_read(mdev, nbc); + if (retcode != NO_ERROR) + goto force_diskless_dec; + + if (mdev->state.conn < C_CONNECTED && + mdev->state.role == R_PRIMARY && + (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) { + dev_err(DEV, "Can only attach to data with current UUID=%016llX\n", + (unsigned long long)mdev->ed_uuid); + retcode = ERR_DATA_NOT_CURRENT; + goto force_diskless_dec; + } + + /* Since we are diskless, fix the activity log first... */ + if (drbd_check_al_size(mdev)) { + retcode = ERR_NOMEM; + goto force_diskless_dec; + } + + /* Prevent shrinking of consistent devices ! */ + if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && + drbd_new_dev_size(mdev, nbc) < nbc->md.la_size_sect) { + dev_warn(DEV, "refusing to truncate a consistent device\n"); + retcode = ERR_DISK_TO_SMALL; + goto force_diskless_dec; + } + + if (!drbd_al_read_log(mdev, nbc)) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + + /* allocate a second IO page if logical_block_size != 512 */ + logical_block_size = bdev_logical_block_size(nbc->md_bdev); + if (logical_block_size == 0) + logical_block_size = MD_SECTOR_SIZE; + + if (logical_block_size != MD_SECTOR_SIZE) { + if (!mdev->md_io_tmpp) { + struct page *page = alloc_page(GFP_NOIO); + if (!page) + goto force_diskless_dec; + + dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n", + logical_block_size, MD_SECTOR_SIZE); + dev_warn(DEV, "Workaround engaged (has performance impact).\n"); + + mdev->md_io_tmpp = page; + } + } + + /* Reset the "barriers don't work" bits here, then force meta data to + * be written, to ensure we determine if barriers are supported. */ + if (nbc->dc.no_md_flush) + set_bit(MD_NO_BARRIER, &mdev->flags); + else + clear_bit(MD_NO_BARRIER, &mdev->flags); + + /* Point of no return reached. + * Devices and memory are no longer released by error cleanup below. + * now mdev takes over responsibility, and the state engine should + * clean it up somewhere. */ + D_ASSERT(mdev->ldev == NULL); + mdev->ldev = nbc; + mdev->resync = resync_lru; + nbc = NULL; + resync_lru = NULL; + + mdev->write_ordering = WO_bio_barrier; + drbd_bump_write_ordering(mdev, WO_bio_barrier); + + if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY)) + set_bit(CRASHED_PRIMARY, &mdev->flags); + else + clear_bit(CRASHED_PRIMARY, &mdev->flags); + + if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) { + set_bit(CRASHED_PRIMARY, &mdev->flags); + cp_discovered = 1; + } + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + mdev->read_cnt = 0; + mdev->writ_cnt = 0; + + drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE); + + /* If I am currently not R_PRIMARY, + * but meta data primary indicator is set, + * I just now recover from a hard crash, + * and have been R_PRIMARY before that crash. + * + * Now, if I had no connection before that crash + * (have been degraded R_PRIMARY), chances are that + * I won't find my peer now either. + * + * In that case, and _only_ in that case, + * we use the degr-wfc-timeout instead of the default, + * so we can automatically recover from a crash of a + * degraded but active "cluster" after a certain timeout. + */ + clear_bit(USE_DEGR_WFC_T, &mdev->flags); + if (mdev->state.role != R_PRIMARY && + drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) && + !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) + set_bit(USE_DEGR_WFC_T, &mdev->flags); + + dd = drbd_determin_dev_size(mdev); + if (dd == dev_size_error) { + retcode = ERR_NOMEM_BITMAP; + goto force_diskless_dec; + } else if (dd == grew) + set_bit(RESYNC_AFTER_NEG, &mdev->flags); + + if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) { + dev_info(DEV, "Assuming that all blocks are out of sync " + "(aka FullSync)\n"); + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from attaching")) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } else { + if (drbd_bitmap_io(mdev, &drbd_bm_read, "read from attaching") < 0) { + retcode = ERR_IO_MD_DISK; + goto force_diskless_dec; + } + } + + if (cp_discovered) { + drbd_al_apply_to_bm(mdev); + drbd_al_to_on_disk_bm(mdev); + } + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + ns.i = os.i; + /* If MDF_CONSISTENT is not set go into inconsistent state, + otherwise investigate MDF_WasUpToDate... + If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state, + otherwise into D_CONSISTENT state. + */ + if (drbd_md_test_flag(mdev->ldev, MDF_CONSISTENT)) { + if (drbd_md_test_flag(mdev->ldev, MDF_WAS_UP_TO_DATE)) + ns.disk = D_CONSISTENT; + else + ns.disk = D_OUTDATED; + } else { + ns.disk = D_INCONSISTENT; + } + + if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED)) + ns.pdsk = D_OUTDATED; + + if ( ns.disk == D_CONSISTENT && + (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE)) + ns.disk = D_UP_TO_DATE; + + /* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND, + MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before + this point, because drbd_request_state() modifies these + flags. */ + + /* In case we are C_CONNECTED postpone any decision on the new disk + state after the negotiation phase. */ + if (mdev->state.conn == C_CONNECTED) { + mdev->new_state_tmp.i = ns.i; + ns.i = os.i; + ns.disk = D_NEGOTIATING; + } + + rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) + goto force_diskless_dec; + + if (mdev->state.role == R_PRIMARY) + mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1; + else + mdev->ldev->md.uuid[UI_CURRENT] &= ~(u64)1; + + drbd_md_mark_dirty(mdev); + drbd_md_sync(mdev); + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); + put_ldev(mdev); + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; + + force_diskless_dec: + put_ldev(mdev); + force_diskless: + drbd_force_state(mdev, NS(disk, D_DISKLESS)); + drbd_md_sync(mdev); + release_bdev2_fail: + if (nbc) + bd_release(nbc->md_bdev); + release_bdev_fail: + if (nbc) + bd_release(nbc->backing_bdev); + fail: + if (nbc) { + if (nbc->lo_file) + fput(nbc->lo_file); + if (nbc->md_file) + fput(nbc->md_file); + kfree(nbc); + } + lc_destroy(resync_lru); + + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; +} + +static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); + return 0; +} + +static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int i, ns; + enum drbd_ret_codes retcode; + struct net_conf *new_conf = NULL; + struct crypto_hash *tfm = NULL; + struct crypto_hash *integrity_w_tfm = NULL; + struct crypto_hash *integrity_r_tfm = NULL; + struct hlist_head *new_tl_hash = NULL; + struct hlist_head *new_ee_hash = NULL; + struct drbd_conf *odev; + char hmac_name[CRYPTO_MAX_ALG_NAME]; + void *int_dig_out = NULL; + void *int_dig_in = NULL; + void *int_dig_vv = NULL; + struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr; + + drbd_reconfig_start(mdev); + + if (mdev->state.conn > C_STANDALONE) { + retcode = ERR_NET_CONFIGURED; + goto fail; + } + + /* allocation not in the IO path, cqueue thread context */ + new_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL); + if (!new_conf) { + retcode = ERR_NOMEM; + goto fail; + } + + memset(new_conf, 0, sizeof(struct net_conf)); + new_conf->timeout = DRBD_TIMEOUT_DEF; + new_conf->try_connect_int = DRBD_CONNECT_INT_DEF; + new_conf->ping_int = DRBD_PING_INT_DEF; + new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF; + new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF; + new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF; + new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF; + new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF; + new_conf->ko_count = DRBD_KO_COUNT_DEF; + new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF; + new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF; + new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF; + new_conf->want_lose = 0; + new_conf->two_primaries = 0; + new_conf->wire_protocol = DRBD_PROT_C; + new_conf->ping_timeo = DRBD_PING_TIMEO_DEF; + new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF; + + if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + if (new_conf->two_primaries + && (new_conf->wire_protocol != DRBD_PROT_C)) { + retcode = ERR_NOT_PROTO_C; + goto fail; + }; + + if (mdev->state.role == R_PRIMARY && new_conf->want_lose) { + retcode = ERR_DISCARD; + goto fail; + } + + retcode = NO_ERROR; + + new_my_addr = (struct sockaddr *)&new_conf->my_addr; + new_peer_addr = (struct sockaddr *)&new_conf->peer_addr; + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev || odev == mdev) + continue; + if (get_net_conf(odev)) { + taken_addr = (struct sockaddr *)&odev->net_conf->my_addr; + if (new_conf->my_addr_len == odev->net_conf->my_addr_len && + !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len)) + retcode = ERR_LOCAL_ADDR; + + taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr; + if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len && + !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len)) + retcode = ERR_PEER_ADDR; + + put_net_conf(odev); + if (retcode != NO_ERROR) + goto fail; + } + } + + if (new_conf->cram_hmac_alg[0] != 0) { + snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)", + new_conf->cram_hmac_alg); + tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + tfm = NULL; + retcode = ERR_AUTH_ALG; + goto fail; + } + + if (crypto_tfm_alg_type(crypto_hash_tfm(tfm)) + != CRYPTO_ALG_TYPE_HASH) { + retcode = ERR_AUTH_ALG_ND; + goto fail; + } + } + + if (new_conf->integrity_alg[0]) { + integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(integrity_w_tfm)) { + integrity_w_tfm = NULL; + retcode=ERR_INTEGRITY_ALG; + goto fail; + } + + if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) { + retcode=ERR_INTEGRITY_ALG_ND; + goto fail; + } + + integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(integrity_r_tfm)) { + integrity_r_tfm = NULL; + retcode=ERR_INTEGRITY_ALG; + goto fail; + } + } + + ns = new_conf->max_epoch_size/8; + if (mdev->tl_hash_s != ns) { + new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); + if (!new_tl_hash) { + retcode = ERR_NOMEM; + goto fail; + } + } + + ns = new_conf->max_buffers/8; + if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) { + new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL); + if (!new_ee_hash) { + retcode = ERR_NOMEM; + goto fail; + } + } + + ((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0; + + if (integrity_w_tfm) { + i = crypto_hash_digestsize(integrity_w_tfm); + int_dig_out = kmalloc(i, GFP_KERNEL); + if (!int_dig_out) { + retcode = ERR_NOMEM; + goto fail; + } + int_dig_in = kmalloc(i, GFP_KERNEL); + if (!int_dig_in) { + retcode = ERR_NOMEM; + goto fail; + } + int_dig_vv = kmalloc(i, GFP_KERNEL); + if (!int_dig_vv) { + retcode = ERR_NOMEM; + goto fail; + } + } + + if (!mdev->bitmap) { + if(drbd_bm_init(mdev)) { + retcode = ERR_NOMEM; + goto fail; + } + } + + spin_lock_irq(&mdev->req_lock); + if (mdev->net_conf != NULL) { + retcode = ERR_NET_CONFIGURED; + spin_unlock_irq(&mdev->req_lock); + goto fail; + } + mdev->net_conf = new_conf; + + mdev->send_cnt = 0; + mdev->recv_cnt = 0; + + if (new_tl_hash) { + kfree(mdev->tl_hash); + mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8; + mdev->tl_hash = new_tl_hash; + } + + if (new_ee_hash) { + kfree(mdev->ee_hash); + mdev->ee_hash_s = mdev->net_conf->max_buffers/8; + mdev->ee_hash = new_ee_hash; + } + + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = tfm; + + crypto_free_hash(mdev->integrity_w_tfm); + mdev->integrity_w_tfm = integrity_w_tfm; + + crypto_free_hash(mdev->integrity_r_tfm); + mdev->integrity_r_tfm = integrity_r_tfm; + + kfree(mdev->int_dig_out); + kfree(mdev->int_dig_in); + kfree(mdev->int_dig_vv); + mdev->int_dig_out=int_dig_out; + mdev->int_dig_in=int_dig_in; + mdev->int_dig_vv=int_dig_vv; + spin_unlock_irq(&mdev->req_lock); + + retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE); + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; + +fail: + kfree(int_dig_out); + kfree(int_dig_in); + kfree(int_dig_vv); + crypto_free_hash(tfm); + crypto_free_hash(integrity_w_tfm); + crypto_free_hash(integrity_r_tfm); + kfree(new_tl_hash); + kfree(new_ee_hash); + kfree(new_conf); + + reply->ret_code = retcode; + drbd_reconfig_done(mdev); + return 0; +} + +static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode; + + retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED); + + if (retcode == SS_NOTHING_TO_DO) + goto done; + else if (retcode == SS_ALREADY_STANDALONE) + goto done; + else if (retcode == SS_PRIMARY_NOP) { + /* Our statche checking code wants to see the peer outdated. */ + retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, + pdsk, D_OUTDATED)); + } else if (retcode == SS_CW_FAILED_BY_PEER) { + /* The peer probably wants to see us outdated. */ + retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING, + disk, D_OUTDATED), + CS_ORDERED); + if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + retcode = SS_SUCCESS; + } + } + + if (retcode < SS_SUCCESS) + goto fail; + + if (wait_event_interruptible(mdev->state_wait, + mdev->state.conn != C_DISCONNECTING)) { + /* Do not test for mdev->state.conn == C_STANDALONE, since + someone else might connect us in the mean time! */ + retcode = ERR_INTR; + goto fail; + } + + done: + retcode = NO_ERROR; + fail: + drbd_md_sync(mdev); + reply->ret_code = retcode; + return 0; +} + +void resync_after_online_grow(struct drbd_conf *mdev) +{ + int iass; /* I am sync source */ + + dev_info(DEV, "Resync of new storage after online grow\n"); + if (mdev->state.role != mdev->state.peer) + iass = (mdev->state.role == R_PRIMARY); + else + iass = test_bit(DISCARD_CONCURRENT, &mdev->flags); + + if (iass) + drbd_start_resync(mdev, C_SYNC_SOURCE); + else + _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE); +} + +static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + struct resize rs; + int retcode = NO_ERROR; + int ldsc = 0; /* local disk size changed */ + enum determine_dev_size dd; + + memset(&rs, 0, sizeof(struct resize)); + if (!resize_from_tags(mdev, nlp->tag_list, &rs)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + if (mdev->state.conn > C_CONNECTED) { + retcode = ERR_RESIZE_RESYNC; + goto fail; + } + + if (mdev->state.role == R_SECONDARY && + mdev->state.peer == R_SECONDARY) { + retcode = ERR_NO_PRIMARY; + goto fail; + } + + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; + goto fail; + } + + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); + ldsc = 1; + } + + mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; + dd = drbd_determin_dev_size(mdev); + drbd_md_sync(mdev); + put_ldev(mdev); + if (dd == dev_size_error) { + retcode = ERR_NOMEM_BITMAP; + goto fail; + } + + if (mdev->state.conn == C_CONNECTED && (dd != unchanged || ldsc)) { + if (dd == grew) + set_bit(RESIZE_PENDING, &mdev->flags); + + drbd_send_uuids(mdev); + drbd_send_sizes(mdev, 1); + } + + fail: + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + int err; + int ovr; /* online verify running */ + int rsr; /* re-sync running */ + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + struct syncer_conf sc; + cpumask_var_t new_cpu_mask; + + if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) { + retcode = ERR_NOMEM; + goto fail; + } + + if (nlp->flags & DRBD_NL_SET_DEFAULTS) { + memset(&sc, 0, sizeof(struct syncer_conf)); + sc.rate = DRBD_RATE_DEF; + sc.after = DRBD_AFTER_DEF; + sc.al_extents = DRBD_AL_EXTENTS_DEF; + } else + memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf)); + + if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) { + retcode = ERR_MANDATORY_TAG; + goto fail; + } + + /* re-sync running */ + rsr = ( mdev->state.conn == C_SYNC_SOURCE || + mdev->state.conn == C_SYNC_TARGET || + mdev->state.conn == C_PAUSED_SYNC_S || + mdev->state.conn == C_PAUSED_SYNC_T ); + + if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) { + retcode = ERR_CSUMS_RESYNC_RUNNING; + goto fail; + } + + if (!rsr && sc.csums_alg[0]) { + csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + retcode = ERR_CSUMS_ALG; + goto fail; + } + + if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) { + retcode = ERR_CSUMS_ALG_ND; + goto fail; + } + } + + /* online verify running */ + ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T); + + if (ovr) { + if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) { + retcode = ERR_VERIFY_RUNNING; + goto fail; + } + } + + if (!ovr && sc.verify_alg[0]) { + verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + retcode = ERR_VERIFY_ALG; + goto fail; + } + + if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) { + retcode = ERR_VERIFY_ALG_ND; + goto fail; + } + } + + /* silently ignore cpu mask on UP kernel */ + if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) { + err = __bitmap_parse(sc.cpu_mask, 32, 0, + cpumask_bits(new_cpu_mask), nr_cpu_ids); + if (err) { + dev_warn(DEV, "__bitmap_parse() failed with %d\n", err); + retcode = ERR_CPU_MASK_PARSE; + goto fail; + } + } + + ERR_IF (sc.rate < 1) sc.rate = 1; + ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */ +#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT) + if (sc.al_extents > AL_MAX) { + dev_err(DEV, "sc.al_extents > %d\n", AL_MAX); + sc.al_extents = AL_MAX; + } +#undef AL_MAX + + /* most sanity checks done, try to assign the new sync-after + * dependency. need to hold the global lock in there, + * to avoid a race in the dependency loop check. */ + retcode = drbd_alter_sa(mdev, sc.after); + if (retcode != NO_ERROR) + goto fail; + + /* ok, assign the rest of it as well. + * lock against receive_SyncParam() */ + spin_lock(&mdev->peer_seq_lock); + mdev->sync_conf = sc; + + if (!rsr) { + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = csums_tfm; + csums_tfm = NULL; + } + + if (!ovr) { + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = verify_tfm; + verify_tfm = NULL; + } + spin_unlock(&mdev->peer_seq_lock); + + if (get_ldev(mdev)) { + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + drbd_al_shrink(mdev); + err = drbd_check_al_size(mdev); + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + + put_ldev(mdev); + drbd_md_sync(mdev); + + if (err) { + retcode = ERR_NOMEM; + goto fail; + } + } + + if (mdev->state.conn >= C_CONNECTED) + drbd_send_sync_param(mdev, &sc); + + if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) { + cpumask_copy(mdev->cpu_mask, new_cpu_mask); + drbd_calc_cpu_mask(mdev); + mdev->receiver.reset_cpu_mask = 1; + mdev->asender.reset_cpu_mask = 1; + mdev->worker.reset_cpu_mask = 1; + } + + kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE); +fail: + free_cpumask_var(new_cpu_mask); + crypto_free_hash(csums_tfm); + crypto_free_hash(verify_tfm); + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode; + + retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); + + if (retcode < SS_SUCCESS && retcode != SS_NEED_CONNECTION) + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); + + while (retcode == SS_NEED_CONNECTION) { + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn < C_CONNECTED) + retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + + if (retcode != SS_NEED_CONNECTION) + break; + + retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); + } + + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + + reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); + + return 0; +} + +static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + + if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_SET; + + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + + if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) + retcode = ERR_PAUSE_IS_CLEAR; + + reply->ret_code = retcode; + return 0; +} + +static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(susp, 1)); + + return 0; +} + +static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(susp, 0)); + return 0; +} + +static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED)); + return 0; +} + +static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if (get_ldev(mdev)) { + tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl); + put_ldev(mdev); + } + + if (get_net_conf(mdev)) { + tl = net_conf_to_tags(mdev, mdev->net_conf, tl); + put_net_conf(mdev); + } + tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl); + + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl = reply->tag_list; + union drbd_state s = mdev->state; + unsigned long rs_left; + unsigned int res; + + tl = get_state_to_tags(mdev, (struct get_state *)&s, tl); + + /* no local ref, no bitmap, no syncer progress. */ + if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) { + if (get_ldev(mdev)) { + drbd_get_syncer_progress(mdev, &rs_left, &res); + tl = tl_add_int(tl, T_sync_progress, &res); + put_ldev(mdev); + } + } + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + + tl = reply->tag_list; + + if (get_ldev(mdev)) { + tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64)); + tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags); + put_ldev(mdev); + } + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +/** + * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use + * @mdev: DRBD device. + * @nlp: Netlink/connector packet from drbdsetup + * @reply: Reply packet for drbdsetup + */ +static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + unsigned short *tl; + char rv; + + tl = reply->tag_list; + + rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED : + test_bit(USE_DEGR_WFC_T, &mdev->flags) ? UT_DEGRADED : UT_DEFAULT; + + tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv)); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + return (int)((char *)tl - (char *)reply->tag_list); +} + +static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + /* default to resume from last known position, if possible */ + struct start_ov args = + { .start_sector = mdev->ov_start_sector }; + + if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; + } + /* w_make_ov_request expects position to be aligned */ + mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; + reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + return 0; +} + + +static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, + struct drbd_nl_cfg_reply *reply) +{ + int retcode = NO_ERROR; + int skip_initial_sync = 0; + int err; + + struct new_c_uuid args; + + memset(&args, 0, sizeof(struct new_c_uuid)); + if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) { + reply->ret_code = ERR_MANDATORY_TAG; + return 0; + } + + mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */ + + if (!get_ldev(mdev)) { + retcode = ERR_NO_DISK; + goto out; + } + + /* this is "skip initial sync", assume to be clean */ + if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 && + mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) { + dev_info(DEV, "Preparing to skip initial sync\n"); + skip_initial_sync = 1; + } else if (mdev->state.conn != C_STANDALONE) { + retcode = ERR_CONNECTED; + goto out_dec; + } + + drbd_uuid_set(mdev, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */ + drbd_uuid_new_current(mdev); /* New current, previous to UI_BITMAP */ + + if (args.clear_bm) { + err = drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, "clear_n_write from new_c_uuid"); + if (err) { + dev_err(DEV, "Writing bitmap failed with %d\n",err); + retcode = ERR_IO_MD_DISK; + } + if (skip_initial_sync) { + drbd_send_uuids_skip_initial_sync(mdev); + _drbd_uuid_set(mdev, UI_BITMAP, 0); + spin_lock_irq(&mdev->req_lock); + _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + spin_unlock_irq(&mdev->req_lock); + } + } + + drbd_md_sync(mdev); +out_dec: + put_ldev(mdev); +out: + mutex_unlock(&mdev->state_mutex); + + reply->ret_code = retcode; + return 0; +} + +static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp) +{ + struct drbd_conf *mdev; + + if (nlp->drbd_minor >= minor_count) + return NULL; + + mdev = minor_to_mdev(nlp->drbd_minor); + + if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) { + struct gendisk *disk = NULL; + mdev = drbd_new_device(nlp->drbd_minor); + + spin_lock_irq(&drbd_pp_lock); + if (minor_table[nlp->drbd_minor] == NULL) { + minor_table[nlp->drbd_minor] = mdev; + disk = mdev->vdisk; + mdev = NULL; + } /* else: we lost the race */ + spin_unlock_irq(&drbd_pp_lock); + + if (disk) /* we won the race above */ + /* in case we ever add a drbd_delete_device(), + * don't forget the del_gendisk! */ + add_disk(disk); + else /* we lost the race above */ + drbd_free_mdev(mdev); + + mdev = minor_to_mdev(nlp->drbd_minor); + } + + return mdev; +} + +struct cn_handler_struct { + int (*function)(struct drbd_conf *, + struct drbd_nl_cfg_req *, + struct drbd_nl_cfg_reply *); + int reply_body_size; +}; + +static struct cn_handler_struct cnd_table[] = { + [ P_primary ] = { &drbd_nl_primary, 0 }, + [ P_secondary ] = { &drbd_nl_secondary, 0 }, + [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 }, + [ P_detach ] = { &drbd_nl_detach, 0 }, + [ P_net_conf ] = { &drbd_nl_net_conf, 0 }, + [ P_disconnect ] = { &drbd_nl_disconnect, 0 }, + [ P_resize ] = { &drbd_nl_resize, 0 }, + [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 }, + [ P_invalidate ] = { &drbd_nl_invalidate, 0 }, + [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 }, + [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 }, + [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 }, + [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 }, + [ P_resume_io ] = { &drbd_nl_resume_io, 0 }, + [ P_outdate ] = { &drbd_nl_outdate, 0 }, + [ P_get_config ] = { &drbd_nl_get_config, + sizeof(struct syncer_conf_tag_len_struct) + + sizeof(struct disk_conf_tag_len_struct) + + sizeof(struct net_conf_tag_len_struct) }, + [ P_get_state ] = { &drbd_nl_get_state, + sizeof(struct get_state_tag_len_struct) + + sizeof(struct sync_progress_tag_len_struct) }, + [ P_get_uuids ] = { &drbd_nl_get_uuids, + sizeof(struct get_uuids_tag_len_struct) }, + [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag, + sizeof(struct get_timeout_flag_tag_len_struct)}, + [ P_start_ov ] = { &drbd_nl_start_ov, 0 }, + [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, +}; + +static void drbd_connector_callback(struct cn_msg *req) +{ + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; + struct cn_handler_struct *cm; + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply *reply; + struct drbd_conf *mdev; + int retcode, rr; + int reply_size = sizeof(struct cn_msg) + + sizeof(struct drbd_nl_cfg_reply) + + sizeof(short int); + + if (!try_module_get(THIS_MODULE)) { + printk(KERN_ERR "drbd: try_module_get() failed!\n"); + return; + } + + mdev = ensure_mdev(nlp); + if (!mdev) { + retcode = ERR_MINOR_INVALID; + goto fail; + } + + trace_drbd_netlink(req, 1); + + if (nlp->packet_type >= P_nl_after_last_packet) { + retcode = ERR_PACKET_NR; + goto fail; + } + + cm = cnd_table + nlp->packet_type; + + /* This may happen if packet number is 0: */ + if (cm->function == NULL) { + retcode = ERR_PACKET_NR; + goto fail; + } + + reply_size += cm->reply_body_size; + + /* allocation not in the IO path, cqueue thread context */ + cn_reply = kmalloc(reply_size, GFP_KERNEL); + if (!cn_reply) { + retcode = ERR_NOMEM; + goto fail; + } + reply = (struct drbd_nl_cfg_reply *) cn_reply->data; + + reply->packet_type = + cm->reply_body_size ? nlp->packet_type : P_nl_after_last_packet; + reply->minor = nlp->drbd_minor; + reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */ + /* reply->tag_list; might be modified by cm->function. */ + + rr = cm->function(mdev, nlp, reply); + + cn_reply->id = req->id; + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; + cn_reply->flags = 0; + + trace_drbd_netlink(cn_reply, 0); + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); + if (rr && rr != -ESRCH) + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); + + kfree(cn_reply); + module_put(THIS_MODULE); + return; + fail: + drbd_nl_send_reply(req, retcode); + module_put(THIS_MODULE); +} + +static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */ + +static unsigned short * +__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, + unsigned short len, int nul_terminated) +{ + unsigned short l = tag_descriptions[tag_number(tag)].max_len; + len = (len < l) ? len : l; + put_unaligned(tag, tl++); + put_unaligned(len, tl++); + memcpy(tl, data, len); + tl = (unsigned short*)((char*)tl + len); + if (nul_terminated) + *((char*)tl - 1) = 0; + return tl; +} + +static unsigned short * +tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len) +{ + return __tl_add_blob(tl, tag, data, len, 0); +} + +static unsigned short * +tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str) +{ + return __tl_add_blob(tl, tag, str, strlen(str)+1, 0); +} + +static unsigned short * +tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val) +{ + put_unaligned(tag, tl++); + switch(tag_type(tag)) { + case TT_INTEGER: + put_unaligned(sizeof(int), tl++); + put_unaligned(*(int *)val, (int *)tl); + tl = (unsigned short*)((char*)tl+sizeof(int)); + break; + case TT_INT64: + put_unaligned(sizeof(u64), tl++); + put_unaligned(*(u64 *)val, (u64 *)tl); + tl = (unsigned short*)((char*)tl+sizeof(u64)); + break; + default: + /* someone did something stupid. */ + ; + } + return tl; +} + +void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct get_state_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ + + tl = get_state_to_tags(mdev, (struct get_state *)&state, tl); + + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_get_state; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + trace_drbd_netlink(cn_reply, 0); + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); +} + +void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct call_helper_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + + /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */ + + tl = tl_add_str(tl, T_helper, helper_name); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_call_helper; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + trace_drbd_netlink(cn_reply, 0); + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); +} + +void drbd_bcast_ee(struct drbd_conf *mdev, + const char *reason, const int dgs, + const char* seen_hash, const char* calc_hash, + const struct drbd_epoch_entry* e) +{ + struct cn_msg *cn_reply; + struct drbd_nl_cfg_reply *reply; + struct bio_vec *bvec; + unsigned short *tl; + int i; + + if (!e) + return; + if (!reason || !reason[0]) + return; + + /* apparently we have to memcpy twice, first to prepare the data for the + * struct cn_msg, then within cn_netlink_send from the cn_msg to the + * netlink skb. */ + /* receiver thread context, which is not in the writeout path (of this node), + * but may be in the writeout path of the _other_ node. + * GFP_NOIO to avoid potential "distributed deadlock". */ + cn_reply = kmalloc( + sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct dump_ee_tag_len_struct)+ + sizeof(short int), + GFP_NOIO); + + if (!cn_reply) { + dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n", + (unsigned long long)e->sector, e->size); + return; + } + + reply = (struct drbd_nl_cfg_reply*)cn_reply->data; + tl = reply->tag_list; + + tl = tl_add_str(tl, T_dump_ee_reason, reason); + tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs); + tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs); + tl = tl_add_int(tl, T_ee_sector, &e->sector); + tl = tl_add_int(tl, T_ee_block_id, &e->block_id); + + put_unaligned(T_ee_data, tl++); + put_unaligned(e->size, tl++); + + __bio_for_each_segment(bvec, e->private_bio, i, 0) { + void *d = kmap(bvec->bv_page); + memcpy(tl, d + bvec->bv_offset, bvec->bv_len); + kunmap(bvec->bv_page); + tl=(unsigned short*)((char*)tl + bvec->bv_len); + } + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1,&drbd_nl_seq); + cn_reply->ack = 0; // not used here. + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char*)tl - (char*)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_dump_ee; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + trace_drbd_netlink(cn_reply, 0); + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + kfree(cn_reply); +} + +void drbd_bcast_sync_progress(struct drbd_conf *mdev) +{ + char buffer[sizeof(struct cn_msg)+ + sizeof(struct drbd_nl_cfg_reply)+ + sizeof(struct sync_progress_tag_len_struct)+ + sizeof(short int)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + unsigned short *tl = reply->tag_list; + unsigned long rs_left; + unsigned int res; + + /* no local ref, no bitmap, no syncer progress, no broadcast. */ + if (!get_ldev(mdev)) + return; + drbd_get_syncer_progress(mdev, &rs_left, &res); + put_ldev(mdev); + + tl = tl_add_int(tl, T_sync_progress, &res); + put_unaligned(TT_END, tl++); /* Close the tag list */ + + cn_reply->id.idx = CN_IDX_DRBD; + cn_reply->id.val = CN_VAL_DRBD; + + cn_reply->seq = atomic_add_return(1, &drbd_nl_seq); + cn_reply->ack = 0; /* not used here. */ + cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + + (int)((char *)tl - (char *)reply->tag_list); + cn_reply->flags = 0; + + reply->packet_type = P_sync_progress; + reply->minor = mdev_to_minor(mdev); + reply->ret_code = NO_ERROR; + + trace_drbd_netlink(cn_reply, 0); + cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); +} + +int __init drbd_nl_init(void) +{ + static struct cb_id cn_id_drbd; + int err, try=10; + + cn_id_drbd.val = CN_VAL_DRBD; + do { + cn_id_drbd.idx = cn_idx; + err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback); + if (!err) + break; + cn_idx = (cn_idx + CN_IDX_STEP); + } while (try--); + + if (err) { + printk(KERN_ERR "drbd: cn_drbd failed to register\n"); + return err; + } + + return 0; +} + +void drbd_nl_cleanup(void) +{ + static struct cb_id cn_id_drbd; + + cn_id_drbd.idx = cn_idx; + cn_id_drbd.val = CN_VAL_DRBD; + + cn_del_callback(&cn_id_drbd); +} + +void drbd_nl_send_reply(struct cn_msg *req, int ret_code) +{ + char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)]; + struct cn_msg *cn_reply = (struct cn_msg *) buffer; + struct drbd_nl_cfg_reply *reply = + (struct drbd_nl_cfg_reply *)cn_reply->data; + int rr; + + cn_reply->id = req->id; + + cn_reply->seq = req->seq; + cn_reply->ack = req->ack + 1; + cn_reply->len = sizeof(struct drbd_nl_cfg_reply); + cn_reply->flags = 0; + + reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; + reply->ret_code = ret_code; + + trace_drbd_netlink(cn_reply, 0); + rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); + if (rr && rr != -ESRCH) + printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); +} + diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c new file mode 100644 index 00000000000..98fcb7450c7 --- /dev/null +++ b/drivers/block/drbd/drbd_proc.c @@ -0,0 +1,266 @@ +/* + drbd_proc.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" + +static int drbd_proc_open(struct inode *inode, struct file *file); + + +struct proc_dir_entry *drbd_proc; +struct file_operations drbd_proc_fops = { + .owner = THIS_MODULE, + .open = drbd_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + + +/*lge + * progress bars shamelessly adapted from driver/md/md.c + * output looks like + * [=====>..............] 33.5% (23456/123456) + * finish: 2:20:20 speed: 6,345 (6,456) K/sec + */ +static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq) +{ + unsigned long db, dt, dbdt, rt, rs_left; + unsigned int res; + int i, x, y; + + drbd_get_syncer_progress(mdev, &rs_left, &res); + + x = res/50; + y = 20-x; + seq_printf(seq, "\t["); + for (i = 1; i < x; i++) + seq_printf(seq, "="); + seq_printf(seq, ">"); + for (i = 0; i < y; i++) + seq_printf(seq, "."); + seq_printf(seq, "] "); + + seq_printf(seq, "sync'ed:%3u.%u%% ", res / 10, res % 10); + /* if more than 1 GB display in MB */ + if (mdev->rs_total > 0x100000L) + seq_printf(seq, "(%lu/%lu)M\n\t", + (unsigned long) Bit2KB(rs_left >> 10), + (unsigned long) Bit2KB(mdev->rs_total >> 10)); + else + seq_printf(seq, "(%lu/%lu)K\n\t", + (unsigned long) Bit2KB(rs_left), + (unsigned long) Bit2KB(mdev->rs_total)); + + /* see drivers/md/md.c + * We do not want to overflow, so the order of operands and + * the * 100 / 100 trick are important. We do a +1 to be + * safe against division by zero. We only estimate anyway. + * + * dt: time from mark until now + * db: blocks written from mark until now + * rt: remaining time + */ + dt = (jiffies - mdev->rs_mark_time) / HZ; + + if (dt > 20) { + /* if we made no update to rs_mark_time for too long, + * we are stalled. show that. */ + seq_printf(seq, "stalled\n"); + return; + } + + if (!dt) + dt++; + db = mdev->rs_mark_left - rs_left; + rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */ + + seq_printf(seq, "finish: %lu:%02lu:%02lu", + rt / 3600, (rt % 3600) / 60, rt % 60); + + /* current speed average over (SYNC_MARKS * SYNC_MARK_STEP) jiffies */ + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " speed: %ld,%03ld", + dbdt/1000, dbdt % 1000); + else + seq_printf(seq, " speed: %ld", dbdt); + + /* mean speed since syncer started + * we do account for PausedSync periods */ + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) + dt = 1; + db = mdev->rs_total - rs_left; + dbdt = Bit2KB(db/dt); + if (dbdt > 1000) + seq_printf(seq, " (%ld,%03ld)", + dbdt/1000, dbdt % 1000); + else + seq_printf(seq, " (%ld)", dbdt); + + seq_printf(seq, " K/sec\n"); +} + +static void resync_dump_detail(struct seq_file *seq, struct lc_element *e) +{ + struct bm_extent *bme = lc_entry(e, struct bm_extent, lce); + + seq_printf(seq, "%5d %s %s\n", bme->rs_left, + bme->flags & BME_NO_WRITES ? "NO_WRITES" : "---------", + bme->flags & BME_LOCKED ? "LOCKED" : "------" + ); +} + +static int drbd_seq_show(struct seq_file *seq, void *v) +{ + int i, hole = 0; + const char *sn; + struct drbd_conf *mdev; + + static char write_ordering_chars[] = { + [WO_none] = 'n', + [WO_drain_io] = 'd', + [WO_bdev_flush] = 'f', + [WO_bio_barrier] = 'b', + }; + + seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", + API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag()); + + /* + cs .. connection state + ro .. node role (local/remote) + ds .. disk state (local/remote) + protocol + various flags + ns .. network send + nr .. network receive + dw .. disk write + dr .. disk read + al .. activity log write count + bm .. bitmap update write count + pe .. pending (waiting for ack or data reply) + ua .. unack'd (still need to send ack or data reply) + ap .. application requests accepted, but not yet completed + ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending + wo .. write ordering mode currently in use + oos .. known out-of-sync kB + */ + + for (i = 0; i < minor_count; i++) { + mdev = minor_to_mdev(i); + if (!mdev) { + hole = 1; + continue; + } + if (hole) { + hole = 0; + seq_printf(seq, "\n"); + } + + sn = drbd_conn_str(mdev->state.conn); + + if (mdev->state.conn == C_STANDALONE && + mdev->state.disk == D_DISKLESS && + mdev->state.role == R_SECONDARY) { + seq_printf(seq, "%2d: cs:Unconfigured\n", i); + } else { + seq_printf(seq, + "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n" + " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " + "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c", + i, sn, + drbd_role_str(mdev->state.role), + drbd_role_str(mdev->state.peer), + drbd_disk_str(mdev->state.disk), + drbd_disk_str(mdev->state.pdsk), + (mdev->net_conf == NULL ? ' ' : + (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')), + mdev->state.susp ? 's' : 'r', + mdev->state.aftr_isp ? 'a' : '-', + mdev->state.peer_isp ? 'p' : '-', + mdev->state.user_isp ? 'u' : '-', + mdev->congestion_reason ?: '-', + mdev->send_cnt/2, + mdev->recv_cnt/2, + mdev->writ_cnt/2, + mdev->read_cnt/2, + mdev->al_writ_cnt, + mdev->bm_writ_cnt, + atomic_read(&mdev->local_cnt), + atomic_read(&mdev->ap_pending_cnt) + + atomic_read(&mdev->rs_pending_cnt), + atomic_read(&mdev->unacked_cnt), + atomic_read(&mdev->ap_bio_cnt), + mdev->epochs, + write_ordering_chars[mdev->write_ordering] + ); + seq_printf(seq, " oos:%lu\n", + Bit2KB(drbd_bm_total_weight(mdev))); + } + if (mdev->state.conn == C_SYNC_SOURCE || + mdev->state.conn == C_SYNC_TARGET) + drbd_syncer_progress(mdev, seq); + + if (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T) + seq_printf(seq, "\t%3d%% %lu/%lu\n", + (int)((mdev->rs_total-mdev->ov_left) / + (mdev->rs_total/100+1)), + mdev->rs_total - mdev->ov_left, + mdev->rs_total); + + if (proc_details >= 1 && get_ldev_if_state(mdev, D_FAILED)) { + lc_seq_printf_stats(seq, mdev->resync); + lc_seq_printf_stats(seq, mdev->act_log); + put_ldev(mdev); + } + + if (proc_details >= 2) { + if (mdev->resync) { + lc_seq_dump_details(seq, mdev->resync, "rs_left", + resync_dump_detail); + } + } + } + + return 0; +} + +static int drbd_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, drbd_seq_show, PDE(inode)->data); +} + +/* PROC FS stuff end */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c new file mode 100644 index 00000000000..63686c4d85c --- /dev/null +++ b/drivers/block/drbd/drbd_receiver.c @@ -0,0 +1,4456 @@ +/* + drbd_receiver.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define __KERNEL_SYSCALLS__ +#include +#include +#include +#include +#include +#include +#include "drbd_int.h" +#include "drbd_tracing.h" +#include "drbd_req.h" + +#include "drbd_vli.h" + +struct flush_work { + struct drbd_work w; + struct drbd_epoch *epoch; +}; + +enum finish_epoch { + FE_STILL_LIVE, + FE_DESTROYED, + FE_RECYCLED, +}; + +static int drbd_do_handshake(struct drbd_conf *mdev); +static int drbd_do_auth(struct drbd_conf *mdev); + +static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event); +static int e_end_block(struct drbd_conf *, struct drbd_work *, int); + +static struct drbd_epoch *previous_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) +{ + struct drbd_epoch *prev; + spin_lock(&mdev->epoch_lock); + prev = list_entry(epoch->list.prev, struct drbd_epoch, list); + if (prev == epoch || prev == mdev->current_epoch) + prev = NULL; + spin_unlock(&mdev->epoch_lock); + return prev; +} + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +static struct page *drbd_pp_first_page_or_try_alloc(struct drbd_conf *mdev) +{ + struct page *page = NULL; + + /* Yes, testing drbd_pp_vacant outside the lock is racy. + * So what. It saves a spin_lock. */ + if (drbd_pp_vacant > 0) { + spin_lock(&drbd_pp_lock); + page = drbd_pp_pool; + if (page) { + drbd_pp_pool = (struct page *)page_private(page); + set_page_private(page, 0); /* just to be polite */ + drbd_pp_vacant--; + } + spin_unlock(&drbd_pp_lock); + } + /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + if (!page) + page = alloc_page(GFP_TRY); + if (page) + atomic_inc(&mdev->pp_in_use); + return page; +} + +/* kick lower level device, if we have more than (arbitrary number) + * reference counts on it, which typically are locally submitted io + * requests. don't use unacked_cnt, so we speed up proto A and B, too. */ +static void maybe_kick_lo(struct drbd_conf *mdev) +{ + if (atomic_read(&mdev->local_cnt) >= mdev->net_conf->unplug_watermark) + drbd_kick_lo(mdev); +} + +static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed) +{ + struct drbd_epoch_entry *e; + struct list_head *le, *tle; + + /* The EEs are always appended to the end of the list. Since + they are sent in order over the wire, they have to finish + in order. As soon as we see the first not finished we can + stop to examine the list... */ + + list_for_each_safe(le, tle, &mdev->net_ee) { + e = list_entry(le, struct drbd_epoch_entry, w.list); + if (drbd_bio_has_active_page(e->private_bio)) + break; + list_move(le, to_be_freed); + } +} + +static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev) +{ + LIST_HEAD(reclaimed); + struct drbd_epoch_entry *e, *t; + + maybe_kick_lo(mdev); + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev, &reclaimed); + spin_unlock_irq(&mdev->req_lock); + + list_for_each_entry_safe(e, t, &reclaimed, w.list) + drbd_free_ee(mdev, e); +} + +/** + * drbd_pp_alloc() - Returns a page, fails only if a signal comes in + * @mdev: DRBD device. + * @retry: whether or not to retry allocation forever (or until signalled) + * + * Tries to allocate a page, first from our own page pool, then from the + * kernel, unless this allocation would exceed the max_buffers setting. + * If @retry is non-zero, retry until DRBD frees a page somewhere else. + */ +static struct page *drbd_pp_alloc(struct drbd_conf *mdev, int retry) +{ + struct page *page = NULL; + DEFINE_WAIT(wait); + + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { + page = drbd_pp_first_page_or_try_alloc(mdev); + if (page) + return page; + } + + for (;;) { + prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); + + drbd_kick_lo_and_reclaim_net(mdev); + + if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) { + page = drbd_pp_first_page_or_try_alloc(mdev); + if (page) + break; + } + + if (!retry) + break; + + if (signal_pending(current)) { + dev_warn(DEV, "drbd_pp_alloc interrupted!\n"); + break; + } + + schedule(); + } + finish_wait(&drbd_pp_wait, &wait); + + return page; +} + +/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc. + * Is also used from inside an other spin_lock_irq(&mdev->req_lock) */ +static void drbd_pp_free(struct drbd_conf *mdev, struct page *page) +{ + int free_it; + + spin_lock(&drbd_pp_lock); + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { + free_it = 1; + } else { + set_page_private(page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = page; + drbd_pp_vacant++; + free_it = 0; + } + spin_unlock(&drbd_pp_lock); + + atomic_dec(&mdev->pp_in_use); + + if (free_it) + __free_page(page); + + wake_up(&drbd_pp_wait); +} + +static void drbd_pp_free_bio_pages(struct drbd_conf *mdev, struct bio *bio) +{ + struct page *p_to_be_freed = NULL; + struct page *page; + struct bio_vec *bvec; + int i; + + spin_lock(&drbd_pp_lock); + __bio_for_each_segment(bvec, bio, i, 0) { + if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count) { + set_page_private(bvec->bv_page, (unsigned long)p_to_be_freed); + p_to_be_freed = bvec->bv_page; + } else { + set_page_private(bvec->bv_page, (unsigned long)drbd_pp_pool); + drbd_pp_pool = bvec->bv_page; + drbd_pp_vacant++; + } + } + spin_unlock(&drbd_pp_lock); + atomic_sub(bio->bi_vcnt, &mdev->pp_in_use); + + while (p_to_be_freed) { + page = p_to_be_freed; + p_to_be_freed = (struct page *)page_private(page); + set_page_private(page, 0); /* just to be polite */ + put_page(page); + } + + wake_up(&drbd_pp_wait); +} + +/* +You need to hold the req_lock: + _drbd_wait_ee_list_empty() + +You must not have the req_lock: + drbd_free_ee() + drbd_alloc_ee() + drbd_init_ee() + drbd_release_ee() + drbd_ee_fix_bhs() + drbd_process_done_ee() + drbd_clear_done_ee() + drbd_wait_ee_list_empty() +*/ + +struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, + u64 id, + sector_t sector, + unsigned int data_size, + gfp_t gfp_mask) __must_hold(local) +{ + struct request_queue *q; + struct drbd_epoch_entry *e; + struct page *page; + struct bio *bio; + unsigned int ds; + + if (FAULT_ACTIVE(mdev, DRBD_FAULT_AL_EE)) + return NULL; + + e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM); + if (!e) { + if (!(gfp_mask & __GFP_NOWARN)) + dev_err(DEV, "alloc_ee: Allocation of an EE failed\n"); + return NULL; + } + + bio = bio_alloc(gfp_mask & ~__GFP_HIGHMEM, div_ceil(data_size, PAGE_SIZE)); + if (!bio) { + if (!(gfp_mask & __GFP_NOWARN)) + dev_err(DEV, "alloc_ee: Allocation of a bio failed\n"); + goto fail1; + } + + bio->bi_bdev = mdev->ldev->backing_bdev; + bio->bi_sector = sector; + + ds = data_size; + while (ds) { + page = drbd_pp_alloc(mdev, (gfp_mask & __GFP_WAIT)); + if (!page) { + if (!(gfp_mask & __GFP_NOWARN)) + dev_err(DEV, "alloc_ee: Allocation of a page failed\n"); + goto fail2; + } + if (!bio_add_page(bio, page, min_t(int, ds, PAGE_SIZE), 0)) { + drbd_pp_free(mdev, page); + dev_err(DEV, "alloc_ee: bio_add_page(s=%llu," + "data_size=%u,ds=%u) failed\n", + (unsigned long long)sector, data_size, ds); + + q = bdev_get_queue(bio->bi_bdev); + if (q->merge_bvec_fn) { + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = bio->bi_size, + .bi_rw = bio->bi_rw, + }; + int l = q->merge_bvec_fn(q, &bvm, + &bio->bi_io_vec[bio->bi_vcnt]); + dev_err(DEV, "merge_bvec_fn() = %d\n", l); + } + + /* dump more of the bio. */ + dev_err(DEV, "bio->bi_max_vecs = %d\n", bio->bi_max_vecs); + dev_err(DEV, "bio->bi_vcnt = %d\n", bio->bi_vcnt); + dev_err(DEV, "bio->bi_size = %d\n", bio->bi_size); + dev_err(DEV, "bio->bi_phys_segments = %d\n", bio->bi_phys_segments); + + goto fail2; + break; + } + ds -= min_t(int, ds, PAGE_SIZE); + } + + D_ASSERT(data_size == bio->bi_size); + + bio->bi_private = e; + e->mdev = mdev; + e->sector = sector; + e->size = bio->bi_size; + + e->private_bio = bio; + e->block_id = id; + INIT_HLIST_NODE(&e->colision); + e->epoch = NULL; + e->flags = 0; + + trace_drbd_ee(mdev, e, "allocated"); + + return e; + + fail2: + drbd_pp_free_bio_pages(mdev, bio); + bio_put(bio); + fail1: + mempool_free(e, drbd_ee_mempool); + + return NULL; +} + +void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +{ + struct bio *bio = e->private_bio; + trace_drbd_ee(mdev, e, "freed"); + drbd_pp_free_bio_pages(mdev, bio); + bio_put(bio); + D_ASSERT(hlist_unhashed(&e->colision)); + mempool_free(e, drbd_ee_mempool); +} + +int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list) +{ + LIST_HEAD(work_list); + struct drbd_epoch_entry *e, *t; + int count = 0; + + spin_lock_irq(&mdev->req_lock); + list_splice_init(list, &work_list); + spin_unlock_irq(&mdev->req_lock); + + list_for_each_entry_safe(e, t, &work_list, w.list) { + drbd_free_ee(mdev, e); + count++; + } + return count; +} + + +/* + * This function is called from _asender only_ + * but see also comments in _req_mod(,barrier_acked) + * and receive_Barrier. + * + * Move entries from net_ee to done_ee, if ready. + * Grab done_ee, call all callbacks, free the entries. + * The callbacks typically send out ACKs. + */ +static int drbd_process_done_ee(struct drbd_conf *mdev) +{ + LIST_HEAD(work_list); + LIST_HEAD(reclaimed); + struct drbd_epoch_entry *e, *t; + int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS); + + spin_lock_irq(&mdev->req_lock); + reclaim_net_ee(mdev, &reclaimed); + list_splice_init(&mdev->done_ee, &work_list); + spin_unlock_irq(&mdev->req_lock); + + list_for_each_entry_safe(e, t, &reclaimed, w.list) + drbd_free_ee(mdev, e); + + /* possible callbacks here: + * e_end_block, and e_end_resync_block, e_send_discard_ack. + * all ignore the last argument. + */ + list_for_each_entry_safe(e, t, &work_list, w.list) { + trace_drbd_ee(mdev, e, "process_done_ee"); + /* list_del not necessary, next/prev members not touched */ + ok = e->w.cb(mdev, &e->w, !ok) && ok; + drbd_free_ee(mdev, e); + } + wake_up(&mdev->ee_wait); + + return ok; +} + +void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) +{ + DEFINE_WAIT(wait); + + /* avoids spin_lock/unlock + * and calling prepare_to_wait in the fast path */ + while (!list_empty(head)) { + prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mdev->req_lock); + drbd_kick_lo(mdev); + schedule(); + finish_wait(&mdev->ee_wait, &wait); + spin_lock_irq(&mdev->req_lock); + } +} + +void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head) +{ + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev, head); + spin_unlock_irq(&mdev->req_lock); +} + +/* see also kernel_accept; which is only present since 2.6.18. + * also we want to log which part of it failed, exactly */ +static int drbd_accept(struct drbd_conf *mdev, const char **what, + struct socket *sock, struct socket **newsock) +{ + struct sock *sk = sock->sk; + int err = 0; + + *what = "listen"; + err = sock->ops->listen(sock, 5); + if (err < 0) + goto out; + + *what = "sock_create_lite"; + err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, + newsock); + if (err < 0) + goto out; + + *what = "accept"; + err = sock->ops->accept(sock, *newsock, 0); + if (err < 0) { + sock_release(*newsock); + *newsock = NULL; + goto out; + } + (*newsock)->ops = sock->ops; + +out: + return err; +} + +static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock, + void *buf, size_t size, int flags) +{ + mm_segment_t oldfs; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&iov, + .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) + }; + int rv; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); + set_fs(oldfs); + + return rv; +} + +static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size) +{ + mm_segment_t oldfs; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&iov, + .msg_flags = MSG_WAITALL | MSG_NOSIGNAL + }; + int rv; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + + for (;;) { + rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags); + if (rv == size) + break; + + /* Note: + * ECONNRESET other side closed the connection + * ERESTARTSYS (on sock) we got a signal + */ + + if (rv < 0) { + if (rv == -ECONNRESET) + dev_info(DEV, "sock was reset by peer\n"); + else if (rv != -ERESTARTSYS) + dev_err(DEV, "sock_recvmsg returned %d\n", rv); + break; + } else if (rv == 0) { + dev_info(DEV, "sock was shut down by peer\n"); + break; + } else { + /* signal came in, or peer/link went down, + * after we read a partial message + */ + /* D_ASSERT(signal_pending(current)); */ + break; + } + }; + + set_fs(oldfs); + + if (rv != size) + drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE)); + + return rv; +} + +static struct socket *drbd_try_connect(struct drbd_conf *mdev) +{ + const char *what; + struct socket *sock; + struct sockaddr_in6 src_in6; + int err; + int disconnect_on_error = 1; + + if (!get_net_conf(mdev)) + return NULL; + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + sock = NULL; + goto out; + } + + sock->sk->sk_rcvtimeo = + sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + + /* explicitly bind to the configured IP as source IP + * for the outgoing connections. + * This is needed for multihomed hosts and to be + * able to use lo: interfaces for drbd. + * Make sure to use 0 as port number, so linux selects + * a free one dynamically. + */ + memcpy(&src_in6, mdev->net_conf->my_addr, + min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6))); + if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6) + src_in6.sin6_port = 0; + else + ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */ + + what = "bind before connect"; + err = sock->ops->bind(sock, + (struct sockaddr *) &src_in6, + mdev->net_conf->my_addr_len); + if (err < 0) + goto out; + + /* connect may fail, peer not yet available. + * stay C_WF_CONNECTION, don't go Disconnecting! */ + disconnect_on_error = 0; + what = "connect"; + err = sock->ops->connect(sock, + (struct sockaddr *)mdev->net_conf->peer_addr, + mdev->net_conf->peer_addr_len, 0); + +out: + if (err < 0) { + if (sock) { + sock_release(sock); + sock = NULL; + } + switch (-err) { + /* timeout, busy, signal pending */ + case ETIMEDOUT: case EAGAIN: case EINPROGRESS: + case EINTR: case ERESTARTSYS: + /* peer not (yet) available, network problem */ + case ECONNREFUSED: case ENETUNREACH: + case EHOSTDOWN: case EHOSTUNREACH: + disconnect_on_error = 0; + break; + default: + dev_err(DEV, "%s failed, err = %d\n", what, err); + } + if (disconnect_on_error) + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + put_net_conf(mdev); + return sock; +} + +static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) +{ + int timeo, err; + struct socket *s_estab = NULL, *s_listen; + const char *what; + + if (!get_net_conf(mdev)) + return NULL; + + what = "sock_create_kern"; + err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family, + SOCK_STREAM, IPPROTO_TCP, &s_listen); + if (err) { + s_listen = NULL; + goto out; + } + + timeo = mdev->net_conf->try_connect_int * HZ; + timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ + + s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ + s_listen->sk->sk_rcvtimeo = timeo; + s_listen->sk->sk_sndtimeo = timeo; + + what = "bind before listen"; + err = s_listen->ops->bind(s_listen, + (struct sockaddr *) mdev->net_conf->my_addr, + mdev->net_conf->my_addr_len); + if (err < 0) + goto out; + + err = drbd_accept(mdev, &what, s_listen, &s_estab); + +out: + if (s_listen) + sock_release(s_listen); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + dev_err(DEV, "%s failed, err = %d\n", what, err); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + } + put_net_conf(mdev); + + return s_estab; +} + +static int drbd_send_fp(struct drbd_conf *mdev, + struct socket *sock, enum drbd_packets cmd) +{ + struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + + return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0); +} + +static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock) +{ + struct p_header *h = (struct p_header *) &mdev->data.sbuf.header; + int rr; + + rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0); + + if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC) + return be16_to_cpu(h->command); + + return 0xffff; +} + +/** + * drbd_socket_okay() - Free the socket if its connection is not okay + * @mdev: DRBD device. + * @sock: pointer to the pointer to the socket. + */ +static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock) +{ + int rr; + char tb[4]; + + if (!*sock) + return FALSE; + + rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK); + + if (rr > 0 || rr == -EAGAIN) { + return TRUE; + } else { + sock_release(*sock); + *sock = NULL; + return FALSE; + } +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + * -2 We do not have a network config... + */ +static int drbd_connect(struct drbd_conf *mdev) +{ + struct socket *s, *sock, *msock; + int try, h, ok; + + D_ASSERT(!mdev->data.socket); + + if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) + dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n"); + + if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS) + return -2; + + clear_bit(DISCARD_CONCURRENT, &mdev->flags); + + sock = NULL; + msock = NULL; + + do { + for (try = 0;;) { + /* 3 tries, this should take less than a second! */ + s = drbd_try_connect(mdev); + if (s || ++try >= 3) + break; + /* give the other side time to call bind() & listen() */ + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + } + + if (s) { + if (!sock) { + drbd_send_fp(mdev, s, P_HAND_SHAKE_S); + sock = s; + s = NULL; + } else if (!msock) { + drbd_send_fp(mdev, s, P_HAND_SHAKE_M); + msock = s; + s = NULL; + } else { + dev_err(DEV, "Logic error in drbd_connect()\n"); + goto out_release_sockets; + } + } + + if (sock && msock) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + ok = drbd_socket_okay(mdev, &sock); + ok = drbd_socket_okay(mdev, &msock) && ok; + if (ok) + break; + } + +retry: + s = drbd_wait_for_connect(mdev); + if (s) { + try = drbd_recv_fp(mdev, s); + drbd_socket_okay(mdev, &sock); + drbd_socket_okay(mdev, &msock); + switch (try) { + case P_HAND_SHAKE_S: + if (sock) { + dev_warn(DEV, "initial packet S crossed\n"); + sock_release(sock); + } + sock = s; + break; + case P_HAND_SHAKE_M: + if (msock) { + dev_warn(DEV, "initial packet M crossed\n"); + sock_release(msock); + } + msock = s; + set_bit(DISCARD_CONCURRENT, &mdev->flags); + break; + default: + dev_warn(DEV, "Error receiving initial packet\n"); + sock_release(s); + if (random32() & 1) + goto retry; + } + } + + if (mdev->state.conn <= C_DISCONNECTING) + goto out_release_sockets; + if (signal_pending(current)) { + flush_signals(current); + smp_rmb(); + if (get_t_state(&mdev->receiver) == Exiting) + goto out_release_sockets; + } + + if (sock && msock) { + ok = drbd_socket_okay(mdev, &sock); + ok = drbd_socket_okay(mdev, &msock) && ok; + if (ok) + break; + } + } while (1); + + msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + + sock->sk->sk_allocation = GFP_NOIO; + msock->sk->sk_allocation = GFP_NOIO; + + sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock->sk->sk_priority = TC_PRIO_INTERACTIVE; + + if (mdev->net_conf->sndbuf_size) { + sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + } + + if (mdev->net_conf->rcvbuf_size) { + sock->sk->sk_rcvbuf = mdev->net_conf->rcvbuf_size; + sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + } + + /* NOT YET ... + * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + * first set it to the P_HAND_SHAKE timeout, + * which we set to 4x the configured ping_timeout. */ + sock->sk->sk_sndtimeo = + sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10; + + msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + + /* we don't want delays. + * we use TCP_CORK where apropriate, though */ + drbd_tcp_nodelay(sock); + drbd_tcp_nodelay(msock); + + mdev->data.socket = sock; + mdev->meta.socket = msock; + mdev->last_received = jiffies; + + D_ASSERT(mdev->asender.task == NULL); + + h = drbd_do_handshake(mdev); + if (h <= 0) + return h; + + if (mdev->cram_hmac_tfm) { + /* drbd_request_state(mdev, NS(conn, WFAuth)); */ + if (!drbd_do_auth(mdev)) { + dev_err(DEV, "Authentication of peer failed\n"); + return -1; + } + } + + if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) + return 0; + + sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; + sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + + atomic_set(&mdev->packet_seq, 0); + mdev->peer_seq = 0; + + drbd_thread_start(&mdev->asender); + + drbd_send_protocol(mdev); + drbd_send_sync_param(mdev, &mdev->sync_conf); + drbd_send_sizes(mdev, 0); + drbd_send_uuids(mdev); + drbd_send_state(mdev); + clear_bit(USE_DEGR_WFC_T, &mdev->flags); + clear_bit(RESIZE_PENDING, &mdev->flags); + + return 1; + +out_release_sockets: + if (sock) + sock_release(sock); + if (msock) + sock_release(msock); + return -1; +} + +static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h) +{ + int r; + + r = drbd_recv(mdev, h, sizeof(*h)); + + if (unlikely(r != sizeof(*h))) { + dev_err(DEV, "short read expecting header on sock: r=%d\n", r); + return FALSE; + }; + h->command = be16_to_cpu(h->command); + h->length = be16_to_cpu(h->length); + if (unlikely(h->magic != BE_DRBD_MAGIC)) { + dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + return FALSE; + } + mdev->last_received = jiffies; + + return TRUE; +} + +static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch) +{ + int rv; + + if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) { + rv = blkdev_issue_flush(mdev->ldev->backing_bdev, NULL); + if (rv) { + dev_err(DEV, "local disk flush failed with status %d\n", rv); + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + drbd_bump_write_ordering(mdev, WO_drain_io); + } + put_ldev(mdev); + } + + return drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); +} + +static int w_flush(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct flush_work *fw = (struct flush_work *)w; + struct drbd_epoch *epoch = fw->epoch; + + kfree(w); + + if (!test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags)) + drbd_flush_after_epoch(mdev, epoch); + + drbd_may_finish_epoch(mdev, epoch, EV_PUT | + (mdev->state.conn < C_CONNECTED ? EV_CLEANUP : 0)); + + return 1; +} + +/** + * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it. + * @mdev: DRBD device. + * @epoch: Epoch object. + * @ev: Epoch event. + */ +static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, + struct drbd_epoch *epoch, + enum epoch_event ev) +{ + int finish, epoch_size; + struct drbd_epoch *next_epoch; + int schedule_flush = 0; + enum finish_epoch rv = FE_STILL_LIVE; + + spin_lock(&mdev->epoch_lock); + do { + next_epoch = NULL; + finish = 0; + + epoch_size = atomic_read(&epoch->epoch_size); + + switch (ev & ~EV_CLEANUP) { + case EV_PUT: + atomic_dec(&epoch->active); + break; + case EV_GOT_BARRIER_NR: + set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags); + + /* Special case: If we just switched from WO_bio_barrier to + WO_bdev_flush we should not finish the current epoch */ + if (test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) && epoch_size == 1 && + mdev->write_ordering != WO_bio_barrier && + epoch == mdev->current_epoch) + clear_bit(DE_CONTAINS_A_BARRIER, &epoch->flags); + break; + case EV_BARRIER_DONE: + set_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags); + break; + case EV_BECAME_LAST: + /* nothing to do*/ + break; + } + + trace_drbd_epoch(mdev, epoch, ev); + + if (epoch_size != 0 && + atomic_read(&epoch->active) == 0 && + test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && + epoch->list.prev == &mdev->current_epoch->list && + !test_bit(DE_IS_FINISHING, &epoch->flags)) { + /* Nearly all conditions are met to finish that epoch... */ + if (test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) || + mdev->write_ordering == WO_none || + (epoch_size == 1 && test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) || + ev & EV_CLEANUP) { + finish = 1; + set_bit(DE_IS_FINISHING, &epoch->flags); + } else if (!test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) && + mdev->write_ordering == WO_bio_barrier) { + atomic_inc(&epoch->active); + schedule_flush = 1; + } + } + if (finish) { + if (!(ev & EV_CLEANUP)) { + spin_unlock(&mdev->epoch_lock); + drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); + spin_lock(&mdev->epoch_lock); + } + dec_unacked(mdev); + + if (mdev->current_epoch != epoch) { + next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); + list_del(&epoch->list); + ev = EV_BECAME_LAST | (ev & EV_CLEANUP); + mdev->epochs--; + trace_drbd_epoch(mdev, epoch, EV_TRACE_FREE); + kfree(epoch); + + if (rv == FE_STILL_LIVE) + rv = FE_DESTROYED; + } else { + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + /* atomic_set(&epoch->active, 0); is alrady zero */ + if (rv == FE_STILL_LIVE) + rv = FE_RECYCLED; + } + } + + if (!next_epoch) + break; + + epoch = next_epoch; + } while (1); + + spin_unlock(&mdev->epoch_lock); + + if (schedule_flush) { + struct flush_work *fw; + fw = kmalloc(sizeof(*fw), GFP_ATOMIC); + if (fw) { + trace_drbd_epoch(mdev, epoch, EV_TRACE_FLUSH); + fw->w.cb = w_flush; + fw->epoch = epoch; + drbd_queue_work(&mdev->data.work, &fw->w); + } else { + dev_warn(DEV, "Could not kmalloc a flush_work obj\n"); + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); + /* That is not a recursion, only one level */ + drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE); + drbd_may_finish_epoch(mdev, epoch, EV_PUT); + } + } + + return rv; +} + +/** + * drbd_bump_write_ordering() - Fall back to an other write ordering method + * @mdev: DRBD device. + * @wo: Write ordering method to try. + */ +void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local) +{ + enum write_ordering_e pwo; + static char *write_ordering_str[] = { + [WO_none] = "none", + [WO_drain_io] = "drain", + [WO_bdev_flush] = "flush", + [WO_bio_barrier] = "barrier", + }; + + pwo = mdev->write_ordering; + wo = min(pwo, wo); + if (wo == WO_bio_barrier && mdev->ldev->dc.no_disk_barrier) + wo = WO_bdev_flush; + if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush) + wo = WO_drain_io; + if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain) + wo = WO_none; + mdev->write_ordering = wo; + if (pwo != mdev->write_ordering || wo == WO_bio_barrier) + dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]); +} + +/** + * w_e_reissue() - Worker callback; Resubmit a bio, without BIO_RW_BARRIER set + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways (unused in this callback) + */ +int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __releases(local) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + struct bio *bio = e->private_bio; + + /* We leave DE_CONTAINS_A_BARRIER and EE_IS_BARRIER in place, + (and DE_BARRIER_IN_NEXT_EPOCH_ISSUED in the previous Epoch) + so that we can finish that epoch in drbd_may_finish_epoch(). + That is necessary if we already have a long chain of Epochs, before + we realize that BIO_RW_BARRIER is actually not supported */ + + /* As long as the -ENOTSUPP on the barrier is reported immediately + that will never trigger. If it is reported late, we will just + print that warning and continue correctly for all future requests + with WO_bdev_flush */ + if (previous_epoch(mdev, e->epoch)) + dev_warn(DEV, "Write ordering was not enforced (one time event)\n"); + + /* prepare bio for re-submit, + * re-init volatile members */ + /* we still have a local reference, + * get_ldev was done in receive_Data. */ + bio->bi_bdev = mdev->ldev->backing_bdev; + bio->bi_sector = e->sector; + bio->bi_size = e->size; + bio->bi_idx = 0; + + bio->bi_flags &= ~(BIO_POOL_MASK - 1); + bio->bi_flags |= 1 << BIO_UPTODATE; + + /* don't know whether this is necessary: */ + bio->bi_phys_segments = 0; + bio->bi_next = NULL; + + /* these should be unchanged: */ + /* bio->bi_end_io = drbd_endio_write_sec; */ + /* bio->bi_vcnt = whatever; */ + + e->w.cb = e_end_block; + + /* This is no longer a barrier request. */ + bio->bi_rw &= ~(1UL << BIO_RW_BARRIER); + + drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, bio); + + return 1; +} + +static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) +{ + int rv, issue_flush; + struct p_barrier *p = (struct p_barrier *)h; + struct drbd_epoch *epoch; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + + rv = drbd_recv(mdev, h->payload, h->length); + ERR_IF(rv != h->length) return FALSE; + + inc_unacked(mdev); + + if (mdev->net_conf->wire_protocol != DRBD_PROT_C) + drbd_kick_lo(mdev); + + mdev->current_epoch->barrier_nr = p->barrier; + rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR); + + /* P_BARRIER_ACK may imply that the corresponding extent is dropped from + * the activity log, which means it would not be resynced in case the + * R_PRIMARY crashes now. + * Therefore we must send the barrier_ack after the barrier request was + * completed. */ + switch (mdev->write_ordering) { + case WO_bio_barrier: + case WO_none: + if (rv == FE_RECYCLED) + return TRUE; + break; + + case WO_bdev_flush: + case WO_drain_io: + D_ASSERT(rv == FE_STILL_LIVE); + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &mdev->current_epoch->flags); + drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); + if (rv == FE_RECYCLED) + return TRUE; + + /* The asender will send all the ACKs and barrier ACKs out, since + all EEs moved from the active_ee to the done_ee. We need to + provide a new epoch object for the EEs that come in soon */ + break; + } + + /* receiver context, in the writeout path of the other node. + * avoid potential distributed deadlock */ + epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO); + if (!epoch) { + dev_warn(DEV, "Allocation of an epoch failed, slowing down\n"); + issue_flush = !test_and_set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); + drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + if (issue_flush) { + rv = drbd_flush_after_epoch(mdev, mdev->current_epoch); + if (rv == FE_RECYCLED) + return TRUE; + } + + drbd_wait_ee_list_empty(mdev, &mdev->done_ee); + + return TRUE; + } + + epoch->flags = 0; + atomic_set(&epoch->epoch_size, 0); + atomic_set(&epoch->active, 0); + + spin_lock(&mdev->epoch_lock); + if (atomic_read(&mdev->current_epoch->epoch_size)) { + list_add(&epoch->list, &mdev->current_epoch->list); + mdev->current_epoch = epoch; + mdev->epochs++; + trace_drbd_epoch(mdev, epoch, EV_TRACE_ALLOC); + } else { + /* The current_epoch got recycled while we allocated this one... */ + kfree(epoch); + } + spin_unlock(&mdev->epoch_lock); + + return TRUE; +} + +/* used from receive_RSDataReply (recv_resync_read) + * and from receive_Data */ +static struct drbd_epoch_entry * +read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local) +{ + struct drbd_epoch_entry *e; + struct bio_vec *bvec; + struct page *page; + struct bio *bio; + int dgs, ds, i, rr; + void *dig_in = mdev->int_dig_in; + void *dig_vv = mdev->int_dig_vv; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; + + if (dgs) { + rr = drbd_recv(mdev, dig_in, dgs); + if (rr != dgs) { + dev_warn(DEV, "short read receiving data digest: read %d expected %d\n", + rr, dgs); + return NULL; + } + } + + data_size -= dgs; + + ERR_IF(data_size & 0x1ff) return NULL; + ERR_IF(data_size > DRBD_MAX_SEGMENT_SIZE) return NULL; + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO); + if (!e) + return NULL; + bio = e->private_bio; + ds = data_size; + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + rr = drbd_recv(mdev, kmap(page), min_t(int, ds, PAGE_SIZE)); + kunmap(page); + if (rr != min_t(int, ds, PAGE_SIZE)) { + drbd_free_ee(mdev, e); + dev_warn(DEV, "short read receiving data: read %d expected %d\n", + rr, min_t(int, ds, PAGE_SIZE)); + return NULL; + } + ds -= rr; + } + + if (dgs) { + drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + if (memcmp(dig_in, dig_vv, dgs)) { + dev_err(DEV, "Digest integrity check FAILED.\n"); + drbd_bcast_ee(mdev, "digest failed", + dgs, dig_in, dig_vv, e); + drbd_free_ee(mdev, e); + return NULL; + } + } + mdev->recv_cnt += data_size>>9; + return e; +} + +/* drbd_drain_block() just takes a data block + * out of the socket input buffer, and discards it. + */ +static int drbd_drain_block(struct drbd_conf *mdev, int data_size) +{ + struct page *page; + int rr, rv = 1; + void *data; + + page = drbd_pp_alloc(mdev, 1); + + data = kmap(page); + while (data_size) { + rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE)); + if (rr != min_t(int, data_size, PAGE_SIZE)) { + rv = 0; + dev_warn(DEV, "short read receiving data: read %d expected %d\n", + rr, min_t(int, data_size, PAGE_SIZE)); + break; + } + data_size -= rr; + } + kunmap(page); + drbd_pp_free(mdev, page); + return rv; +} + +static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req, + sector_t sector, int data_size) +{ + struct bio_vec *bvec; + struct bio *bio; + int dgs, rr, i, expect; + void *dig_in = mdev->int_dig_in; + void *dig_vv = mdev->int_dig_vv; + + dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ? + crypto_hash_digestsize(mdev->integrity_r_tfm) : 0; + + if (dgs) { + rr = drbd_recv(mdev, dig_in, dgs); + if (rr != dgs) { + dev_warn(DEV, "short read receiving data reply digest: read %d expected %d\n", + rr, dgs); + return 0; + } + } + + data_size -= dgs; + + /* optimistically update recv_cnt. if receiving fails below, + * we disconnect anyways, and counters will be reset. */ + mdev->recv_cnt += data_size>>9; + + bio = req->master_bio; + D_ASSERT(sector == bio->bi_sector); + + bio_for_each_segment(bvec, bio, i) { + expect = min_t(int, data_size, bvec->bv_len); + rr = drbd_recv(mdev, + kmap(bvec->bv_page)+bvec->bv_offset, + expect); + kunmap(bvec->bv_page); + if (rr != expect) { + dev_warn(DEV, "short read receiving data reply: " + "read %d expected %d\n", + rr, expect); + return 0; + } + data_size -= rr; + } + + if (dgs) { + drbd_csum(mdev, mdev->integrity_r_tfm, bio, dig_vv); + if (memcmp(dig_in, dig_vv, dgs)) { + dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n"); + return 0; + } + } + + D_ASSERT(data_size == 0); + return 1; +} + +/* e_end_resync_block() is called via + * drbd_process_done_ee() by asender only */ +static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + sector_t sector = e->sector; + int ok; + + D_ASSERT(hlist_unhashed(&e->colision)); + + if (likely(drbd_bio_uptodate(e->private_bio))) { + drbd_set_in_sync(mdev, sector, e->size); + ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e); + } else { + /* Record failure to sync */ + drbd_rs_failed_io(mdev, sector, e->size); + + ok = drbd_send_ack(mdev, P_NEG_ACK, e); + } + dec_unacked(mdev); + + return ok; +} + +static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local) +{ + struct drbd_epoch_entry *e; + + e = read_in_block(mdev, ID_SYNCER, sector, data_size); + if (!e) { + put_ldev(mdev); + return FALSE; + } + + dec_rs_pending(mdev); + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->private_bio->bi_rw = WRITE; + e->w.cb = e_end_resync_block; + + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_resync_block() + * respective _drbd_clear_done_ee */ + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->sync_ee); + spin_unlock_irq(&mdev->req_lock); + + trace_drbd_ee(mdev, e, "submitting for (rs)write"); + trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); + drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; +} + +static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h) +{ + struct drbd_request *req; + sector_t sector; + unsigned int header_size, data_size; + int ok; + struct p_data *p = (struct p_data *)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + + spin_lock_irq(&mdev->req_lock); + req = _ar_id_to_req(mdev, p->block_id, sector); + spin_unlock_irq(&mdev->req_lock); + if (unlikely(!req)) { + dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n"); + return FALSE; + } + + /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid + * special casing it there for the various failure cases. + * still no race with drbd_fail_pending_reads */ + ok = recv_dless_read(mdev, req, sector, data_size); + + if (ok) + req_mod(req, data_received); + /* else: nothing. handled from drbd_disconnect... + * I don't think we may complete this just yet + * in case we are "on-disconnect: freeze" */ + + return ok; +} + +static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + unsigned int header_size, data_size; + int ok; + struct p_data *p = (struct p_data *)h; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + sector = be64_to_cpu(p->sector); + D_ASSERT(p->block_id == ID_SYNCER); + + if (get_ldev(mdev)) { + /* data is submitted to disk within recv_resync_read. + * corresponding put_ldev done below on error, + * or in drbd_endio_write_sec. */ + ok = recv_resync_read(mdev, sector, data_size); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Can not write resync data to local disk.\n"); + + ok = drbd_drain_block(mdev, data_size); + + drbd_send_ack_dp(mdev, P_NEG_ACK, p); + } + + return ok; +} + +/* e_end_block() is called via drbd_process_done_ee(). + * this means this function only runs in the asender thread + */ +static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + sector_t sector = e->sector; + struct drbd_epoch *epoch; + int ok = 1, pcmd; + + if (e->flags & EE_IS_BARRIER) { + epoch = previous_epoch(mdev, e->epoch); + if (epoch) + drbd_may_finish_epoch(mdev, epoch, EV_BARRIER_DONE + (cancel ? EV_CLEANUP : 0)); + } + + if (mdev->net_conf->wire_protocol == DRBD_PROT_C) { + if (likely(drbd_bio_uptodate(e->private_bio))) { + pcmd = (mdev->state.conn >= C_SYNC_SOURCE && + mdev->state.conn <= C_PAUSED_SYNC_T && + e->flags & EE_MAY_SET_IN_SYNC) ? + P_RS_WRITE_ACK : P_WRITE_ACK; + ok &= drbd_send_ack(mdev, pcmd, e); + if (pcmd == P_RS_WRITE_ACK) + drbd_set_in_sync(mdev, sector, e->size); + } else { + ok = drbd_send_ack(mdev, P_NEG_ACK, e); + /* we expect it to be marked out of sync anyways... + * maybe assert this? */ + } + dec_unacked(mdev); + } + /* we delete from the conflict detection hash _after_ we sent out the + * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ + if (mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + } else { + D_ASSERT(hlist_unhashed(&e->colision)); + } + + drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + + return ok; +} + +static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused) +{ + struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w; + int ok = 1; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); + + spin_lock_irq(&mdev->req_lock); + D_ASSERT(!hlist_unhashed(&e->colision)); + hlist_del_init(&e->colision); + spin_unlock_irq(&mdev->req_lock); + + dec_unacked(mdev); + + return ok; +} + +/* Called from receive_Data. + * Synchronize packets on sock with packets on msock. + * + * This is here so even when a P_DATA packet traveling via sock overtook an Ack + * packet traveling on msock, they are still processed in the order they have + * been sent. + * + * Note: we don't care for Ack packets overtaking P_DATA packets. + * + * In case packet_seq is larger than mdev->peer_seq number, there are + * outstanding packets on the msock. We wait for them to arrive. + * In case we are the logically next packet, we update mdev->peer_seq + * ourselves. Correctly handles 32bit wrap around. + * + * Assume we have a 10 GBit connection, that is about 1<<30 byte per second, + * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds + * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have + * 1<<9 == 512 seconds aka ages for the 32bit wrap around... + * + * returns 0 if we may process the packet, + * -ERESTARTSYS if we were interrupted (by disconnect signal). */ +static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq) +{ + DEFINE_WAIT(wait); + unsigned int p_seq; + long timeout; + int ret = 0; + spin_lock(&mdev->peer_seq_lock); + for (;;) { + prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE); + if (seq_le(packet_seq, mdev->peer_seq+1)) + break; + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + p_seq = mdev->peer_seq; + spin_unlock(&mdev->peer_seq_lock); + timeout = schedule_timeout(30*HZ); + spin_lock(&mdev->peer_seq_lock); + if (timeout == 0 && p_seq == mdev->peer_seq) { + ret = -ETIMEDOUT; + dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n"); + break; + } + } + finish_wait(&mdev->seq_wait, &wait); + if (mdev->peer_seq+1 == packet_seq) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + return ret; +} + +/* mirrored write */ +static int receive_Data(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + struct drbd_epoch_entry *e; + struct p_data *p = (struct p_data *)h; + int header_size, data_size; + int rw = WRITE; + u32 dp_flags; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + ERR_IF(data_size == 0) return FALSE; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + if (!get_ldev(mdev)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Can not write mirrored data block " + "to local disk.\n"); + spin_lock(&mdev->peer_seq_lock); + if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num)) + mdev->peer_seq++; + spin_unlock(&mdev->peer_seq_lock); + + drbd_send_ack_dp(mdev, P_NEG_ACK, p); + atomic_inc(&mdev->current_epoch->epoch_size); + return drbd_drain_block(mdev, data_size); + } + + /* get_ldev(mdev) successful. + * Corresponding put_ldev done either below (on various errors), + * or in drbd_endio_write_sec, if we successfully submit the data at + * the end of this function. */ + + sector = be64_to_cpu(p->sector); + e = read_in_block(mdev, p->block_id, sector, data_size); + if (!e) { + put_ldev(mdev); + return FALSE; + } + + e->private_bio->bi_end_io = drbd_endio_write_sec; + e->w.cb = e_end_block; + + spin_lock(&mdev->epoch_lock); + e->epoch = mdev->current_epoch; + atomic_inc(&e->epoch->epoch_size); + atomic_inc(&e->epoch->active); + + if (mdev->write_ordering == WO_bio_barrier && atomic_read(&e->epoch->epoch_size) == 1) { + struct drbd_epoch *epoch; + /* Issue a barrier if we start a new epoch, and the previous epoch + was not a epoch containing a single request which already was + a Barrier. */ + epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); + if (epoch == e->epoch) { + set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); + trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); + rw |= (1<flags |= EE_IS_BARRIER; + } else { + if (atomic_read(&epoch->epoch_size) > 1 || + !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { + set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); + trace_drbd_epoch(mdev, epoch, EV_TRACE_SETTING_BI); + set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); + trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); + rw |= (1<flags |= EE_IS_BARRIER; + } + } + } + spin_unlock(&mdev->epoch_lock); + + dp_flags = be32_to_cpu(p->dp_flags); + if (dp_flags & DP_HARDBARRIER) { + dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n"); + /* rw |= (1<flags |= EE_MAY_SET_IN_SYNC; + + /* I'm the receiver, I do hold a net_cnt reference. */ + if (!mdev->net_conf->two_primaries) { + spin_lock_irq(&mdev->req_lock); + } else { + /* don't get the req_lock yet, + * we may sleep in drbd_wait_peer_seq */ + const int size = e->size; + const int discard = test_bit(DISCARD_CONCURRENT, &mdev->flags); + DEFINE_WAIT(wait); + struct drbd_request *i; + struct hlist_node *n; + struct hlist_head *slot; + int first; + + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + BUG_ON(mdev->ee_hash == NULL); + BUG_ON(mdev->tl_hash == NULL); + + /* conflict detection and handling: + * 1. wait on the sequence number, + * in case this data packet overtook ACK packets. + * 2. check our hash tables for conflicting requests. + * we only need to walk the tl_hash, since an ee can not + * have a conflict with an other ee: on the submitting + * node, the corresponding req had already been conflicting, + * and a conflicting req is never sent. + * + * Note: for two_primaries, we are protocol C, + * so there cannot be any request that is DONE + * but still on the transfer log. + * + * unconditionally add to the ee_hash. + * + * if no conflicting request is found: + * submit. + * + * if any conflicting request is found + * that has not yet been acked, + * AND I have the "discard concurrent writes" flag: + * queue (via done_ee) the P_DISCARD_ACK; OUT. + * + * if any conflicting request is found: + * block the receiver, waiting on misc_wait + * until no more conflicting requests are there, + * or we get interrupted (disconnect). + * + * we do not just write after local io completion of those + * requests, but only after req is done completely, i.e. + * we wait for the P_DISCARD_ACK to arrive! + * + * then proceed normally, i.e. submit. + */ + if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num))) + goto out_interrupted; + + spin_lock_irq(&mdev->req_lock); + + hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev, sector); + first = 1; + for (;;) { + int have_unacked = 0; + int have_conflict = 0; + prepare_to_wait(&mdev->misc_wait, &wait, + TASK_INTERRUPTIBLE); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + /* only ALERT on first iteration, + * we may be woken up early... */ + if (first) + dev_alert(DEV, "%s[%u] Concurrent local write detected!" + " new: %llus +%u; pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + if (i->rq_state & RQ_NET_PENDING) + ++have_unacked; + ++have_conflict; + } + } +#undef OVERLAPS + if (!have_conflict) + break; + + /* Discard Ack only for the _first_ iteration */ + if (first && discard && have_unacked) { + dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n", + (unsigned long long)sector); + inc_unacked(mdev); + e->w.cb = e_send_discard_ack; + list_add_tail(&e->w.list, &mdev->done_ee); + + spin_unlock_irq(&mdev->req_lock); + + /* we could probably send that P_DISCARD_ACK ourselves, + * but I don't like the receiver using the msock */ + + put_ldev(mdev); + wake_asender(mdev); + finish_wait(&mdev->misc_wait, &wait); + return TRUE; + } + + if (signal_pending(current)) { + hlist_del_init(&e->colision); + + spin_unlock_irq(&mdev->req_lock); + + finish_wait(&mdev->misc_wait, &wait); + goto out_interrupted; + } + + spin_unlock_irq(&mdev->req_lock); + if (first) { + first = 0; + dev_alert(DEV, "Concurrent write! [W AFTERWARDS] " + "sec=%llus\n", (unsigned long long)sector); + } else if (discard) { + /* we had none on the first iteration. + * there must be none now. */ + D_ASSERT(have_unacked == 0); + } + schedule(); + spin_lock_irq(&mdev->req_lock); + } + finish_wait(&mdev->misc_wait, &wait); + } + + list_add(&e->w.list, &mdev->active_ee); + spin_unlock_irq(&mdev->req_lock); + + switch (mdev->net_conf->wire_protocol) { + case DRBD_PROT_C: + inc_unacked(mdev); + /* corresponding dec_unacked() in e_end_block() + * respective _drbd_clear_done_ee */ + break; + case DRBD_PROT_B: + /* I really don't like it that the receiver thread + * sends on the msock, but anyways */ + drbd_send_ack(mdev, P_RECV_ACK, e); + break; + case DRBD_PROT_A: + /* nothing to do */ + break; + } + + if (mdev->state.pdsk == D_DISKLESS) { + /* In case we have the only disk of the cluster, */ + drbd_set_out_of_sync(mdev, e->sector, e->size); + e->flags |= EE_CALL_AL_COMPLETE_IO; + drbd_al_begin_io(mdev, e->sector); + } + + e->private_bio->bi_rw = rw; + trace_drbd_ee(mdev, e, "submitting for (data)write"); + trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); + drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); + /* accounting done in endio */ + + maybe_kick_lo(mdev); + return TRUE; + +out_interrupted: + /* yes, the epoch_size now is imbalanced. + * but we drop the connection anyways, so we don't have a chance to + * receive a barrier... atomic_inc(&mdev->epoch_size); */ + put_ldev(mdev); + drbd_free_ee(mdev, e); + return FALSE; +} + +static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + struct drbd_epoch_entry *e; + struct digest_info *di = NULL; + int size, digest_size; + unsigned int fault_type; + struct p_block_req *p = + (struct p_block_req *)h; + const int brps = sizeof(*p)-sizeof(*h); + + if (drbd_recv(mdev, h->payload, brps) != brps) + return FALSE; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { + dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return FALSE; + } + if (sector + (size>>9) > capacity) { + dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__, + (unsigned long long)sector, size); + return FALSE; + } + + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Can not satisfy peer's read request, " + "no local data.\n"); + drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY : + P_NEG_RS_DREPLY , p); + return TRUE; + } + + /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD + * "criss-cross" setup, that might cause write-out on some other DRBD, + * which in turn might block on the other node at this very place. */ + e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO); + if (!e) { + put_ldev(mdev); + return FALSE; + } + + e->private_bio->bi_rw = READ; + e->private_bio->bi_end_io = drbd_endio_read_sec; + + switch (h->command) { + case P_DATA_REQUEST: + e->w.cb = w_e_end_data_req; + fault_type = DRBD_FAULT_DT_RD; + break; + case P_RS_DATA_REQUEST: + e->w.cb = w_e_end_rsdata_req; + fault_type = DRBD_FAULT_RS_RD; + /* Eventually this should become asynchronously. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... + */ + if (!drbd_rs_begin_io(mdev, sector)) { + /* we have been interrupted, + * probably connection lost! */ + D_ASSERT(signal_pending(current)); + goto out_free_e; + } + break; + + case P_OV_REPLY: + case P_CSUM_RS_REQUEST: + fault_type = DRBD_FAULT_RS_RD; + digest_size = h->length - brps ; + di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO); + if (!di) + goto out_free_e; + + di->digest_size = digest_size; + di->digest = (((char *)di)+sizeof(struct digest_info)); + + if (drbd_recv(mdev, di->digest, digest_size) != digest_size) + goto out_free_e; + + e->block_id = (u64)(unsigned long)di; + if (h->command == P_CSUM_RS_REQUEST) { + D_ASSERT(mdev->agreed_pro_version >= 89); + e->w.cb = w_e_end_csum_rs_req; + } else if (h->command == P_OV_REPLY) { + e->w.cb = w_e_end_ov_reply; + dec_rs_pending(mdev); + break; + } + + if (!drbd_rs_begin_io(mdev, sector)) { + /* we have been interrupted, probably connection lost! */ + D_ASSERT(signal_pending(current)); + goto out_free_e; + } + break; + + case P_OV_REQUEST: + if (mdev->state.conn >= C_CONNECTED && + mdev->state.conn != C_VERIFY_T) + dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n", + drbd_conn_str(mdev->state.conn)); + if (mdev->ov_start_sector == ~(sector_t)0 && + mdev->agreed_pro_version >= 90) { + mdev->ov_start_sector = sector; + mdev->ov_position = sector; + mdev->ov_left = mdev->rs_total - BM_SECT_TO_BIT(sector); + dev_info(DEV, "Online Verify start sector: %llu\n", + (unsigned long long)sector); + } + e->w.cb = w_e_end_ov_req; + fault_type = DRBD_FAULT_RS_RD; + /* Eventually this should become asynchronous. Currently it + * blocks the whole receiver just to delay the reading of a + * resync data block. + * the drbd_work_queue mechanism is made for this... + */ + if (!drbd_rs_begin_io(mdev, sector)) { + /* we have been interrupted, + * probably connection lost! */ + D_ASSERT(signal_pending(current)); + goto out_free_e; + } + break; + + + default: + dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n", + cmdname(h->command)); + fault_type = DRBD_FAULT_MAX; + } + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + inc_unacked(mdev); + + trace_drbd_ee(mdev, e, "submitting for read"); + trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); + drbd_generic_make_request(mdev, fault_type, e->private_bio); + maybe_kick_lo(mdev); + + return TRUE; + +out_free_e: + kfree(di); + put_ldev(mdev); + drbd_free_ee(mdev, e); + return FALSE; +} + +static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) +{ + int self, peer, rv = -100; + unsigned long ch_self, ch_peer; + + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; + peer = mdev->p_uuid[UI_BITMAP] & 1; + + ch_peer = mdev->p_uuid[UI_SIZE]; + ch_self = mdev->comm_bm_set; + + switch (mdev->net_conf->after_sb_0p) { + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + case ASB_CALL_HELPER: + dev_err(DEV, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_DISCARD_YOUNGER_PRI: + if (self == 0 && peer == 1) { + rv = -1; + break; + } + if (self == 1 && peer == 0) { + rv = 1; + break; + } + /* Else fall through to one of the other strategies... */ + case ASB_DISCARD_OLDER_PRI: + if (self == 0 && peer == 1) { + rv = 1; + break; + } + if (self == 1 && peer == 0) { + rv = -1; + break; + } + /* Else fall through to one of the other strategies... */ + dev_warn(DEV, "Discard younger/older primary did not found a decision\n" + "Using discard-least-changes instead\n"); + case ASB_DISCARD_ZERO_CHG: + if (ch_peer == 0 && ch_self == 0) { + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) + ? -1 : 1; + break; + } else { + if (ch_peer == 0) { rv = 1; break; } + if (ch_self == 0) { rv = -1; break; } + } + if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG) + break; + case ASB_DISCARD_LEAST_CHG: + if (ch_self < ch_peer) + rv = -1; + else if (ch_self > ch_peer) + rv = 1; + else /* ( ch_self == ch_peer ) */ + /* Well, then use something else. */ + rv = test_bit(DISCARD_CONCURRENT, &mdev->flags) + ? -1 : 1; + break; + case ASB_DISCARD_LOCAL: + rv = -1; + break; + case ASB_DISCARD_REMOTE: + rv = 1; + } + + return rv; +} + +static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local) +{ + int self, peer, hg, rv = -100; + + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; + peer = mdev->p_uuid[UI_BITMAP] & 1; + + switch (mdev->net_conf->after_sb_1p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + dev_err(DEV, "Configuration error.\n"); + break; + case ASB_DISCONNECT: + break; + case ASB_CONSENSUS: + hg = drbd_asb_recover_0p(mdev); + if (hg == -1 && mdev->state.role == R_SECONDARY) + rv = hg; + if (hg == 1 && mdev->state.role == R_PRIMARY) + rv = hg; + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(mdev); + break; + case ASB_DISCARD_SECONDARY: + return mdev->state.role == R_PRIMARY ? 1 : -1; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(mdev); + if (hg == -1 && mdev->state.role == R_PRIMARY) { + self = drbd_set_role(mdev, R_SECONDARY, 0); + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); + if (self != SS_SUCCESS) { + drbd_khelper(mdev, "pri-lost-after-sb"); + } else { + dev_warn(DEV, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local) +{ + int self, peer, hg, rv = -100; + + self = mdev->ldev->md.uuid[UI_BITMAP] & 1; + peer = mdev->p_uuid[UI_BITMAP] & 1; + + switch (mdev->net_conf->after_sb_2p) { + case ASB_DISCARD_YOUNGER_PRI: + case ASB_DISCARD_OLDER_PRI: + case ASB_DISCARD_LEAST_CHG: + case ASB_DISCARD_LOCAL: + case ASB_DISCARD_REMOTE: + case ASB_CONSENSUS: + case ASB_DISCARD_SECONDARY: + dev_err(DEV, "Configuration error.\n"); + break; + case ASB_VIOLENTLY: + rv = drbd_asb_recover_0p(mdev); + break; + case ASB_DISCONNECT: + break; + case ASB_CALL_HELPER: + hg = drbd_asb_recover_0p(mdev); + if (hg == -1) { + /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE, + * we might be here in C_WF_REPORT_PARAMS which is transient. + * we do not need to wait for the after state change work either. */ + self = drbd_change_state(mdev, CS_VERBOSE, NS(role, R_SECONDARY)); + if (self != SS_SUCCESS) { + drbd_khelper(mdev, "pri-lost-after-sb"); + } else { + dev_warn(DEV, "Successfully gave up primary role.\n"); + rv = hg; + } + } else + rv = hg; + } + + return rv; +} + +static void drbd_uuid_dump(struct drbd_conf *mdev, char *text, u64 *uuid, + u64 bits, u64 flags) +{ + if (!uuid) { + dev_info(DEV, "%s uuid info vanished while I was looking!\n", text); + return; + } + dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n", + text, + (unsigned long long)uuid[UI_CURRENT], + (unsigned long long)uuid[UI_BITMAP], + (unsigned long long)uuid[UI_HISTORY_START], + (unsigned long long)uuid[UI_HISTORY_END], + (unsigned long long)bits, + (unsigned long long)flags); +} + +/* + 100 after split brain try auto recover + 2 C_SYNC_SOURCE set BitMap + 1 C_SYNC_SOURCE use BitMap + 0 no Sync + -1 C_SYNC_TARGET use BitMap + -2 C_SYNC_TARGET set BitMap + -100 after split brain, disconnect +-1000 unrelated data + */ +static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(local) +{ + u64 self, peer; + int i, j; + + self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); + + *rule_nr = 10; + if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED) + return 0; + + *rule_nr = 20; + if ((self == UUID_JUST_CREATED || self == (u64)0) && + peer != UUID_JUST_CREATED) + return -2; + + *rule_nr = 30; + if (self != UUID_JUST_CREATED && + (peer == UUID_JUST_CREATED || peer == (u64)0)) + return 2; + + if (self == peer) { + int rct, dc; /* roles at crash time */ + + if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) { + + if (mdev->agreed_pro_version < 91) + return -1001; + + if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) && + (mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) { + dev_info(DEV, "was SyncSource, missed the resync finished event, corrected myself:\n"); + drbd_uuid_set_bm(mdev, 0UL); + + drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, + mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); + *rule_nr = 34; + } else { + dev_info(DEV, "was SyncSource (peer failed to write sync_uuid)\n"); + *rule_nr = 36; + } + + return 1; + } + + if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) { + + if (mdev->agreed_pro_version < 91) + return -1001; + + if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) && + (mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1))) { + dev_info(DEV, "was SyncTarget, peer missed the resync finished event, corrected peer:\n"); + + mdev->p_uuid[UI_HISTORY_START + 1] = mdev->p_uuid[UI_HISTORY_START]; + mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_BITMAP]; + mdev->p_uuid[UI_BITMAP] = 0UL; + + drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); + *rule_nr = 35; + } else { + dev_info(DEV, "was SyncTarget (failed to write sync_uuid)\n"); + *rule_nr = 37; + } + + return -1; + } + + /* Common power [off|failure] */ + rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) + + (mdev->p_uuid[UI_FLAGS] & 2); + /* lowest bit is set when we were primary, + * next bit (weight 2) is set when peer was primary */ + *rule_nr = 40; + + switch (rct) { + case 0: /* !self_pri && !peer_pri */ return 0; + case 1: /* self_pri && !peer_pri */ return 1; + case 2: /* !self_pri && peer_pri */ return -1; + case 3: /* self_pri && peer_pri */ + dc = test_bit(DISCARD_CONCURRENT, &mdev->flags); + return dc ? -1 : 1; + } + } + + *rule_nr = 50; + peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer) + return -1; + + *rule_nr = 51; + peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); + peer = mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1); + if (self == peer) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of the peer's UUIDs. */ + + if (mdev->agreed_pro_version < 91) + return -1001; + + mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; + mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; + return -1; + } + } + + *rule_nr = 60; + self = mdev->ldev->md.uuid[UI_CURRENT] & ~((u64)1); + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + peer = mdev->p_uuid[i] & ~((u64)1); + if (self == peer) + return -2; + } + + *rule_nr = 70; + self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); + if (self == peer) + return 1; + + *rule_nr = 71; + self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + self = mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1); + peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1); + if (self == peer) { + /* The last P_SYNC_UUID did not get though. Undo the last start of + resync as sync source modifications of our UUIDs. */ + + if (mdev->agreed_pro_version < 91) + return -1001; + + _drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]); + _drbd_uuid_set(mdev, UI_HISTORY_START, mdev->ldev->md.uuid[UI_HISTORY_START + 1]); + + dev_info(DEV, "Undid last start of resync:\n"); + + drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, + mdev->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(mdev) : 0, 0); + + return 1; + } + } + + + *rule_nr = 80; + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = mdev->ldev->md.uuid[i] & ~((u64)1); + if (self == peer) + return 2; + } + + *rule_nr = 90; + self = mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1); + peer = mdev->p_uuid[UI_BITMAP] & ~((u64)1); + if (self == peer && self != ((u64)0)) + return 100; + + *rule_nr = 100; + for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { + self = mdev->ldev->md.uuid[i] & ~((u64)1); + for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) { + peer = mdev->p_uuid[j] & ~((u64)1); + if (self == peer) + return -100; + } + } + + return -1000; +} + +/* drbd_sync_handshake() returns the new conn state on success, or + CONN_MASK (-1) on failure. + */ +static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role, + enum drbd_disk_state peer_disk) __must_hold(local) +{ + int hg, rule_nr; + enum drbd_conns rv = C_MASK; + enum drbd_disk_state mydisk; + + mydisk = mdev->state.disk; + if (mydisk == D_NEGOTIATING) + mydisk = mdev->new_state_tmp.disk; + + dev_info(DEV, "drbd_sync_handshake:\n"); + drbd_uuid_dump(mdev, "self", mdev->ldev->md.uuid, mdev->comm_bm_set, 0); + drbd_uuid_dump(mdev, "peer", mdev->p_uuid, + mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); + + hg = drbd_uuid_compare(mdev, &rule_nr); + + dev_info(DEV, "uuid_compare()=%d by rule %d\n", hg, rule_nr); + + if (hg == -1000) { + dev_alert(DEV, "Unrelated data, aborting!\n"); + return C_MASK; + } + if (hg == -1001) { + dev_alert(DEV, "To resolve this both sides have to support at least protocol\n"); + return C_MASK; + } + + if ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) || + (peer_disk == D_INCONSISTENT && mydisk > D_INCONSISTENT)) { + int f = (hg == -100) || abs(hg) == 2; + hg = mydisk > D_INCONSISTENT ? 1 : -1; + if (f) + hg = hg*2; + dev_info(DEV, "Becoming sync %s due to disk states.\n", + hg > 0 ? "source" : "target"); + } + + if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) { + int pcount = (mdev->state.role == R_PRIMARY) + + (peer_role == R_PRIMARY); + int forced = (hg == -100); + + switch (pcount) { + case 0: + hg = drbd_asb_recover_0p(mdev); + break; + case 1: + hg = drbd_asb_recover_1p(mdev); + break; + case 2: + hg = drbd_asb_recover_2p(mdev); + break; + } + if (abs(hg) < 100) { + dev_warn(DEV, "Split-Brain detected, %d primaries, " + "automatically solved. Sync from %s node\n", + pcount, (hg < 0) ? "peer" : "this"); + if (forced) { + dev_warn(DEV, "Doing a full sync, since" + " UUIDs where ambiguous.\n"); + hg = hg*2; + } + } + } + + if (hg == -100) { + if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1)) + hg = -1; + if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1)) + hg = 1; + + if (abs(hg) < 100) + dev_warn(DEV, "Split-Brain detected, manually solved. " + "Sync from %s node\n", + (hg < 0) ? "peer" : "this"); + } + + if (hg == -100) { + dev_alert(DEV, "Split-Brain detected, dropping connection!\n"); + drbd_khelper(mdev, "split-brain"); + return C_MASK; + } + + if (hg > 0 && mydisk <= D_INCONSISTENT) { + dev_err(DEV, "I shall become SyncSource, but I am inconsistent!\n"); + return C_MASK; + } + + if (hg < 0 && /* by intention we do not use mydisk here. */ + mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) { + switch (mdev->net_conf->rr_conflict) { + case ASB_CALL_HELPER: + drbd_khelper(mdev, "pri-lost"); + /* fall through */ + case ASB_DISCONNECT: + dev_err(DEV, "I shall become SyncTarget, but I am primary!\n"); + return C_MASK; + case ASB_VIOLENTLY: + dev_warn(DEV, "Becoming SyncTarget, violating the stable-data" + "assumption\n"); + } + } + + if (abs(hg) >= 2) { + dev_info(DEV, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write, "set_n_write from sync_handshake")) + return C_MASK; + } + + if (hg > 0) { /* become sync source. */ + rv = C_WF_BITMAP_S; + } else if (hg < 0) { /* become sync target */ + rv = C_WF_BITMAP_T; + } else { + rv = C_CONNECTED; + if (drbd_bm_total_weight(mdev)) { + dev_info(DEV, "No resync, but %lu bits in bitmap!\n", + drbd_bm_total_weight(mdev)); + } + } + + return rv; +} + +/* returns 1 if invalid */ +static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self) +{ + /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */ + if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) || + (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL)) + return 0; + + /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */ + if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL || + self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL) + return 1; + + /* everything else is valid if they are equal on both sides. */ + if (peer == self) + return 0; + + /* everything es is invalid. */ + return 1; +} + +static int receive_protocol(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_protocol *p = (struct p_protocol *)h; + int header_size, data_size; + int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p; + int p_want_lose, p_two_primaries; + char p_integrity_alg[SHARED_SECRET_MAX] = ""; + + header_size = sizeof(*p) - sizeof(*h); + data_size = h->length - header_size; + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + p_proto = be32_to_cpu(p->protocol); + p_after_sb_0p = be32_to_cpu(p->after_sb_0p); + p_after_sb_1p = be32_to_cpu(p->after_sb_1p); + p_after_sb_2p = be32_to_cpu(p->after_sb_2p); + p_want_lose = be32_to_cpu(p->want_lose); + p_two_primaries = be32_to_cpu(p->two_primaries); + + if (p_proto != mdev->net_conf->wire_protocol) { + dev_err(DEV, "incompatible communication protocols\n"); + goto disconnect; + } + + if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) { + dev_err(DEV, "incompatible after-sb-0pri settings\n"); + goto disconnect; + } + + if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) { + dev_err(DEV, "incompatible after-sb-1pri settings\n"); + goto disconnect; + } + + if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) { + dev_err(DEV, "incompatible after-sb-2pri settings\n"); + goto disconnect; + } + + if (p_want_lose && mdev->net_conf->want_lose) { + dev_err(DEV, "both sides have the 'want_lose' flag set\n"); + goto disconnect; + } + + if (p_two_primaries != mdev->net_conf->two_primaries) { + dev_err(DEV, "incompatible setting of the two-primaries options\n"); + goto disconnect; + } + + if (mdev->agreed_pro_version >= 87) { + unsigned char *my_alg = mdev->net_conf->integrity_alg; + + if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size) + return FALSE; + + p_integrity_alg[SHARED_SECRET_MAX-1] = 0; + if (strcmp(p_integrity_alg, my_alg)) { + dev_err(DEV, "incompatible setting of the data-integrity-alg\n"); + goto disconnect; + } + dev_info(DEV, "data-integrity-alg: %s\n", + my_alg[0] ? my_alg : (unsigned char *)""); + } + + return TRUE; + +disconnect: + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; +} + +/* helper function + * input: alg name, feature name + * return: NULL (alg name was "") + * ERR_PTR(error) if something goes wrong + * or the crypto hash ptr, if it worked out ok. */ +struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev, + const char *alg, const char *name) +{ + struct crypto_hash *tfm; + + if (!alg[0]) + return NULL; + + tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) { + dev_err(DEV, "Can not allocate \"%s\" as %s (reason: %ld)\n", + alg, name, PTR_ERR(tfm)); + return tfm; + } + if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) { + crypto_free_hash(tfm); + dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name); + return ERR_PTR(-EINVAL); + } + return tfm; +} + +static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h) +{ + int ok = TRUE; + struct p_rs_param_89 *p = (struct p_rs_param_89 *)h; + unsigned int header_size, data_size, exp_max_sz; + struct crypto_hash *verify_tfm = NULL; + struct crypto_hash *csums_tfm = NULL; + const int apv = mdev->agreed_pro_version; + + exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param) + : apv == 88 ? sizeof(struct p_rs_param) + + SHARED_SECRET_MAX + : /* 89 */ sizeof(struct p_rs_param_89); + + if (h->length > exp_max_sz) { + dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n", + h->length, exp_max_sz); + return FALSE; + } + + if (apv <= 88) { + header_size = sizeof(struct p_rs_param) - sizeof(*h); + data_size = h->length - header_size; + } else /* apv >= 89 */ { + header_size = sizeof(struct p_rs_param_89) - sizeof(*h); + data_size = h->length - header_size; + D_ASSERT(data_size == 0); + } + + /* initialize verify_alg and csums_alg */ + memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX); + + if (drbd_recv(mdev, h->payload, header_size) != header_size) + return FALSE; + + mdev->sync_conf.rate = be32_to_cpu(p->rate); + + if (apv >= 88) { + if (apv == 88) { + if (data_size > SHARED_SECRET_MAX) { + dev_err(DEV, "verify-alg too long, " + "peer wants %u, accepting only %u byte\n", + data_size, SHARED_SECRET_MAX); + return FALSE; + } + + if (drbd_recv(mdev, p->verify_alg, data_size) != data_size) + return FALSE; + + /* we expect NUL terminated string */ + /* but just in case someone tries to be evil */ + D_ASSERT(p->verify_alg[data_size-1] == 0); + p->verify_alg[data_size-1] = 0; + + } else /* apv >= 89 */ { + /* we still expect NUL terminated strings */ + /* but just in case someone tries to be evil */ + D_ASSERT(p->verify_alg[SHARED_SECRET_MAX-1] == 0); + D_ASSERT(p->csums_alg[SHARED_SECRET_MAX-1] == 0); + p->verify_alg[SHARED_SECRET_MAX-1] = 0; + p->csums_alg[SHARED_SECRET_MAX-1] = 0; + } + + if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) { + if (mdev->state.conn == C_WF_REPORT_PARAMS) { + dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n", + mdev->sync_conf.verify_alg, p->verify_alg); + goto disconnect; + } + verify_tfm = drbd_crypto_alloc_digest_safe(mdev, + p->verify_alg, "verify-alg"); + if (IS_ERR(verify_tfm)) { + verify_tfm = NULL; + goto disconnect; + } + } + + if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) { + if (mdev->state.conn == C_WF_REPORT_PARAMS) { + dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n", + mdev->sync_conf.csums_alg, p->csums_alg); + goto disconnect; + } + csums_tfm = drbd_crypto_alloc_digest_safe(mdev, + p->csums_alg, "csums-alg"); + if (IS_ERR(csums_tfm)) { + csums_tfm = NULL; + goto disconnect; + } + } + + + spin_lock(&mdev->peer_seq_lock); + /* lock against drbd_nl_syncer_conf() */ + if (verify_tfm) { + strcpy(mdev->sync_conf.verify_alg, p->verify_alg); + mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1; + crypto_free_hash(mdev->verify_tfm); + mdev->verify_tfm = verify_tfm; + dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg); + } + if (csums_tfm) { + strcpy(mdev->sync_conf.csums_alg, p->csums_alg); + mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1; + crypto_free_hash(mdev->csums_tfm); + mdev->csums_tfm = csums_tfm; + dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg); + } + spin_unlock(&mdev->peer_seq_lock); + } + + return ok; +disconnect: + /* just for completeness: actually not needed, + * as this is not reached if csums_tfm was ok. */ + crypto_free_hash(csums_tfm); + /* but free the verify_tfm again, if csums_tfm did not work out */ + crypto_free_hash(verify_tfm); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; +} + +static void drbd_setup_order_type(struct drbd_conf *mdev, int peer) +{ + /* sorry, we currently have no working implementation + * of distributed TCQ */ +} + +/* warn if the arguments differ by more than 12.5% */ +static void warn_if_differ_considerably(struct drbd_conf *mdev, + const char *s, sector_t a, sector_t b) +{ + sector_t d; + if (a == 0 || b == 0) + return; + d = (a > b) ? (a - b) : (b - a); + if (d > (a>>3) || d > (b>>3)) + dev_warn(DEV, "Considerable difference in %s: %llus vs. %llus\n", s, + (unsigned long long)a, (unsigned long long)b); +} + +static int receive_sizes(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_sizes *p = (struct p_sizes *)h; + enum determine_dev_size dd = unchanged; + unsigned int max_seg_s; + sector_t p_size, p_usize, my_usize; + int ldsc = 0; /* local disk size changed */ + enum drbd_conns nconn; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_size = be64_to_cpu(p->d_size); + p_usize = be64_to_cpu(p->u_size); + + if (p_size == 0 && mdev->state.disk == D_DISKLESS) { + dev_err(DEV, "some backing storage is needed\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + /* just store the peer's disk size for now. + * we still need to figure out whether we accept that. */ + mdev->p_size = p_size; + +#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) + if (get_ldev(mdev)) { + warn_if_differ_considerably(mdev, "lower level device sizes", + p_size, drbd_get_max_capacity(mdev->ldev)); + warn_if_differ_considerably(mdev, "user requested size", + p_usize, mdev->ldev->dc.disk_size); + + /* if this is the first connect, or an otherwise expected + * param exchange, choose the minimum */ + if (mdev->state.conn == C_WF_REPORT_PARAMS) + p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size, + p_usize); + + my_usize = mdev->ldev->dc.disk_size; + + if (mdev->ldev->dc.disk_size != p_usize) { + mdev->ldev->dc.disk_size = p_usize; + dev_info(DEV, "Peer sets u_size to %lu sectors\n", + (unsigned long)mdev->ldev->dc.disk_size); + } + + /* Never shrink a device with usable data during connect. + But allow online shrinking if we are connected. */ + if (drbd_new_dev_size(mdev, mdev->ldev) < + drbd_get_capacity(mdev->this_bdev) && + mdev->state.disk >= D_OUTDATED && + mdev->state.conn < C_CONNECTED) { + dev_err(DEV, "The peer's disk size is too small!\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + mdev->ldev->dc.disk_size = my_usize; + put_ldev(mdev); + return FALSE; + } + put_ldev(mdev); + } +#undef min_not_zero + + if (get_ldev(mdev)) { + dd = drbd_determin_dev_size(mdev); + put_ldev(mdev); + if (dd == dev_size_error) + return FALSE; + drbd_md_sync(mdev); + } else { + /* I am diskless, need to accept the peer's size. */ + drbd_set_my_capacity(mdev, p_size); + } + + if (mdev->p_uuid && mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) { + nconn = drbd_sync_handshake(mdev, + mdev->state.peer, mdev->state.pdsk); + put_ldev(mdev); + + if (nconn == C_MASK) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + if (drbd_request_state(mdev, NS(conn, nconn)) < SS_SUCCESS) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + } + + if (get_ldev(mdev)) { + if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { + mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); + ldsc = 1; + } + + max_seg_s = be32_to_cpu(p->max_segment_size); + if (max_seg_s != queue_max_segment_size(mdev->rq_queue)) + drbd_setup_queue_param(mdev, max_seg_s); + + drbd_setup_order_type(mdev, be32_to_cpu(p->queue_order_type)); + put_ldev(mdev); + } + + if (mdev->state.conn > C_WF_REPORT_PARAMS) { + if (be64_to_cpu(p->c_size) != + drbd_get_capacity(mdev->this_bdev) || ldsc) { + /* we have different sizes, probably peer + * needs to know my new size... */ + drbd_send_sizes(mdev, 0); + } + if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) || + (dd == grew && mdev->state.conn == C_CONNECTED)) { + if (mdev->state.pdsk >= D_INCONSISTENT && + mdev->state.disk >= D_INCONSISTENT) + resync_after_online_grow(mdev); + else + set_bit(RESYNC_AFTER_NEG, &mdev->flags); + } + } + + return TRUE; +} + +static int receive_uuids(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_uuids *p = (struct p_uuids *)h; + u64 *p_uuid; + int i; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO); + + for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++) + p_uuid[i] = be64_to_cpu(p->uuid[i]); + + kfree(mdev->p_uuid); + mdev->p_uuid = p_uuid; + + if (mdev->state.conn < C_CONNECTED && + mdev->state.disk < D_INCONSISTENT && + mdev->state.role == R_PRIMARY && + (mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) { + dev_err(DEV, "Can only connect to data with current UUID=%016llX\n", + (unsigned long long)mdev->ed_uuid); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + if (get_ldev(mdev)) { + int skip_initial_sync = + mdev->state.conn == C_CONNECTED && + mdev->agreed_pro_version >= 90 && + mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && + (p_uuid[UI_FLAGS] & 8); + if (skip_initial_sync) { + dev_info(DEV, "Accepted new current UUID, preparing to skip initial sync\n"); + drbd_bitmap_io(mdev, &drbd_bmio_clear_n_write, + "clear_n_write from receive_uuids"); + _drbd_uuid_set(mdev, UI_CURRENT, p_uuid[UI_CURRENT]); + _drbd_uuid_set(mdev, UI_BITMAP, 0); + _drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE), + CS_VERBOSE, NULL); + drbd_md_sync(mdev); + } + put_ldev(mdev); + } + + /* Before we test for the disk state, we should wait until an eventually + ongoing cluster wide state change is finished. That is important if + we are primary and are detaching from our disk. We need to see the + new disk state... */ + wait_event(mdev->misc_wait, !test_bit(CLUSTER_ST_CHANGE, &mdev->flags)); + if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT) + drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]); + + return TRUE; +} + +/** + * convert_state() - Converts the peer's view of the cluster state to our point of view + * @ps: The state as seen by the peer. + */ +static union drbd_state convert_state(union drbd_state ps) +{ + union drbd_state ms; + + static enum drbd_conns c_tab[] = { + [C_CONNECTED] = C_CONNECTED, + + [C_STARTING_SYNC_S] = C_STARTING_SYNC_T, + [C_STARTING_SYNC_T] = C_STARTING_SYNC_S, + [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */ + [C_VERIFY_S] = C_VERIFY_T, + [C_MASK] = C_MASK, + }; + + ms.i = ps.i; + + ms.conn = c_tab[ps.conn]; + ms.peer = ps.role; + ms.role = ps.peer; + ms.pdsk = ps.disk; + ms.disk = ps.pdsk; + ms.peer_isp = (ps.aftr_isp | ps.user_isp); + + return ms; +} + +static int receive_req_state(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_req_state *p = (struct p_req_state *)h; + union drbd_state mask, val; + int rv; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + mask.i = be32_to_cpu(p->mask); + val.i = be32_to_cpu(p->val); + + if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && + test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { + drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG); + return TRUE; + } + + mask = convert_state(mask); + val = convert_state(val); + + rv = drbd_change_state(mdev, CS_VERBOSE, mask, val); + + drbd_send_sr_reply(mdev, rv); + drbd_md_sync(mdev); + + return TRUE; +} + +static int receive_state(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_state *p = (struct p_state *)h; + enum drbd_conns nconn, oconn; + union drbd_state ns, peer_state; + enum drbd_disk_state real_peer_disk; + int rv; + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) + return FALSE; + + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + peer_state.i = be32_to_cpu(p->state); + + real_peer_disk = peer_state.disk; + if (peer_state.disk == D_NEGOTIATING) { + real_peer_disk = mdev->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT; + dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk)); + } + + spin_lock_irq(&mdev->req_lock); + retry: + oconn = nconn = mdev->state.conn; + spin_unlock_irq(&mdev->req_lock); + + if (nconn == C_WF_REPORT_PARAMS) + nconn = C_CONNECTED; + + if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING && + get_ldev_if_state(mdev, D_NEGOTIATING)) { + int cr; /* consider resync */ + + /* if we established a new connection */ + cr = (oconn < C_CONNECTED); + /* if we had an established connection + * and one of the nodes newly attaches a disk */ + cr |= (oconn == C_CONNECTED && + (peer_state.disk == D_NEGOTIATING || + mdev->state.disk == D_NEGOTIATING)); + /* if we have both been inconsistent, and the peer has been + * forced to be UpToDate with --overwrite-data */ + cr |= test_bit(CONSIDER_RESYNC, &mdev->flags); + /* if we had been plain connected, and the admin requested to + * start a sync by "invalidate" or "invalidate-remote" */ + cr |= (oconn == C_CONNECTED && + (peer_state.conn >= C_STARTING_SYNC_S && + peer_state.conn <= C_WF_BITMAP_T)); + + if (cr) + nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); + + put_ldev(mdev); + if (nconn == C_MASK) { + if (mdev->state.disk == D_NEGOTIATING) { + drbd_force_state(mdev, NS(disk, D_DISKLESS)); + nconn = C_CONNECTED; + } else if (peer_state.disk == D_NEGOTIATING) { + dev_err(DEV, "Disk attach process on the peer node was aborted.\n"); + peer_state.disk = D_DISKLESS; + } else { + D_ASSERT(oconn == C_WF_REPORT_PARAMS); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + } + } + + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn != oconn) + goto retry; + clear_bit(CONSIDER_RESYNC, &mdev->flags); + ns.i = mdev->state.i; + ns.conn = nconn; + ns.peer = peer_state.role; + ns.pdsk = real_peer_disk; + ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp); + if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING) + ns.disk = mdev->new_state_tmp.disk; + + rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL); + ns = mdev->state; + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) { + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return FALSE; + } + + if (oconn > C_WF_REPORT_PARAMS) { + if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED && + peer_state.disk != D_NEGOTIATING ) { + /* we want resync, peer has not yet decided to sync... */ + /* Nowadays only used when forcing a node into primary role and + setting its disk to UpToDate with that */ + drbd_send_uuids(mdev); + drbd_send_state(mdev); + } + } + + mdev->net_conf->want_lose = 0; + + drbd_md_sync(mdev); /* update connected indicator, la_size, ... */ + + return TRUE; +} + +static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_rs_uuid *p = (struct p_rs_uuid *)h; + + wait_event(mdev->misc_wait, + mdev->state.conn == C_WF_SYNC_UUID || + mdev->state.conn < C_CONNECTED || + mdev->state.disk < D_NEGOTIATING); + + /* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */ + + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; + if (drbd_recv(mdev, h->payload, h->length) != h->length) + return FALSE; + + /* Here the _drbd_uuid_ functions are right, current should + _not_ be rotated into the history */ + if (get_ldev_if_state(mdev, D_NEGOTIATING)) { + _drbd_uuid_set(mdev, UI_CURRENT, be64_to_cpu(p->uuid)); + _drbd_uuid_set(mdev, UI_BITMAP, 0UL); + + drbd_start_resync(mdev, C_SYNC_TARGET); + + put_ldev(mdev); + } else + dev_err(DEV, "Ignoring SyncUUID packet!\n"); + + return TRUE; +} + +enum receive_bitmap_ret { OK, DONE, FAILED }; + +static enum receive_bitmap_ret +receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h, + unsigned long *buffer, struct bm_xfer_ctx *c) +{ + unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset); + unsigned want = num_words * sizeof(long); + + if (want != h->length) { + dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length); + return FAILED; + } + if (want == 0) + return DONE; + if (drbd_recv(mdev, buffer, want) != want) + return FAILED; + + drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer); + + c->word_offset += num_words; + c->bit_offset = c->word_offset * BITS_PER_LONG; + if (c->bit_offset > c->bm_bits) + c->bit_offset = c->bm_bits; + + return OK; +} + +static enum receive_bitmap_ret +recv_bm_rle_bits(struct drbd_conf *mdev, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c) +{ + struct bitstream bs; + u64 look_ahead; + u64 rl; + u64 tmp; + unsigned long s = c->bit_offset; + unsigned long e; + int len = p->head.length - (sizeof(*p) - sizeof(p->head)); + int toggle = DCBP_get_start(p); + int have; + int bits; + + bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p)); + + bits = bitstream_get_bits(&bs, &look_ahead, 64); + if (bits < 0) + return FAILED; + + for (have = bits; have > 0; s += rl, toggle = !toggle) { + bits = vli_decode_bits(&rl, look_ahead); + if (bits <= 0) + return FAILED; + + if (toggle) { + e = s + rl -1; + if (e >= c->bm_bits) { + dev_err(DEV, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e); + return FAILED; + } + _drbd_bm_set_bits(mdev, s, e); + } + + if (have < bits) { + dev_err(DEV, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n", + have, bits, look_ahead, + (unsigned int)(bs.cur.b - p->code), + (unsigned int)bs.buf_len); + return FAILED; + } + look_ahead >>= bits; + have -= bits; + + bits = bitstream_get_bits(&bs, &tmp, 64 - have); + if (bits < 0) + return FAILED; + look_ahead |= tmp << have; + have += bits; + } + + c->bit_offset = s; + bm_xfer_ctx_bit_to_word_offset(c); + + return (s == c->bm_bits) ? DONE : OK; +} + +static enum receive_bitmap_ret +decode_bitmap_c(struct drbd_conf *mdev, + struct p_compressed_bm *p, + struct bm_xfer_ctx *c) +{ + if (DCBP_get_code(p) == RLE_VLI_Bits) + return recv_bm_rle_bits(mdev, p, c); + + /* other variants had been implemented for evaluation, + * but have been dropped as this one turned out to be "best" + * during all our tests. */ + + dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + return FAILED; +} + +void INFO_bm_xfer_stats(struct drbd_conf *mdev, + const char *direction, struct bm_xfer_ctx *c) +{ + /* what would it take to transfer it "plaintext" */ + unsigned plain = sizeof(struct p_header) * + ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1) + + c->bm_words * sizeof(long); + unsigned total = c->bytes[0] + c->bytes[1]; + unsigned r; + + /* total can not be zero. but just in case: */ + if (total == 0) + return; + + /* don't report if not compressed */ + if (total >= plain) + return; + + /* total < plain. check for overflow, still */ + r = (total > UINT_MAX/1000) ? (total / (plain/1000)) + : (1000 * total / plain); + + if (r > 1000) + r = 1000; + + r = 1000 - r; + dev_info(DEV, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), " + "total %u; compression: %u.%u%%\n", + direction, + c->bytes[1], c->packets[1], + c->bytes[0], c->packets[0], + total, r/10, r % 10); +} + +/* Since we are processing the bitfield from lower addresses to higher, + it does not matter if the process it in 32 bit chunks or 64 bit + chunks as long as it is little endian. (Understand it as byte stream, + beginning with the lowest byte...) If we would use big endian + we would need to process it from the highest address to the lowest, + in order to be agnostic to the 32 vs 64 bits issue. + + returns 0 on failure, 1 if we successfully received it. */ +static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h) +{ + struct bm_xfer_ctx c; + void *buffer; + enum receive_bitmap_ret ret; + int ok = FALSE; + + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); + + drbd_bm_lock(mdev, "receive bitmap"); + + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? */ + buffer = (unsigned long *) __get_free_page(GFP_NOIO); + if (!buffer) { + dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__); + goto out; + } + + c = (struct bm_xfer_ctx) { + .bm_bits = drbd_bm_bits(mdev), + .bm_words = drbd_bm_words(mdev), + }; + + do { + if (h->command == P_BITMAP) { + ret = receive_bitmap_plain(mdev, h, buffer, &c); + } else if (h->command == P_COMPRESSED_BITMAP) { + /* MAYBE: sanity check that we speak proto >= 90, + * and the feature is enabled! */ + struct p_compressed_bm *p; + + if (h->length > BM_PACKET_PAYLOAD_BYTES) { + dev_err(DEV, "ReportCBitmap packet too large\n"); + goto out; + } + /* use the page buff */ + p = buffer; + memcpy(p, h, sizeof(*h)); + if (drbd_recv(mdev, p->head.payload, h->length) != h->length) + goto out; + if (p->head.length <= (sizeof(*p) - sizeof(p->head))) { + dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length); + return FAILED; + } + ret = decode_bitmap_c(mdev, p, &c); + } else { + dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command); + goto out; + } + + c.packets[h->command == P_BITMAP]++; + c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length; + + if (ret != OK) + break; + + if (!drbd_recv_header(mdev, h)) + goto out; + } while (ret == OK); + if (ret == FAILED) + goto out; + + INFO_bm_xfer_stats(mdev, "receive", &c); + + if (mdev->state.conn == C_WF_BITMAP_T) { + ok = !drbd_send_bitmap(mdev); + if (!ok) + goto out; + /* Omit CS_ORDERED with this state transition to avoid deadlocks. */ + ok = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE); + D_ASSERT(ok == SS_SUCCESS); + } else if (mdev->state.conn != C_WF_BITMAP_S) { + /* admin may have requested C_DISCONNECTING, + * other threads may have noticed network errors */ + dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n", + drbd_conn_str(mdev->state.conn)); + } + + ok = TRUE; + out: + drbd_bm_unlock(mdev); + if (ok && mdev->state.conn == C_WF_BITMAP_S) + drbd_start_resync(mdev, C_SYNC_SOURCE); + free_page((unsigned long) buffer); + return ok; +} + +static int receive_skip(struct drbd_conf *mdev, struct p_header *h) +{ + /* TODO zero copy sink :) */ + static char sink[128]; + int size, want, r; + + dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n", + h->command, h->length); + + size = h->length; + while (size > 0) { + want = min_t(int, size, sizeof(sink)); + r = drbd_recv(mdev, sink, want); + ERR_IF(r <= 0) break; + size -= r; + } + return size == 0; +} + +static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h) +{ + if (mdev->state.disk >= D_INCONSISTENT) + drbd_kick_lo(mdev); + + /* Make sure we've acked all the TCP data associated + * with the data requests being unplugged */ + drbd_tcp_quickack(mdev->data.socket); + + return TRUE; +} + +typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *); + +static drbd_cmd_handler_f drbd_default_handler[] = { + [P_DATA] = receive_Data, + [P_DATA_REPLY] = receive_DataReply, + [P_RS_DATA_REPLY] = receive_RSDataReply, + [P_BARRIER] = receive_Barrier, + [P_BITMAP] = receive_bitmap, + [P_COMPRESSED_BITMAP] = receive_bitmap, + [P_UNPLUG_REMOTE] = receive_UnplugRemote, + [P_DATA_REQUEST] = receive_DataRequest, + [P_RS_DATA_REQUEST] = receive_DataRequest, + [P_SYNC_PARAM] = receive_SyncParam, + [P_SYNC_PARAM89] = receive_SyncParam, + [P_PROTOCOL] = receive_protocol, + [P_UUIDS] = receive_uuids, + [P_SIZES] = receive_sizes, + [P_STATE] = receive_state, + [P_STATE_CHG_REQ] = receive_req_state, + [P_SYNC_UUID] = receive_sync_uuid, + [P_OV_REQUEST] = receive_DataRequest, + [P_OV_REPLY] = receive_DataRequest, + [P_CSUM_RS_REQUEST] = receive_DataRequest, + /* anything missing from this table is in + * the asender_tbl, see get_asender_cmd */ + [P_MAX_CMD] = NULL, +}; + +static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler; +static drbd_cmd_handler_f *drbd_opt_cmd_handler; + +static void drbdd(struct drbd_conf *mdev) +{ + drbd_cmd_handler_f handler; + struct p_header *header = &mdev->data.rbuf.header; + + while (get_t_state(&mdev->receiver) == Running) { + drbd_thread_current_set_cpu(mdev); + if (!drbd_recv_header(mdev, header)) + break; + + if (header->command < P_MAX_CMD) + handler = drbd_cmd_handler[header->command]; + else if (P_MAY_IGNORE < header->command + && header->command < P_MAX_OPT_CMD) + handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE]; + else if (header->command > P_MAX_OPT_CMD) + handler = receive_skip; + else + handler = NULL; + + if (unlikely(!handler)) { + dev_err(DEV, "unknown packet type %d, l: %d!\n", + header->command, header->length); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + break; + } + if (unlikely(!handler(mdev, header))) { + dev_err(DEV, "error receiving %s, l: %d!\n", + cmdname(header->command), header->length); + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); + break; + } + + trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, + __FILE__, __LINE__); + } +} + +static void drbd_fail_pending_reads(struct drbd_conf *mdev) +{ + struct hlist_head *slot; + struct hlist_node *pos; + struct hlist_node *tmp; + struct drbd_request *req; + int i; + + /* + * Application READ requests + */ + spin_lock_irq(&mdev->req_lock); + for (i = 0; i < APP_R_HSIZE; i++) { + slot = mdev->app_reads_hash+i; + hlist_for_each_entry_safe(req, pos, tmp, slot, colision) { + /* it may (but should not any longer!) + * be on the work queue; if that assert triggers, + * we need to also grab the + * spin_lock_irq(&mdev->data.work.q_lock); + * and list_del_init here. */ + D_ASSERT(list_empty(&req->w.list)); + /* It would be nice to complete outside of spinlock. + * But this is easier for now. */ + _req_mod(req, connection_lost_while_pending); + } + } + for (i = 0; i < APP_R_HSIZE; i++) + if (!hlist_empty(mdev->app_reads_hash+i)) + dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: " + "%p, should be NULL\n", i, mdev->app_reads_hash[i].first); + + memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); + spin_unlock_irq(&mdev->req_lock); +} + +void drbd_flush_workqueue(struct drbd_conf *mdev) +{ + struct drbd_wq_barrier barr; + + barr.w.cb = w_prev_work_done; + init_completion(&barr.done); + drbd_queue_work(&mdev->data.work, &barr.w); + wait_for_completion(&barr.done); +} + +static void drbd_disconnect(struct drbd_conf *mdev) +{ + enum drbd_fencing_p fp; + union drbd_state os, ns; + int rv = SS_UNKNOWN_ERROR; + unsigned int i; + + if (mdev->state.conn == C_STANDALONE) + return; + if (mdev->state.conn >= C_WF_CONNECTION) + dev_err(DEV, "ASSERT FAILED cstate = %s, expected < WFConnection\n", + drbd_conn_str(mdev->state.conn)); + + /* asender does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&mdev->asender); + + mutex_lock(&mdev->data.mutex); + drbd_free_sock(mdev); + mutex_unlock(&mdev->data.mutex); + + spin_lock_irq(&mdev->req_lock); + _drbd_wait_ee_list_empty(mdev, &mdev->active_ee); + _drbd_wait_ee_list_empty(mdev, &mdev->sync_ee); + _drbd_wait_ee_list_empty(mdev, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + /* We do not have data structures that would allow us to + * get the rs_pending_cnt down to 0 again. + * * On C_SYNC_TARGET we do not have any data structures describing + * the pending RSDataRequest's we have sent. + * * On C_SYNC_SOURCE there is no data structure that tracks + * the P_RS_DATA_REPLY blocks that we sent to the SyncTarget. + * And no, it is not the sum of the reference counts in the + * resync_LRU. The resync_LRU tracks the whole operation including + * the disk-IO, while the rs_pending_cnt only tracks the blocks + * on the fly. */ + drbd_rs_cancel_all(mdev); + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + wake_up(&mdev->misc_wait); + + /* make sure syncer is stopped and w_resume_next_sg queued */ + del_timer_sync(&mdev->resync_timer); + set_bit(STOP_SYNC_TIMER, &mdev->flags); + resync_timer_fn((unsigned long)mdev); + + /* so we can be sure that all remote or resync reads + * made it at least to net_ee */ + wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + + /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, + * w_make_resync_request etc. which may still be on the worker queue + * to be "canceled" */ + drbd_flush_workqueue(mdev); + + /* This also does reclaim_net_ee(). If we do this too early, we might + * miss some resync ee and pages.*/ + drbd_process_done_ee(mdev); + + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; + + if (!mdev->state.susp) + tl_clear(mdev); + + drbd_fail_pending_reads(mdev); + + dev_info(DEV, "Connection closed\n"); + + drbd_md_sync(mdev); + + fp = FP_DONT_CARE; + if (get_ldev(mdev)) { + fp = mdev->ldev->dc.fencing; + put_ldev(mdev); + } + + if (mdev->state.role == R_PRIMARY) { + if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) { + enum drbd_disk_state nps = drbd_try_outdate_peer(mdev); + drbd_request_state(mdev, NS(pdsk, nps)); + } + } + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + if (os.conn >= C_UNCONNECTED) { + /* Do not restart in case we are C_DISCONNECTING */ + ns = os; + ns.conn = C_UNCONNECTED; + rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + } + spin_unlock_irq(&mdev->req_lock); + + if (os.conn == C_DISCONNECTING) { + struct hlist_head *h; + wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0); + + /* we must not free the tl_hash + * while application io is still on the fly */ + wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0); + + spin_lock_irq(&mdev->req_lock); + /* paranoia code */ + for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n", + (int)(h - mdev->ee_hash), h->first); + kfree(mdev->ee_hash); + mdev->ee_hash = NULL; + mdev->ee_hash_s = 0; + + /* paranoia code */ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) + if (h->first) + dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", + (int)(h - mdev->tl_hash), h->first); + kfree(mdev->tl_hash); + mdev->tl_hash = NULL; + mdev->tl_hash_s = 0; + spin_unlock_irq(&mdev->req_lock); + + crypto_free_hash(mdev->cram_hmac_tfm); + mdev->cram_hmac_tfm = NULL; + + kfree(mdev->net_conf); + mdev->net_conf = NULL; + drbd_request_state(mdev, NS(conn, C_STANDALONE)); + } + + /* tcp_close and release of sendpage pages can be deferred. I don't + * want to use SO_LINGER, because apparently it can be deferred for + * more than 20 seconds (longest time I checked). + * + * Actually we don't care for exactly when the network stack does its + * put_page(), but release our reference on these pages right here. + */ + i = drbd_release_ee(mdev, &mdev->net_ee); + if (i) + dev_info(DEV, "net_ee not empty, killed %u entries\n", i); + i = atomic_read(&mdev->pp_in_use); + if (i) + dev_info(DEV, "pp_in_use = %u, expected 0\n", i); + + D_ASSERT(list_empty(&mdev->read_ee)); + D_ASSERT(list_empty(&mdev->active_ee)); + D_ASSERT(list_empty(&mdev->sync_ee)); + D_ASSERT(list_empty(&mdev->done_ee)); + + /* ok, no more ee's on the fly, it is safe to reset the epoch_size */ + atomic_set(&mdev->current_epoch->epoch_size, 0); + D_ASSERT(list_empty(&mdev->current_epoch->list)); +} + +/* + * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version + * we can agree on is stored in agreed_pro_version. + * + * feature flags and the reserved array should be enough room for future + * enhancements of the handshake protocol, and possible plugins... + * + * for now, they are expected to be zero, but ignored. + */ +static int drbd_send_handshake(struct drbd_conf *mdev) +{ + /* ASSERT current == mdev->receiver ... */ + struct p_handshake *p = &mdev->data.sbuf.handshake; + int ok; + + if (mutex_lock_interruptible(&mdev->data.mutex)) { + dev_err(DEV, "interrupted during initial handshake\n"); + return 0; /* interrupted. not ok. */ + } + + if (mdev->data.socket == NULL) { + mutex_unlock(&mdev->data.mutex); + return 0; + } + + memset(p, 0, sizeof(*p)); + p->protocol_min = cpu_to_be32(PRO_VERSION_MIN); + p->protocol_max = cpu_to_be32(PRO_VERSION_MAX); + ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE, + (struct p_header *)p, sizeof(*p), 0 ); + mutex_unlock(&mdev->data.mutex); + return ok; +} + +/* + * return values: + * 1 yes, we have a valid connection + * 0 oops, did not work out, please try again + * -1 peer talks different language, + * no point in trying again, please go standalone. + */ +static int drbd_do_handshake(struct drbd_conf *mdev) +{ + /* ASSERT current == mdev->receiver ... */ + struct p_handshake *p = &mdev->data.rbuf.handshake; + const int expect = sizeof(struct p_handshake) + -sizeof(struct p_header); + int rv; + + rv = drbd_send_handshake(mdev); + if (!rv) + return 0; + + rv = drbd_recv_header(mdev, &p->head); + if (!rv) + return 0; + + if (p->head.command != P_HAND_SHAKE) { + dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n", + cmdname(p->head.command), p->head.command); + return -1; + } + + if (p->head.length != expect) { + dev_err(DEV, "expected HandShake length: %u, received: %u\n", + expect, p->head.length); + return -1; + } + + rv = drbd_recv(mdev, &p->head.payload, expect); + + if (rv != expect) { + dev_err(DEV, "short read receiving handshake packet: l=%u\n", rv); + return 0; + } + + trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, + __FILE__, __LINE__); + + p->protocol_min = be32_to_cpu(p->protocol_min); + p->protocol_max = be32_to_cpu(p->protocol_max); + if (p->protocol_max == 0) + p->protocol_max = p->protocol_min; + + if (PRO_VERSION_MAX < p->protocol_min || + PRO_VERSION_MIN > p->protocol_max) + goto incompat; + + mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max); + + dev_info(DEV, "Handshake successful: " + "Agreed network protocol version %d\n", mdev->agreed_pro_version); + + return 1; + + incompat: + dev_err(DEV, "incompatible DRBD dialects: " + "I support %d-%d, peer supports %d-%d\n", + PRO_VERSION_MIN, PRO_VERSION_MAX, + p->protocol_min, p->protocol_max); + return -1; +} + +#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE) +static int drbd_do_auth(struct drbd_conf *mdev) +{ + dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n"); + dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n"); + return 0; +} +#else +#define CHALLENGE_LEN 64 +static int drbd_do_auth(struct drbd_conf *mdev) +{ + char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */ + struct scatterlist sg; + char *response = NULL; + char *right_response = NULL; + char *peers_ch = NULL; + struct p_header p; + unsigned int key_len = strlen(mdev->net_conf->shared_secret); + unsigned int resp_size; + struct hash_desc desc; + int rv; + + desc.tfm = mdev->cram_hmac_tfm; + desc.flags = 0; + + rv = crypto_hash_setkey(mdev->cram_hmac_tfm, + (u8 *)mdev->net_conf->shared_secret, key_len); + if (rv) { + dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv); + rv = 0; + goto fail; + } + + get_random_bytes(my_challenge, CHALLENGE_LEN); + + rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN); + if (!rv) + goto fail; + + rv = drbd_recv_header(mdev, &p); + if (!rv) + goto fail; + + if (p.command != P_AUTH_CHALLENGE) { + dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n", + cmdname(p.command), p.command); + rv = 0; + goto fail; + } + + if (p.length > CHALLENGE_LEN*2) { + dev_err(DEV, "expected AuthChallenge payload too big.\n"); + rv = 0; + goto fail; + } + + peers_ch = kmalloc(p.length, GFP_NOIO); + if (peers_ch == NULL) { + dev_err(DEV, "kmalloc of peers_ch failed\n"); + rv = 0; + goto fail; + } + + rv = drbd_recv(mdev, peers_ch, p.length); + + if (rv != p.length) { + dev_err(DEV, "short read AuthChallenge: l=%u\n", rv); + rv = 0; + goto fail; + } + + resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm); + response = kmalloc(resp_size, GFP_NOIO); + if (response == NULL) { + dev_err(DEV, "kmalloc of response failed\n"); + rv = 0; + goto fail; + } + + sg_init_table(&sg, 1); + sg_set_buf(&sg, peers_ch, p.length); + + rv = crypto_hash_digest(&desc, &sg, sg.length, response); + if (rv) { + dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); + rv = 0; + goto fail; + } + + rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size); + if (!rv) + goto fail; + + rv = drbd_recv_header(mdev, &p); + if (!rv) + goto fail; + + if (p.command != P_AUTH_RESPONSE) { + dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n", + cmdname(p.command), p.command); + rv = 0; + goto fail; + } + + if (p.length != resp_size) { + dev_err(DEV, "expected AuthResponse payload of wrong size\n"); + rv = 0; + goto fail; + } + + rv = drbd_recv(mdev, response , resp_size); + + if (rv != resp_size) { + dev_err(DEV, "short read receiving AuthResponse: l=%u\n", rv); + rv = 0; + goto fail; + } + + right_response = kmalloc(resp_size, GFP_NOIO); + if (response == NULL) { + dev_err(DEV, "kmalloc of right_response failed\n"); + rv = 0; + goto fail; + } + + sg_set_buf(&sg, my_challenge, CHALLENGE_LEN); + + rv = crypto_hash_digest(&desc, &sg, sg.length, right_response); + if (rv) { + dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv); + rv = 0; + goto fail; + } + + rv = !memcmp(response, right_response, resp_size); + + if (rv) + dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n", + resp_size, mdev->net_conf->cram_hmac_alg); + + fail: + kfree(peers_ch); + kfree(response); + kfree(right_response); + + return rv; +} +#endif + +int drbdd_init(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + unsigned int minor = mdev_to_minor(mdev); + int h; + + sprintf(current->comm, "drbd%d_receiver", minor); + + dev_info(DEV, "receiver (re)started\n"); + + do { + h = drbd_connect(mdev); + if (h == 0) { + drbd_disconnect(mdev); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } + if (h == -1) { + dev_warn(DEV, "Discarding network configuration.\n"); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + } while (h == 0); + + if (h > 0) { + if (get_net_conf(mdev)) { + drbdd(mdev); + put_net_conf(mdev); + } + } + + drbd_disconnect(mdev); + + dev_info(DEV, "receiver terminated\n"); + return 0; +} + +/* ********* acknowledge sender ******** */ + +static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_req_state_reply *p = (struct p_req_state_reply *)h; + + int retcode = be32_to_cpu(p->retcode); + + if (retcode >= SS_SUCCESS) { + set_bit(CL_ST_CHG_SUCCESS, &mdev->flags); + } else { + set_bit(CL_ST_CHG_FAIL, &mdev->flags); + dev_err(DEV, "Requested state change failed by peer: %s (%d)\n", + drbd_set_st_err_str(retcode), retcode); + } + wake_up(&mdev->state_wait); + + return TRUE; +} + +static int got_Ping(struct drbd_conf *mdev, struct p_header *h) +{ + return drbd_send_ping_ack(mdev); + +} + +static int got_PingAck(struct drbd_conf *mdev, struct p_header *h) +{ + /* restore idle timeout */ + mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + + return TRUE; +} + +static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + + D_ASSERT(mdev->agreed_pro_version >= 89); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + drbd_rs_complete_io(mdev, sector); + drbd_set_in_sync(mdev, sector, blksize); + /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */ + mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT); + dec_rs_pending(mdev); + + return TRUE; +} + +/* when we receive the ACK for a write request, + * verify that we actually know about it */ +static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, + u64 id, sector_t sector) +{ + struct hlist_head *slot = tl_hash_slot(mdev, sector); + struct hlist_node *n; + struct drbd_request *req; + + hlist_for_each_entry(req, n, slot, colision) { + if ((unsigned long)req == (unsigned long)id) { + if (req->sector != sector) { + dev_err(DEV, "_ack_id_to_req: found req %p but it has " + "wrong sector (%llus versus %llus)\n", req, + (unsigned long long)req->sector, + (unsigned long long)sector); + break; + } + return req; + } + } + dev_err(DEV, "_ack_id_to_req: failed to find req %p, sector %llus in list\n", + (void *)(unsigned long)id, (unsigned long long)sector); + return NULL; +} + +typedef struct drbd_request *(req_validator_fn) + (struct drbd_conf *mdev, u64 id, sector_t sector); + +static int validate_req_change_req_state(struct drbd_conf *mdev, + u64 id, sector_t sector, req_validator_fn validator, + const char *func, enum drbd_req_event what) +{ + struct drbd_request *req; + struct bio_and_error m; + + spin_lock_irq(&mdev->req_lock); + req = validator(mdev, id, sector); + if (unlikely(!req)) { + spin_unlock_irq(&mdev->req_lock); + dev_err(DEV, "%s: got a corrupt block_id/sector pair\n", func); + return FALSE; + } + __req_mod(req, what, &m); + spin_unlock_irq(&mdev->req_lock); + + if (m.bio) + complete_master_bio(mdev, &m); + return TRUE; +} + +static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + int blksize = be32_to_cpu(p->blksize); + enum drbd_req_event what; + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + if (is_syncer_block_id(p->block_id)) { + drbd_set_in_sync(mdev, sector, blksize); + dec_rs_pending(mdev); + return TRUE; + } + switch (be16_to_cpu(h->command)) { + case P_RS_WRITE_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = write_acked_by_peer_and_sis; + break; + case P_WRITE_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = write_acked_by_peer; + break; + case P_RECV_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B); + what = recv_acked_by_peer; + break; + case P_DISCARD_ACK: + D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C); + what = conflict_discarded_by_peer; + break; + default: + D_ASSERT(0); + return FALSE; + } + + return validate_req_change_req_state(mdev, p->block_id, sector, + _ack_id_to_req, __func__ , what); +} + +static int got_NegAck(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + + if (__ratelimit(&drbd_ratelimit_state)) + dev_warn(DEV, "Got NegAck packet. Peer is in troubles?\n"); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + if (is_syncer_block_id(p->block_id)) { + int size = be32_to_cpu(p->blksize); + dec_rs_pending(mdev); + drbd_rs_failed_io(mdev, sector, size); + return TRUE; + } + return validate_req_change_req_state(mdev, p->block_id, sector, + _ack_id_to_req, __func__ , neg_acked); +} + +static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + sector_t sector = be64_to_cpu(p->sector); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n", + (unsigned long long)sector, be32_to_cpu(p->blksize)); + + return validate_req_change_req_state(mdev, p->block_id, sector, + _ar_id_to_req, __func__ , neg_acked); +} + +static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h) +{ + sector_t sector; + int size; + struct p_block_ack *p = (struct p_block_ack *)h; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + D_ASSERT(p->block_id == ID_SYNCER); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + dec_rs_pending(mdev); + + if (get_ldev_if_state(mdev, D_FAILED)) { + drbd_rs_complete_io(mdev, sector); + drbd_rs_failed_io(mdev, sector, size); + put_ldev(mdev); + } + + return TRUE; +} + +static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_barrier_ack *p = (struct p_barrier_ack *)h; + + tl_release(mdev, p->barrier, be32_to_cpu(p->set_size)); + + return TRUE; +} + +static int got_OVResult(struct drbd_conf *mdev, struct p_header *h) +{ + struct p_block_ack *p = (struct p_block_ack *)h; + struct drbd_work *w; + sector_t sector; + int size; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + update_peer_seq(mdev, be32_to_cpu(p->seq_num)); + + if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC) + drbd_ov_oos_found(mdev, sector, size); + else + ov_oos_print(mdev); + + drbd_rs_complete_io(mdev, sector); + dec_rs_pending(mdev); + + if (--mdev->ov_left == 0) { + w = kmalloc(sizeof(*w), GFP_NOIO); + if (w) { + w->cb = w_ov_finished; + drbd_queue_work_front(&mdev->data.work, w); + } else { + dev_err(DEV, "kmalloc(w) failed."); + ov_oos_print(mdev); + drbd_resync_finished(mdev); + } + } + return TRUE; +} + +struct asender_cmd { + size_t pkt_size; + int (*process)(struct drbd_conf *mdev, struct p_header *h); +}; + +static struct asender_cmd *get_asender_cmd(int cmd) +{ + static struct asender_cmd asender_tbl[] = { + /* anything missing from this table is in + * the drbd_cmd_handler (drbd_default_handler) table, + * see the beginning of drbdd() */ + [P_PING] = { sizeof(struct p_header), got_Ping }, + [P_PING_ACK] = { sizeof(struct p_header), got_PingAck }, + [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, + [P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck }, + [P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply }, + [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply}, + [P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult }, + [P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck }, + [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply }, + [P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync }, + [P_MAX_CMD] = { 0, NULL }, + }; + if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL) + return NULL; + return &asender_tbl[cmd]; +} + +int drbd_asender(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + struct p_header *h = &mdev->meta.rbuf.header; + struct asender_cmd *cmd = NULL; + + int rv, len; + void *buf = h; + int received = 0; + int expect = sizeof(struct p_header); + int empty; + + sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); + + current->policy = SCHED_RR; /* Make this a realtime task! */ + current->rt_priority = 2; /* more important than all other tasks */ + + while (get_t_state(thi) == Running) { + drbd_thread_current_set_cpu(mdev); + if (test_and_clear_bit(SEND_PING, &mdev->flags)) { + ERR_IF(!drbd_send_ping(mdev)) goto reconnect; + mdev->meta.socket->sk->sk_rcvtimeo = + mdev->net_conf->ping_timeo*HZ/10; + } + + /* conditionally cork; + * it may hurt latency if we cork without much to send */ + if (!mdev->net_conf->no_cork && + 3 < atomic_read(&mdev->unacked_cnt)) + drbd_tcp_cork(mdev->meta.socket); + while (1) { + clear_bit(SIGNAL_ASENDER, &mdev->flags); + flush_signals(current); + if (!drbd_process_done_ee(mdev)) { + dev_err(DEV, "process_done_ee() = NOT_OK\n"); + goto reconnect; + } + /* to avoid race with newly queued ACKs */ + set_bit(SIGNAL_ASENDER, &mdev->flags); + spin_lock_irq(&mdev->req_lock); + empty = list_empty(&mdev->done_ee); + spin_unlock_irq(&mdev->req_lock); + /* new ack may have been queued right here, + * but then there is also a signal pending, + * and we start over... */ + if (empty) + break; + } + /* but unconditionally uncork unless disabled */ + if (!mdev->net_conf->no_cork) + drbd_tcp_uncork(mdev->meta.socket); + + /* short circuit, recv_msg would return EINTR anyways. */ + if (signal_pending(current)) + continue; + + rv = drbd_recv_short(mdev, mdev->meta.socket, + buf, expect-received, 0); + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + flush_signals(current); + + /* Note: + * -EINTR (on meta) we got a signal + * -EAGAIN (on meta) rcvtimeo expired + * -ECONNRESET other side closed the connection + * -ERESTARTSYS (on data) we got a signal + * rv < 0 other than above: unexpected error! + * rv == expected: full header or command + * rv < expected: "woken" by signal during receive + * rv == 0 : "connection shut down by peer" + */ + if (likely(rv > 0)) { + received += rv; + buf += rv; + } else if (rv == 0) { + dev_err(DEV, "meta connection shut down by peer.\n"); + goto reconnect; + } else if (rv == -EAGAIN) { + if (mdev->meta.socket->sk->sk_rcvtimeo == + mdev->net_conf->ping_timeo*HZ/10) { + dev_err(DEV, "PingAck did not arrive in time.\n"); + goto reconnect; + } + set_bit(SEND_PING, &mdev->flags); + continue; + } else if (rv == -EINTR) { + continue; + } else { + dev_err(DEV, "sock_recvmsg returned %d\n", rv); + goto reconnect; + } + + if (received == expect && cmd == NULL) { + if (unlikely(h->magic != BE_DRBD_MAGIC)) { + dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto reconnect; + } + cmd = get_asender_cmd(be16_to_cpu(h->command)); + len = be16_to_cpu(h->length); + if (unlikely(cmd == NULL)) { + dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n", + (long)be32_to_cpu(h->magic), + h->command, h->length); + goto disconnect; + } + expect = cmd->pkt_size; + ERR_IF(len != expect-sizeof(struct p_header)) { + trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); + goto reconnect; + } + } + if (received == expect) { + D_ASSERT(cmd != NULL); + trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); + if (!cmd->process(mdev, h)) + goto reconnect; + + buf = h; + received = 0; + expect = sizeof(struct p_header); + cmd = NULL; + } + } + + if (0) { +reconnect: + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + } + if (0) { +disconnect: + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + } + clear_bit(SIGNAL_ASENDER, &mdev->flags); + + D_ASSERT(mdev->state.conn < C_CONNECTED); + dev_info(DEV, "asender terminated\n"); + + return 0; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c new file mode 100644 index 00000000000..0656cf1edd5 --- /dev/null +++ b/drivers/block/drbd/drbd_req.c @@ -0,0 +1,1132 @@ +/* + drbd_req.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include + +#include +#include +#include "drbd_int.h" +#include "drbd_tracing.h" +#include "drbd_req.h" + + +/* Update disk stats at start of I/O request */ +static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio) +{ + const int rw = bio_data_dir(bio); + int cpu; + cpu = part_stat_lock(); + part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); + part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + mdev->vdisk->part0.in_flight[rw]++; +} + +/* Update disk stats when completing request upwards */ +static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) +{ + int rw = bio_data_dir(req->master_bio); + unsigned long duration = jiffies - req->start_time; + int cpu; + cpu = part_stat_lock(); + part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); + part_round_stats(cpu, &mdev->vdisk->part0); + part_stat_unlock(); + mdev->vdisk->part0.in_flight[rw]--; +} + +static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) +{ + const unsigned long s = req->rq_state; + /* if it was a write, we may have to set the corresponding + * bit(s) out-of-sync first. If it had a local part, we need to + * release the reference to the activity log. */ + if (rw == WRITE) { + /* remove it from the transfer log. + * well, only if it had been there in the first + * place... if it had not (local only or conflicting + * and never sent), it should still be "empty" as + * initialized in drbd_req_new(), so we can list_del() it + * here unconditionally */ + list_del(&req->tl_requests); + /* Set out-of-sync unless both OK flags are set + * (local only or remote failed). + * Other places where we set out-of-sync: + * READ with local io-error */ + if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK)) + drbd_set_out_of_sync(mdev, req->sector, req->size); + + if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS)) + drbd_set_in_sync(mdev, req->sector, req->size); + + /* one might be tempted to move the drbd_al_complete_io + * to the local io completion callback drbd_endio_pri. + * but, if this was a mirror write, we may only + * drbd_al_complete_io after this is RQ_NET_DONE, + * otherwise the extent could be dropped from the al + * before it has actually been written on the peer. + * if we crash before our peer knows about the request, + * but after the extent has been dropped from the al, + * we would forget to resync the corresponding extent. + */ + if (s & RQ_LOCAL_MASK) { + if (get_ldev_if_state(mdev, D_FAILED)) { + drbd_al_complete_io(mdev, req->sector); + put_ldev(mdev); + } else if (__ratelimit(&drbd_ratelimit_state)) { + dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), " + "but my Disk seems to have failed :(\n", + (unsigned long long) req->sector); + } + } + } + + /* if it was a local io error, we want to notify our + * peer about that, and see if we need to + * detach the disk and stuff. + * to avoid allocating some special work + * struct, reuse the request. */ + + /* THINK + * why do we do this not when we detect the error, + * but delay it until it is "done", i.e. possibly + * until the next barrier ack? */ + + if (rw == WRITE && + ((s & RQ_LOCAL_MASK) && !(s & RQ_LOCAL_OK))) { + if (!(req->w.list.next == LIST_POISON1 || + list_empty(&req->w.list))) { + /* DEBUG ASSERT only; if this triggers, we + * probably corrupt the worker list here */ + dev_err(DEV, "req->w.list.next = %p\n", req->w.list.next); + dev_err(DEV, "req->w.list.prev = %p\n", req->w.list.prev); + } + req->w.cb = w_io_error; + drbd_queue_work(&mdev->data.work, &req->w); + /* drbd_req_free() is done in w_io_error */ + } else { + drbd_req_free(req); + } +} + +static void queue_barrier(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + + /* We are within the req_lock. Once we queued the barrier for sending, + * we set the CREATE_BARRIER bit. It is cleared as soon as a new + * barrier/epoch object is added. This is the only place this bit is + * set. It indicates that the barrier for this epoch is already queued, + * and no new epoch has been created yet. */ + if (test_bit(CREATE_BARRIER, &mdev->flags)) + return; + + b = mdev->newest_tle; + b->w.cb = w_send_barrier; + /* inc_ap_pending done here, so we won't + * get imbalanced on connection loss. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in tl_clear. */ + inc_ap_pending(mdev); + drbd_queue_work(&mdev->data.work, &b->w); + set_bit(CREATE_BARRIER, &mdev->flags); +} + +static void _about_to_complete_local_write(struct drbd_conf *mdev, + struct drbd_request *req) +{ + const unsigned long s = req->rq_state; + struct drbd_request *i; + struct drbd_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; + + /* before we can signal completion to the upper layers, + * we may need to close the current epoch */ + if (mdev->state.conn >= C_CONNECTED && + req->epoch == mdev->newest_tle->br_number) + queue_barrier(mdev); + + /* we need to do the conflict detection stuff, + * if we have the ee_hash (two_primaries) and + * this has been on the network */ + if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) { + const sector_t sector = req->sector; + const int size = req->size; + + /* ASSERT: + * there must be no conflicting requests, since + * they must have been failed on the spot */ +#define OVERLAPS overlaps(sector, size, i->sector, i->size) + slot = tl_hash_slot(mdev, sector); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " + "other: %p %llus +%u\n", + req, (unsigned long long)sector, size, + i, (unsigned long long)i->sector, i->size); + } + } + + /* maybe "wake" those conflicting epoch entries + * that wait for this request to finish. + * + * currently, there can be only _one_ such ee + * (well, or some more, which would be pending + * P_DISCARD_ACK not yet sent by the asender...), + * since we block the receiver thread upon the + * first conflict detection, which will wait on + * misc_wait. maybe we want to assert that? + * + * anyways, if we found one, + * we just have to do a wake_up. */ +#undef OVERLAPS +#define OVERLAPS overlaps(sector, size, e->sector, e->size) + slot = ee_hash_slot(mdev, req->sector); + hlist_for_each_entry(e, n, slot, colision) { + if (OVERLAPS) { + wake_up(&mdev->misc_wait); + break; + } + } + } +#undef OVERLAPS +} + +void complete_master_bio(struct drbd_conf *mdev, + struct bio_and_error *m) +{ + trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL); + bio_endio(m->bio, m->error); + dec_ap_bio(mdev); +} + +/* Helper for __req_mod(). + * Set m->bio to the master bio, if it is fit to be completed, + * or leave it alone (it is initialized to NULL in __req_mod), + * if it has already been completed, or cannot be completed yet. + * If m->bio is set, the error status to be returned is placed in m->error. + */ +void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) +{ + const unsigned long s = req->rq_state; + struct drbd_conf *mdev = req->mdev; + /* only WRITES may end up here without a master bio (on barrier ack) */ + int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; + + trace_drbd_req(req, nothing, "_req_may_be_done"); + + /* we must not complete the master bio, while it is + * still being processed by _drbd_send_zc_bio (drbd_send_dblock) + * not yet acknowledged by the peer + * not yet completed by the local io subsystem + * these flags may get cleared in any order by + * the worker, + * the receiver, + * the bio_endio completion callbacks. + */ + if (s & RQ_NET_QUEUED) + return; + if (s & RQ_NET_PENDING) + return; + if (s & RQ_LOCAL_PENDING) + return; + + if (req->master_bio) { + /* this is data_received (remote read) + * or protocol C P_WRITE_ACK + * or protocol B P_RECV_ACK + * or protocol A "handed_over_to_network" (SendAck) + * or canceled or failed, + * or killed from the transfer log due to connection loss. + */ + + /* + * figure out whether to report success or failure. + * + * report success when at least one of the operations succeeded. + * or, to put the other way, + * only report failure, when both operations failed. + * + * what to do about the failures is handled elsewhere. + * what we need to do here is just: complete the master_bio. + * + * local completion error, if any, has been stored as ERR_PTR + * in private_bio within drbd_endio_pri. + */ + int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK); + int error = PTR_ERR(req->private_bio); + + /* remove the request from the conflict detection + * respective block_id verification hash */ + if (!hlist_unhashed(&req->colision)) + hlist_del(&req->colision); + else + D_ASSERT((s & RQ_NET_MASK) == 0); + + /* for writes we need to do some extra housekeeping */ + if (rw == WRITE) + _about_to_complete_local_write(mdev, req); + + /* Update disk stats */ + _drbd_end_io_acct(mdev, req); + + m->error = ok ? 0 : (error ?: -EIO); + m->bio = req->master_bio; + req->master_bio = NULL; + } + + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { + /* this is disconnected (local only) operation, + * or protocol C P_WRITE_ACK, + * or protocol A or B P_BARRIER_ACK, + * or killed from the transfer log due to connection loss. */ + _req_is_done(mdev, req, rw); + } + /* else: network part and not DONE yet. that is + * protocol A or B, barrier ack still pending... */ +} + +/* + * checks whether there was an overlapping request + * or ee already registered. + * + * if so, return 1, in which case this request is completed on the spot, + * without ever being submitted or send. + * + * return 0 if it is ok to submit this request. + * + * NOTE: + * paranoia: assume something above us is broken, and issues different write + * requests for the same block simultaneously... + * + * To ensure these won't be reordered differently on both nodes, resulting in + * diverging data sets, we discard the later one(s). Not that this is supposed + * to happen, but this is the rationale why we also have to check for + * conflicting requests with local origin, and why we have to do so regardless + * of whether we allowed multiple primaries. + * + * BTW, in case we only have one primary, the ee_hash is empty anyways, and the + * second hlist_for_each_entry becomes a noop. This is even simpler than to + * grab a reference on the net_conf, and check for the two_primaries flag... + */ +static int _req_conflicts(struct drbd_request *req) +{ + struct drbd_conf *mdev = req->mdev; + const sector_t sector = req->sector; + const int size = req->size; + struct drbd_request *i; + struct drbd_epoch_entry *e; + struct hlist_node *n; + struct hlist_head *slot; + + D_ASSERT(hlist_unhashed(&req->colision)); + + if (!get_net_conf(mdev)) + return 0; + + /* BUG_ON */ + ERR_IF (mdev->tl_hash_s == 0) + goto out_no_conflict; + BUG_ON(mdev->tl_hash == NULL); + +#define OVERLAPS overlaps(i->sector, i->size, sector, size) + slot = tl_hash_slot(mdev, sector); + hlist_for_each_entry(i, n, slot, colision) { + if (OVERLAPS) { + dev_alert(DEV, "%s[%u] Concurrent local write detected! " + "[DISCARD L] new: %llus +%u; " + "pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)i->sector, i->size); + goto out_conflict; + } + } + + if (mdev->ee_hash_s) { + /* now, check for overlapping requests with remote origin */ + BUG_ON(mdev->ee_hash == NULL); +#undef OVERLAPS +#define OVERLAPS overlaps(e->sector, e->size, sector, size) + slot = ee_hash_slot(mdev, sector); + hlist_for_each_entry(e, n, slot, colision) { + if (OVERLAPS) { + dev_alert(DEV, "%s[%u] Concurrent remote write detected!" + " [DISCARD L] new: %llus +%u; " + "pending: %llus +%u\n", + current->comm, current->pid, + (unsigned long long)sector, size, + (unsigned long long)e->sector, e->size); + goto out_conflict; + } + } + } +#undef OVERLAPS + +out_no_conflict: + /* this is like it should be, and what we expected. + * our users do behave after all... */ + put_net_conf(mdev); + return 0; + +out_conflict: + put_net_conf(mdev); + return 1; +} + +/* obviously this could be coded as many single functions + * instead of one huge switch, + * or by putting the code directly in the respective locations + * (as it has been before). + * + * but having it this way + * enforces that it is all in this one place, where it is easier to audit, + * it makes it obvious that whatever "event" "happens" to a request should + * happen "atomically" within the req_lock, + * and it enforces that we have to think in a very structured manner + * about the "events" that may happen to a request during its life time ... + */ +void __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m) +{ + struct drbd_conf *mdev = req->mdev; + m->bio = NULL; + + trace_drbd_req(req, what, NULL); + + switch (what) { + default: + dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); + break; + + /* does not happen... + * initialization done in drbd_req_new + case created: + break; + */ + + case to_be_send: /* via network */ + /* reached via drbd_make_request_common + * and from w_read_retry_remote */ + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + break; + + case to_be_submitted: /* locally */ + /* reached via drbd_make_request_common */ + D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK)); + req->rq_state |= RQ_LOCAL_PENDING; + break; + + case completed_ok: + if (bio_data_dir(req->master_bio) == WRITE) + mdev->writ_cnt += req->size>>9; + else + mdev->read_cnt += req->size>>9; + + req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK); + req->rq_state &= ~RQ_LOCAL_PENDING; + + _req_may_be_done(req, m); + put_ldev(mdev); + break; + + case write_completed_with_error: + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + dev_alert(DEV, "Local WRITE failed sec=%llus size=%u\n", + (unsigned long long)req->sector, req->size); + /* and now: check how to handle local io error. */ + __drbd_chk_io_error(mdev, FALSE); + _req_may_be_done(req, m); + put_ldev(mdev); + break; + + case read_ahead_completed_with_error: + /* it is legal to fail READA */ + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + _req_may_be_done(req, m); + put_ldev(mdev); + break; + + case read_completed_with_error: + drbd_set_out_of_sync(mdev, req->sector, req->size); + + req->rq_state |= RQ_LOCAL_COMPLETED; + req->rq_state &= ~RQ_LOCAL_PENDING; + + dev_alert(DEV, "Local READ failed sec=%llus size=%u\n", + (unsigned long long)req->sector, req->size); + /* _req_mod(req,to_be_send); oops, recursion... */ + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + req->rq_state |= RQ_NET_PENDING; + inc_ap_pending(mdev); + + __drbd_chk_io_error(mdev, FALSE); + put_ldev(mdev); + /* NOTE: if we have no connection, + * or know the peer has no good data either, + * then we don't actually need to "queue_for_net_read", + * but we do so anyways, since the drbd_io_error() + * and the potential state change to "Diskless" + * needs to be done from process context */ + + /* fall through: _req_mod(req,queue_for_net_read); */ + + case queue_for_net_read: + /* READ or READA, and + * no local disk, + * or target area marked as invalid, + * or just got an io-error. */ + /* from drbd_make_request_common + * or from bio_endio during read io-error recovery */ + + /* so we can verify the handle in the answer packet + * corresponding hlist_del is in _req_may_be_done() */ + hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); + + set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */ + + D_ASSERT(req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = (req->rq_state & RQ_LOCAL_MASK) + ? w_read_retry_remote + : w_send_read_req; + drbd_queue_work(&mdev->data.work, &req->w); + break; + + case queue_for_net_write: + /* assert something? */ + /* from drbd_make_request_common only */ + + hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); + /* corresponding hlist_del is in _req_may_be_done() */ + + /* NOTE + * In case the req ended up on the transfer log before being + * queued on the worker, it could lead to this request being + * missed during cleanup after connection loss. + * So we have to do both operations here, + * within the same lock that protects the transfer log. + * + * _req_add_to_epoch(req); this has to be after the + * _maybe_start_new_epoch(req); which happened in + * drbd_make_request_common, because we now may set the bit + * again ourselves to close the current epoch. + * + * Add req to the (now) current epoch (barrier). */ + + /* see drbd_make_request_common, + * just after it grabs the req_lock */ + D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); + + req->epoch = mdev->newest_tle->br_number; + list_add_tail(&req->tl_requests, + &mdev->newest_tle->requests); + + /* increment size of current epoch */ + mdev->newest_tle->n_req++; + + /* queue work item to send data */ + D_ASSERT(req->rq_state & RQ_NET_PENDING); + req->rq_state |= RQ_NET_QUEUED; + req->w.cb = w_send_dblock; + drbd_queue_work(&mdev->data.work, &req->w); + + /* close the epoch, in case it outgrew the limit */ + if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size) + queue_barrier(mdev); + + break; + + case send_canceled: + /* treat it the same */ + case send_failed: + /* real cleanup will be done from tl_clear. just update flags + * so it is no longer marked as on the worker queue */ + req->rq_state &= ~RQ_NET_QUEUED; + /* if we did it right, tl_clear should be scheduled only after + * this, so this should not be necessary! */ + _req_may_be_done(req, m); + break; + + case handed_over_to_network: + /* assert something? */ + if (bio_data_dir(req->master_bio) == WRITE && + mdev->net_conf->wire_protocol == DRBD_PROT_A) { + /* this is what is dangerous about protocol A: + * pretend it was successfully written on the peer. */ + if (req->rq_state & RQ_NET_PENDING) { + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= RQ_NET_OK; + } /* else: neg-ack was faster... */ + /* it is still not yet RQ_NET_DONE until the + * corresponding epoch barrier got acked as well, + * so we know what to dirty on connection loss */ + } + req->rq_state &= ~RQ_NET_QUEUED; + req->rq_state |= RQ_NET_SENT; + /* because _drbd_send_zc_bio could sleep, and may want to + * dereference the bio even after the "write_acked_by_peer" and + * "completed_ok" events came in, once we return from + * _drbd_send_zc_bio (drbd_send_dblock), we have to check + * whether it is done already, and end it. */ + _req_may_be_done(req, m); + break; + + case connection_lost_while_pending: + /* transfer log cleanup after connection loss */ + /* assert something? */ + if (req->rq_state & RQ_NET_PENDING) + dec_ap_pending(mdev); + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); + req->rq_state |= RQ_NET_DONE; + /* if it is still queued, we may not complete it here. + * it will be canceled soon. */ + if (!(req->rq_state & RQ_NET_QUEUED)) + _req_may_be_done(req, m); + break; + + case write_acked_by_peer_and_sis: + req->rq_state |= RQ_NET_SIS; + case conflict_discarded_by_peer: + /* for discarded conflicting writes of multiple primaries, + * there is no need to keep anything in the tl, potential + * node crashes are covered by the activity log. */ + if (what == conflict_discarded_by_peer) + dev_alert(DEV, "Got DiscardAck packet %llus +%u!" + " DRBD is not a random data generator!\n", + (unsigned long long)req->sector, req->size); + req->rq_state |= RQ_NET_DONE; + /* fall through */ + case write_acked_by_peer: + /* protocol C; successfully written on peer. + * Nothing to do here. + * We want to keep the tl in place for all protocols, to cater + * for volatile write-back caches on lower level devices. + * + * A barrier request is expected to have forced all prior + * requests onto stable storage, so completion of a barrier + * request could set NET_DONE right here, and not wait for the + * P_BARRIER_ACK, but that is an unnecessary optimization. */ + + /* this makes it effectively the same as for: */ + case recv_acked_by_peer: + /* protocol B; pretends to be successfully written on peer. + * see also notes above in handed_over_to_network about + * protocol != C */ + req->rq_state |= RQ_NET_OK; + D_ASSERT(req->rq_state & RQ_NET_PENDING); + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + _req_may_be_done(req, m); + break; + + case neg_acked: + /* assert something? */ + if (req->rq_state & RQ_NET_PENDING) + dec_ap_pending(mdev); + req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING); + + req->rq_state |= RQ_NET_DONE; + _req_may_be_done(req, m); + /* else: done by handed_over_to_network */ + break; + + case barrier_acked: + if (req->rq_state & RQ_NET_PENDING) { + /* barrier came in before all requests have been acked. + * this is bad, because if the connection is lost now, + * we won't be able to clean them up... */ + dev_err(DEV, "FIXME (barrier_acked but pending)\n"); + trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)"); + list_move(&req->tl_requests, &mdev->out_of_sequence_requests); + } + D_ASSERT(req->rq_state & RQ_NET_SENT); + req->rq_state |= RQ_NET_DONE; + _req_may_be_done(req, m); + break; + + case data_received: + D_ASSERT(req->rq_state & RQ_NET_PENDING); + dec_ap_pending(mdev); + req->rq_state &= ~RQ_NET_PENDING; + req->rq_state |= (RQ_NET_OK|RQ_NET_DONE); + _req_may_be_done(req, m); + break; + }; +} + +/* we may do a local read if: + * - we are consistent (of course), + * - or we are generally inconsistent, + * BUT we are still/already IN SYNC for this area. + * since size may be bigger than BM_BLOCK_SIZE, + * we may need to check several bits. + */ +static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size) +{ + unsigned long sbnr, ebnr; + sector_t esector, nr_sectors; + + if (mdev->state.disk == D_UP_TO_DATE) + return 1; + if (mdev->state.disk >= D_OUTDATED) + return 0; + if (mdev->state.disk < D_INCONSISTENT) + return 0; + /* state.disk == D_INCONSISTENT We will have a look at the BitMap */ + nr_sectors = drbd_get_capacity(mdev->this_bdev); + esector = sector + (size >> 9) - 1; + + D_ASSERT(sector < nr_sectors); + D_ASSERT(esector < nr_sectors); + + sbnr = BM_SECT_TO_BIT(sector); + ebnr = BM_SECT_TO_BIT(esector); + + return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); +} + +static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) +{ + const int rw = bio_rw(bio); + const int size = bio->bi_size; + const sector_t sector = bio->bi_sector; + struct drbd_tl_epoch *b = NULL; + struct drbd_request *req; + int local, remote; + int err = -EIO; + + /* allocate outside of all locks; */ + req = drbd_req_new(mdev, bio); + if (!req) { + dec_ap_bio(mdev); + /* only pass the error to the upper layers. + * if user cannot handle io errors, that's not our business. */ + dev_err(DEV, "could not kmalloc() req\n"); + bio_endio(bio, -ENOMEM); + return 0; + } + + trace_drbd_bio(mdev, "Rq", bio, 0, req); + + local = get_ldev(mdev); + if (!local) { + bio_put(req->private_bio); /* or we get a bio leak */ + req->private_bio = NULL; + } + if (rw == WRITE) { + remote = 1; + } else { + /* READ || READA */ + if (local) { + if (!drbd_may_do_local_read(mdev, sector, size)) { + /* we could kick the syncer to + * sync this extent asap, wait for + * it, then continue locally. + * Or just issue the request remotely. + */ + local = 0; + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(mdev); + } + } + remote = !local && mdev->state.pdsk >= D_UP_TO_DATE; + } + + /* If we have a disk, but a READA request is mapped to remote, + * we are R_PRIMARY, D_INCONSISTENT, SyncTarget. + * Just fail that READA request right here. + * + * THINK: maybe fail all READA when not local? + * or make this configurable... + * if network is slow, READA won't do any good. + */ + if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) { + err = -EWOULDBLOCK; + goto fail_and_free_req; + } + + /* For WRITES going to the local disk, grab a reference on the target + * extent. This waits for any resync activity in the corresponding + * resync extent to finish, and, if necessary, pulls in the target + * extent into the activity log, which involves further disk io because + * of transactional on-disk meta data updates. */ + if (rw == WRITE && local) + drbd_al_begin_io(mdev, sector); + + remote = remote && (mdev->state.pdsk == D_UP_TO_DATE || + (mdev->state.pdsk == D_INCONSISTENT && + mdev->state.conn >= C_CONNECTED)); + + if (!(local || remote)) { + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + goto fail_free_complete; + } + + /* For WRITE request, we have to make sure that we have an + * unused_spare_tle, in case we need to start a new epoch. + * I try to be smart and avoid to pre-allocate always "just in case", + * but there is a race between testing the bit and pointer outside the + * spinlock, and grabbing the spinlock. + * if we lost that race, we retry. */ + if (rw == WRITE && remote && + mdev->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->flags)) { +allocate_barrier: + b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO); + if (!b) { + dev_err(DEV, "Failed to alloc barrier.\n"); + err = -ENOMEM; + goto fail_free_complete; + } + } + + /* GOOD, everything prepared, grab the spin_lock */ + spin_lock_irq(&mdev->req_lock); + + if (remote) { + remote = (mdev->state.pdsk == D_UP_TO_DATE || + (mdev->state.pdsk == D_INCONSISTENT && + mdev->state.conn >= C_CONNECTED)); + if (!remote) + dev_warn(DEV, "lost connection while grabbing the req_lock!\n"); + if (!(local || remote)) { + dev_err(DEV, "IO ERROR: neither local nor remote disk\n"); + spin_unlock_irq(&mdev->req_lock); + goto fail_free_complete; + } + } + + if (b && mdev->unused_spare_tle == NULL) { + mdev->unused_spare_tle = b; + b = NULL; + } + if (rw == WRITE && remote && + mdev->unused_spare_tle == NULL && + test_bit(CREATE_BARRIER, &mdev->flags)) { + /* someone closed the current epoch + * while we were grabbing the spinlock */ + spin_unlock_irq(&mdev->req_lock); + goto allocate_barrier; + } + + + /* Update disk stats */ + _drbd_start_io_acct(mdev, req, bio); + + /* _maybe_start_new_epoch(mdev); + * If we need to generate a write barrier packet, we have to add the + * new epoch (barrier) object, and queue the barrier packet for sending, + * and queue the req's data after it _within the same lock_, otherwise + * we have race conditions were the reorder domains could be mixed up. + * + * Even read requests may start a new epoch and queue the corresponding + * barrier packet. To get the write ordering right, we only have to + * make sure that, if this is a write request and it triggered a + * barrier packet, this request is queued within the same spinlock. */ + if (remote && mdev->unused_spare_tle && + test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { + _tl_add_barrier(mdev, mdev->unused_spare_tle); + mdev->unused_spare_tle = NULL; + } else { + D_ASSERT(!(remote && rw == WRITE && + test_bit(CREATE_BARRIER, &mdev->flags))); + } + + /* NOTE + * Actually, 'local' may be wrong here already, since we may have failed + * to write to the meta data, and may become wrong anytime because of + * local io-error for some other request, which would lead to us + * "detaching" the local disk. + * + * 'remote' may become wrong any time because the network could fail. + * + * This is a harmless race condition, though, since it is handled + * correctly at the appropriate places; so it just defers the failure + * of the respective operation. + */ + + /* mark them early for readability. + * this just sets some state flags. */ + if (remote) + _req_mod(req, to_be_send); + if (local) + _req_mod(req, to_be_submitted); + + /* check this request on the collision detection hash tables. + * if we have a conflict, just complete it here. + * THINK do we want to check reads, too? (I don't think so...) */ + if (rw == WRITE && _req_conflicts(req)) { + /* this is a conflicting request. + * even though it may have been only _partially_ + * overlapping with one of the currently pending requests, + * without even submitting or sending it, we will + * pretend that it was successfully served right now. + */ + if (local) { + bio_put(req->private_bio); + req->private_bio = NULL; + drbd_al_complete_io(mdev, req->sector); + put_ldev(mdev); + local = 0; + } + if (remote) + dec_ap_pending(mdev); + _drbd_end_io_acct(mdev, req); + /* THINK: do we want to fail it (-EIO), or pretend success? */ + bio_endio(req->master_bio, 0); + req->master_bio = NULL; + dec_ap_bio(mdev); + drbd_req_free(req); + remote = 0; + } + + /* NOTE remote first: to get the concurrent write detection right, + * we must register the request before start of local IO. */ + if (remote) { + /* either WRITE and C_CONNECTED, + * or READ, and no local disk, + * or READ, but not in sync. + */ + _req_mod(req, (rw == WRITE) + ? queue_for_net_write + : queue_for_net_read); + } + spin_unlock_irq(&mdev->req_lock); + kfree(b); /* if someone else has beaten us to it... */ + + if (local) { + req->private_bio->bi_bdev = mdev->ldev->backing_bdev; + + trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL); + + if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR + : rw == READ ? DRBD_FAULT_DT_RD + : DRBD_FAULT_DT_RA)) + bio_endio(req->private_bio, -EIO); + else + generic_make_request(req->private_bio); + } + + /* we need to plug ALWAYS since we possibly need to kick lo_dev. + * we plug after submit, so we won't miss an unplug event */ + drbd_plug_device(mdev); + + return 0; + +fail_free_complete: + if (rw == WRITE && local) + drbd_al_complete_io(mdev, sector); +fail_and_free_req: + if (local) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(mdev); + } + bio_endio(bio, err); + drbd_req_free(req); + dec_ap_bio(mdev); + kfree(b); + + return 0; +} + +/* helper function for drbd_make_request + * if we can determine just by the mdev (state) that this request will fail, + * return 1 + * otherwise return 0 + */ +static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write) +{ + /* Unconfigured */ + if (mdev->state.conn == C_DISCONNECTING && + mdev->state.disk == D_DISKLESS) + return 1; + + if (mdev->state.role != R_PRIMARY && + (!allow_oos || is_write)) { + if (__ratelimit(&drbd_ratelimit_state)) { + dev_err(DEV, "Process %s[%u] tried to %s; " + "since we are not in Primary state, " + "we cannot allow this\n", + current->comm, current->pid, + is_write ? "WRITE" : "READ"); + } + return 1; + } + + /* + * Paranoia: we might have been primary, but sync target, or + * even diskless, then lost the connection. + * This should have been handled (panic? suspend?) somewhere + * else. But maybe it was not, so check again here. + * Caution: as long as we do not have a read/write lock on mdev, + * to serialize state changes, this is racy, since we may lose + * the connection *after* we test for the cstate. + */ + if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sorry, I have no access to good data anymore.\n"); + return 1; + } + + return 0; +} + +int drbd_make_request_26(struct request_queue *q, struct bio *bio) +{ + unsigned int s_enr, e_enr; + struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; + + if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) { + bio_endio(bio, -EPERM); + return 0; + } + + /* Reject barrier requests if we know the underlying device does + * not support them. + * XXX: Need to get this info from peer as well some how so we + * XXX: reject if EITHER side/data/metadata area does not support them. + * + * because of those XXX, this is not yet enabled, + * i.e. in drbd_init_set_defaults we set the NO_BARRIER_SUPP bit. + */ + if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER) && test_bit(NO_BARRIER_SUPP, &mdev->flags))) { + /* dev_warn(DEV, "Rejecting barrier request as underlying device does not support\n"); */ + bio_endio(bio, -EOPNOTSUPP); + return 0; + } + + /* + * what we "blindly" assume: + */ + D_ASSERT(bio->bi_size > 0); + D_ASSERT((bio->bi_size & 0x1ff) == 0); + D_ASSERT(bio->bi_idx == 0); + + /* to make some things easier, force alignment of requests within the + * granularity of our hash tables */ + s_enr = bio->bi_sector >> HT_SHIFT; + e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; + + if (likely(s_enr == e_enr)) { + inc_ap_bio(mdev, 1); + return drbd_make_request_common(mdev, bio); + } + + /* can this bio be split generically? + * Maybe add our own split-arbitrary-bios function. */ + if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_SEGMENT_SIZE) { + /* rather error out here than BUG in bio_split */ + dev_err(DEV, "bio would need to, but cannot, be split: " + "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n", + bio->bi_vcnt, bio->bi_idx, bio->bi_size, + (unsigned long long)bio->bi_sector); + bio_endio(bio, -EINVAL); + } else { + /* This bio crosses some boundary, so we have to split it. */ + struct bio_pair *bp; + /* works for the "do not cross hash slot boundaries" case + * e.g. sector 262269, size 4096 + * s_enr = 262269 >> 6 = 4097 + * e_enr = (262269+8-1) >> 6 = 4098 + * HT_SHIFT = 6 + * sps = 64, mask = 63 + * first_sectors = 64 - (262269 & 63) = 3 + */ + const sector_t sect = bio->bi_sector; + const int sps = 1 << HT_SHIFT; /* sectors per slot */ + const int mask = sps - 1; + const sector_t first_sectors = sps - (sect & mask); + bp = bio_split(bio, +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,28) + bio_split_pool, +#endif + first_sectors); + + /* we need to get a "reference count" (ap_bio_cnt) + * to avoid races with the disconnect/reconnect/suspend code. + * In case we need to split the bio here, we need to get two references + * atomically, otherwise we might deadlock when trying to submit the + * second one! */ + inc_ap_bio(mdev, 2); + + D_ASSERT(e_enr == s_enr + 1); + + drbd_make_request_common(mdev, &bp->bio1); + drbd_make_request_common(mdev, &bp->bio2); + bio_pair_release(bp); + } + return 0; +} + +/* This is called by bio_add_page(). With this function we reduce + * the number of BIOs that span over multiple DRBD_MAX_SEGMENT_SIZEs + * units (was AL_EXTENTs). + * + * we do the calculation within the lower 32bit of the byte offsets, + * since we don't care for actual offset, but only check whether it + * would cross "activity log extent" boundaries. + * + * As long as the BIO is empty we have to allow at least one bvec, + * regardless of size and offset. so the resulting bio may still + * cross extent boundaries. those are dealt with (bio_split) in + * drbd_make_request_26. + */ +int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec) +{ + struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata; + unsigned int bio_offset = + (unsigned int)bvm->bi_sector << 9; /* 32 bit */ + unsigned int bio_size = bvm->bi_size; + int limit, backing_limit; + + limit = DRBD_MAX_SEGMENT_SIZE + - ((bio_offset & (DRBD_MAX_SEGMENT_SIZE-1)) + bio_size); + if (limit < 0) + limit = 0; + if (bio_size == 0) { + if (limit <= bvec->bv_len) + limit = bvec->bv_len; + } else if (limit && get_ldev(mdev)) { + struct request_queue * const b = + mdev->ldev->backing_bdev->bd_disk->queue; + if (b->merge_bvec_fn && mdev->ldev->dc.use_bmbv) { + backing_limit = b->merge_bvec_fn(b, bvm, bvec); + limit = min(limit, backing_limit); + } + put_ldev(mdev); + } + return limit; +} diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h new file mode 100644 index 00000000000..d37ab57f120 --- /dev/null +++ b/drivers/block/drbd/drbd_req.h @@ -0,0 +1,327 @@ +/* + drbd_req.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2006-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2006-2008, Lars Ellenberg . + Copyright (C) 2006-2008, Philipp Reisner . + + DRBD is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + DRBD is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_REQ_H +#define _DRBD_REQ_H + +#include +#include + +#include +#include +#include "drbd_int.h" +#include "drbd_wrappers.h" + +/* The request callbacks will be called in irq context by the IDE drivers, + and in Softirqs/Tasklets/BH context by the SCSI drivers, + and by the receiver and worker in kernel-thread context. + Try to get the locking right :) */ + +/* + * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are + * associated with IO requests originating from the block layer above us. + * + * There are quite a few things that may happen to a drbd request + * during its lifetime. + * + * It will be created. + * It will be marked with the intention to be + * submitted to local disk and/or + * send via the network. + * + * It has to be placed on the transfer log and other housekeeping lists, + * In case we have a network connection. + * + * It may be identified as a concurrent (write) request + * and be handled accordingly. + * + * It may me handed over to the local disk subsystem. + * It may be completed by the local disk subsystem, + * either sucessfully or with io-error. + * In case it is a READ request, and it failed locally, + * it may be retried remotely. + * + * It may be queued for sending. + * It may be handed over to the network stack, + * which may fail. + * It may be acknowledged by the "peer" according to the wire_protocol in use. + * this may be a negative ack. + * It may receive a faked ack when the network connection is lost and the + * transfer log is cleaned up. + * Sending may be canceled due to network connection loss. + * When it finally has outlived its time, + * corresponding dirty bits in the resync-bitmap may be cleared or set, + * it will be destroyed, + * and completion will be signalled to the originator, + * with or without "success". + */ + +enum drbd_req_event { + created, + to_be_send, + to_be_submitted, + + /* XXX yes, now I am inconsistent... + * these two are not "events" but "actions" + * oh, well... */ + queue_for_net_write, + queue_for_net_read, + + send_canceled, + send_failed, + handed_over_to_network, + connection_lost_while_pending, + recv_acked_by_peer, + write_acked_by_peer, + write_acked_by_peer_and_sis, /* and set_in_sync */ + conflict_discarded_by_peer, + neg_acked, + barrier_acked, /* in protocol A and B */ + data_received, /* (remote read) */ + + read_completed_with_error, + read_ahead_completed_with_error, + write_completed_with_error, + completed_ok, + nothing, /* for tracing only */ +}; + +/* encoding of request states for now. we don't actually need that many bits. + * we don't need to do atomic bit operations either, since most of the time we + * need to look at the connection state and/or manipulate some lists at the + * same time, so we should hold the request lock anyways. + */ +enum drbd_req_state_bits { + /* 210 + * 000: no local possible + * 001: to be submitted + * UNUSED, we could map: 011: submitted, completion still pending + * 110: completed ok + * 010: completed with error + */ + __RQ_LOCAL_PENDING, + __RQ_LOCAL_COMPLETED, + __RQ_LOCAL_OK, + + /* 76543 + * 00000: no network possible + * 00001: to be send + * 00011: to be send, on worker queue + * 00101: sent, expecting recv_ack (B) or write_ack (C) + * 11101: sent, + * recv_ack (B) or implicit "ack" (A), + * still waiting for the barrier ack. + * master_bio may already be completed and invalidated. + * 11100: write_acked (C), + * data_received (for remote read, any protocol) + * or finally the barrier ack has arrived (B,A)... + * request can be freed + * 01100: neg-acked (write, protocol C) + * or neg-d-acked (read, any protocol) + * or killed from the transfer log + * during cleanup after connection loss + * request can be freed + * 01000: canceled or send failed... + * request can be freed + */ + + /* if "SENT" is not set, yet, this can still fail or be canceled. + * if "SENT" is set already, we still wait for an Ack packet. + * when cleared, the master_bio may be completed. + * in (B,A) the request object may still linger on the transaction log + * until the corresponding barrier ack comes in */ + __RQ_NET_PENDING, + + /* If it is QUEUED, and it is a WRITE, it is also registered in the + * transfer log. Currently we need this flag to avoid conflicts between + * worker canceling the request and tl_clear_barrier killing it from + * transfer log. We should restructure the code so this conflict does + * no longer occur. */ + __RQ_NET_QUEUED, + + /* well, actually only "handed over to the network stack". + * + * TODO can potentially be dropped because of the similar meaning + * of RQ_NET_SENT and ~RQ_NET_QUEUED. + * however it is not exactly the same. before we drop it + * we must ensure that we can tell a request with network part + * from a request without, regardless of what happens to it. */ + __RQ_NET_SENT, + + /* when set, the request may be freed (if RQ_NET_QUEUED is clear). + * basically this means the corresponding P_BARRIER_ACK was received */ + __RQ_NET_DONE, + + /* whether or not we know (C) or pretend (B,A) that the write + * was successfully written on the peer. + */ + __RQ_NET_OK, + + /* peer called drbd_set_in_sync() for this write */ + __RQ_NET_SIS, + + /* keep this last, its for the RQ_NET_MASK */ + __RQ_NET_MAX, +}; + +#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) +#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) +#define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) + +#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ + +#define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) +#define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) +#define RQ_NET_SENT (1UL << __RQ_NET_SENT) +#define RQ_NET_DONE (1UL << __RQ_NET_DONE) +#define RQ_NET_OK (1UL << __RQ_NET_OK) +#define RQ_NET_SIS (1UL << __RQ_NET_SIS) + +/* 0x1f8 */ +#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) + +/* epoch entries */ +static inline +struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + BUG_ON(mdev->ee_hash_s == 0); + return mdev->ee_hash + + ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s); +} + +/* transfer log (drbd_request objects) */ +static inline +struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + BUG_ON(mdev->tl_hash_s == 0); + return mdev->tl_hash + + ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s); +} + +/* application reads (drbd_request objects) */ +static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector) +{ + return mdev->app_reads_hash + + ((unsigned int)(sector) % APP_R_HSIZE); +} + +/* when we receive the answer for a read request, + * verify that we actually know about it */ +static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, + u64 id, sector_t sector) +{ + struct hlist_head *slot = ar_hash_slot(mdev, sector); + struct hlist_node *n; + struct drbd_request *req; + + hlist_for_each_entry(req, n, slot, colision) { + if ((unsigned long)req == (unsigned long)id) { + D_ASSERT(req->sector == sector); + return req; + } + } + return NULL; +} + +static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, + struct bio *bio_src) +{ + struct bio *bio; + struct drbd_request *req = + mempool_alloc(drbd_request_mempool, GFP_NOIO); + if (likely(req)) { + bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */ + + req->rq_state = 0; + req->mdev = mdev; + req->master_bio = bio_src; + req->private_bio = bio; + req->epoch = 0; + req->sector = bio->bi_sector; + req->size = bio->bi_size; + req->start_time = jiffies; + INIT_HLIST_NODE(&req->colision); + INIT_LIST_HEAD(&req->tl_requests); + INIT_LIST_HEAD(&req->w.list); + + bio->bi_private = req; + bio->bi_end_io = drbd_endio_pri; + bio->bi_next = NULL; + } + return req; +} + +static inline void drbd_req_free(struct drbd_request *req) +{ + mempool_free(req, drbd_request_mempool); +} + +static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) +{ + return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9))); +} + +/* Short lived temporary struct on the stack. + * We could squirrel the error to be returned into + * bio->bi_size, or similar. But that would be too ugly. */ +struct bio_and_error { + struct bio *bio; + int error; +}; + +extern void _req_may_be_done(struct drbd_request *req, + struct bio_and_error *m); +extern void __req_mod(struct drbd_request *req, enum drbd_req_event what, + struct bio_and_error *m); +extern void complete_master_bio(struct drbd_conf *mdev, + struct bio_and_error *m); + +/* use this if you don't want to deal with calling complete_master_bio() + * outside the spinlock, e.g. when walking some list on cleanup. */ +static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what) +{ + struct drbd_conf *mdev = req->mdev; + struct bio_and_error m; + + /* __req_mod possibly frees req, do not touch req after that! */ + __req_mod(req, what, &m); + if (m.bio) + complete_master_bio(mdev, &m); +} + +/* completion of master bio is outside of spinlock. + * If you need it irqsave, do it your self! */ +static inline void req_mod(struct drbd_request *req, + enum drbd_req_event what) +{ + struct drbd_conf *mdev = req->mdev; + struct bio_and_error m; + spin_lock_irq(&mdev->req_lock); + __req_mod(req, what, &m); + spin_unlock_irq(&mdev->req_lock); + + if (m.bio) + complete_master_bio(mdev, &m); +} +#endif diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c new file mode 100644 index 00000000000..76863e3f05b --- /dev/null +++ b/drivers/block/drbd/drbd_strings.c @@ -0,0 +1,113 @@ +/* + drbd.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include + +static const char *drbd_conn_s_names[] = { + [C_STANDALONE] = "StandAlone", + [C_DISCONNECTING] = "Disconnecting", + [C_UNCONNECTED] = "Unconnected", + [C_TIMEOUT] = "Timeout", + [C_BROKEN_PIPE] = "BrokenPipe", + [C_NETWORK_FAILURE] = "NetworkFailure", + [C_PROTOCOL_ERROR] = "ProtocolError", + [C_WF_CONNECTION] = "WFConnection", + [C_WF_REPORT_PARAMS] = "WFReportParams", + [C_TEAR_DOWN] = "TearDown", + [C_CONNECTED] = "Connected", + [C_STARTING_SYNC_S] = "StartingSyncS", + [C_STARTING_SYNC_T] = "StartingSyncT", + [C_WF_BITMAP_S] = "WFBitMapS", + [C_WF_BITMAP_T] = "WFBitMapT", + [C_WF_SYNC_UUID] = "WFSyncUUID", + [C_SYNC_SOURCE] = "SyncSource", + [C_SYNC_TARGET] = "SyncTarget", + [C_PAUSED_SYNC_S] = "PausedSyncS", + [C_PAUSED_SYNC_T] = "PausedSyncT", + [C_VERIFY_S] = "VerifyS", + [C_VERIFY_T] = "VerifyT", +}; + +static const char *drbd_role_s_names[] = { + [R_PRIMARY] = "Primary", + [R_SECONDARY] = "Secondary", + [R_UNKNOWN] = "Unknown" +}; + +static const char *drbd_disk_s_names[] = { + [D_DISKLESS] = "Diskless", + [D_ATTACHING] = "Attaching", + [D_FAILED] = "Failed", + [D_NEGOTIATING] = "Negotiating", + [D_INCONSISTENT] = "Inconsistent", + [D_OUTDATED] = "Outdated", + [D_UNKNOWN] = "DUnknown", + [D_CONSISTENT] = "Consistent", + [D_UP_TO_DATE] = "UpToDate", +}; + +static const char *drbd_state_sw_errors[] = { + [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", + [-SS_NO_UP_TO_DATE_DISK] = "Refusing to be Primary without at least one UpToDate disk", + [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", + [-SS_NO_REMOTE_DISK] = "Can not resync without remote disk", + [-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected", + [-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated", + [-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active", + [-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device", + [-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node", + [-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk", + [-SS_DEVICE_IN_USE] = "Device is held open by someone", + [-SS_NO_NET_CONFIG] = "Have no net/connection configuration", + [-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify", + [-SS_NEED_CONNECTION] = "Need a connection to start verify or resync", + [-SS_NOT_SUPPORTED] = "Peer does not support protocol", + [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", + [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", + [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", +}; + +const char *drbd_conn_str(enum drbd_conns s) +{ + /* enums are unsigned... */ + return s > C_PAUSED_SYNC_T ? "TOO_LARGE" : drbd_conn_s_names[s]; +} + +const char *drbd_role_str(enum drbd_role s) +{ + return s > R_SECONDARY ? "TOO_LARGE" : drbd_role_s_names[s]; +} + +const char *drbd_disk_str(enum drbd_disk_state s) +{ + return s > D_UP_TO_DATE ? "TOO_LARGE" : drbd_disk_s_names[s]; +} + +const char *drbd_set_st_err_str(enum drbd_state_ret_codes err) +{ + return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" : + err > SS_TWO_PRIMARIES ? "TOO_LARGE" + : drbd_state_sw_errors[-err]; +} diff --git a/drivers/block/drbd/drbd_tracing.c b/drivers/block/drbd/drbd_tracing.c new file mode 100644 index 00000000000..d18d4f7b4be --- /dev/null +++ b/drivers/block/drbd/drbd_tracing.c @@ -0,0 +1,752 @@ +/* + drbd_tracing.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include "drbd_int.h" +#include "drbd_tracing.h" +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg"); +MODULE_DESCRIPTION("DRBD tracepoint probes"); +MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace see drbd_tracing.c"); +MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)"); +MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)"); + +unsigned int trace_mask = 0; /* Bitmap of events to trace */ +int trace_level; /* Current trace level */ +int trace_devs; /* Bitmap of devices to trace */ + +module_param(trace_mask, uint, 0444); +module_param(trace_level, int, 0644); +module_param(trace_devs, int, 0644); + +enum { + TRACE_PACKET = 0x0001, + TRACE_RQ = 0x0002, + TRACE_UUID = 0x0004, + TRACE_RESYNC = 0x0008, + TRACE_EE = 0x0010, + TRACE_UNPLUG = 0x0020, + TRACE_NL = 0x0040, + TRACE_AL_EXT = 0x0080, + TRACE_INT_RQ = 0x0100, + TRACE_MD_IO = 0x0200, + TRACE_EPOCH = 0x0400, +}; + +/* Buffer printing support + * dbg_print_flags: used for Flags arg to drbd_print_buffer + * - DBGPRINT_BUFFADDR; if set, each line starts with the + * virtual address of the line being output. If clear, + * each line starts with the offset from the beginning + * of the buffer. */ +enum dbg_print_flags { + DBGPRINT_BUFFADDR = 0x0001, +}; + +/* Macro stuff */ +static char *nl_packet_name(int packet_type) +{ +/* Generate packet type strings */ +#define NL_PACKET(name, number, fields) \ + [P_ ## name] = # name, +#define NL_INTEGER Argh! +#define NL_BIT Argh! +#define NL_INT64 Argh! +#define NL_STRING Argh! + + static char *nl_tag_name[P_nl_after_last_packet] = { +#include "linux/drbd_nl.h" + }; + + return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ? + nl_tag_name[packet_type] : "*Unknown*"; +} +/* /Macro stuff */ + +static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level) +{ + return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs); +} + +static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg) +{ + if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) + return; + + dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt)); +} + +static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index) +{ + static char *uuid_str[UI_EXTENDED_SIZE] = { + [UI_CURRENT] = "CURRENT", + [UI_BITMAP] = "BITMAP", + [UI_HISTORY_START] = "HISTORY_START", + [UI_HISTORY_END] = "HISTORY_END", + [UI_SIZE] = "SIZE", + [UI_FLAGS] = "FLAGS", + }; + + if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) + return; + + if (index >= UI_EXTENDED_SIZE) { + dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n"); + return; + } + + dev_info(DEV, " uuid[%s] now %016llX\n", + uuid_str[index], + (unsigned long long)mdev->ldev->md.uuid[index]); +} + +static void probe_drbd_md_io(struct drbd_conf *mdev, int rw, + struct drbd_backing_dev *bdev) +{ + if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) + return; + + dev_info(DEV, " %s metadata superblock now\n", + rw == READ ? "Reading" : "Writing"); +} + +static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg) +{ + if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) + return; + + dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n", + msg, (unsigned long long)e->sector, e->size, e); +} + +static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch, + enum epoch_event ev) +{ + static char *epoch_event_str[] = { + [EV_PUT] = "put", + [EV_GOT_BARRIER_NR] = "got_barrier_nr", + [EV_BARRIER_DONE] = "barrier_done", + [EV_BECAME_LAST] = "became_last", + [EV_TRACE_FLUSH] = "issuing_flush", + [EV_TRACE_ADD_BARRIER] = "added_barrier", + [EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch", + }; + + if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) + return; + + ev &= ~EV_CLEANUP; + + switch (ev) { + case EV_TRACE_ALLOC: + dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs); + break; + case EV_TRACE_FREE: + dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n", + epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), + mdev->epochs); + break; + default: + dev_info(DEV, "Update epoch %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n", + epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), + atomic_read(&epoch->active), + test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-', + test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-', + test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-', + test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-', + epoch_event_str[ev]); + } +} + +static void probe_drbd_netlink(void *data, int is_req) +{ + struct cn_msg *msg = data; + + if (is_req) { + struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data; + + printk(KERN_INFO "drbd%d: " + "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n", + nlp->drbd_minor, + nl_packet_name(nlp->packet_type), + nlp->packet_type, + msg->seq, msg->ack, msg->len); + } else { + struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data; + + printk(KERN_INFO "drbd%d: " + "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n", + nlp->minor, + nlp->packet_type == P_nl_after_last_packet ? + "Empty-Reply" : nl_packet_name(nlp->packet_type), + nlp->packet_type, + msg->seq, msg->ack, msg->len); + } +} + +static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg) +{ + unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); + + if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) + return; + + dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n", + msg, (unsigned long long) sector, enr, + (int)BM_SECT_TO_EXT(sector)); +} + +/** + * drbd_print_buffer() - Hexdump arbitrary binary data into a buffer + * @prefix: String is output at the beginning of each line output. + * @flags: Currently only defined flag: DBGPRINT_BUFFADDR; if set, each + * line starts with the virtual address of the line being + * output. If clear, each line starts with the offset from the + * beginning of the buffer. + * @size: Indicates the size of each entry in the buffer. Supported + * values are sizeof(char), sizeof(short) and sizeof(int) + * @buffer: Start address of buffer + * @buffer_va: Virtual address of start of buffer (normally the same + * as Buffer, but having it separate allows it to hold + * file address for example) + * @length: length of buffer + */ +static void drbd_print_buffer(const char *prefix, unsigned int flags, int size, + const void *buffer, const void *buffer_va, + unsigned int length) + +#define LINE_SIZE 16 +#define LINE_ENTRIES (int)(LINE_SIZE/size) +{ + const unsigned char *pstart; + const unsigned char *pstart_va; + const unsigned char *pend; + char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8]; + char *pbytes = bytes_str, *pascii = ascii_str; + int offset = 0; + long sizemask; + int field_width; + int index; + const unsigned char *pend_str; + const unsigned char *p; + int count; + + /* verify size parameter */ + if (size != sizeof(char) && + size != sizeof(short) && + size != sizeof(int)) { + printk(KERN_DEBUG "drbd_print_buffer: " + "ERROR invalid size %d\n", size); + return; + } + + sizemask = size-1; + field_width = size*2; + + /* Adjust start/end to be on appropriate boundary for size */ + buffer = (const char *)((long)buffer & ~sizemask); + pend = (const unsigned char *) + (((long)buffer + length + sizemask) & ~sizemask); + + if (flags & DBGPRINT_BUFFADDR) { + /* Move start back to nearest multiple of line size, + * if printing address. This results in nicely formatted output + * with addresses being on line size (16) byte boundaries */ + pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1)); + } else { + pstart = (const unsigned char *)buffer; + } + + /* Set value of start VA to print if addresses asked for */ + pstart_va = (const unsigned char *)buffer_va + - ((const unsigned char *)buffer-pstart); + + /* Calculate end position to nicely align right hand side */ + pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1)); + + /* Init strings */ + *pbytes = *pascii = '\0'; + + /* Start at beginning of first line */ + p = pstart; + count = 0; + + while (p < pend_str) { + if (p < (const unsigned char *)buffer || p >= pend) { + /* Before start of buffer or after end- print spaces */ + pbytes += sprintf(pbytes, "%*c ", field_width, ' '); + pascii += sprintf(pascii, "%*c", size, ' '); + p += size; + } else { + /* Add hex and ascii to strings */ + int val; + switch (size) { + default: + case 1: + val = *(unsigned char *)p; + break; + case 2: + val = *(unsigned short *)p; + break; + case 4: + val = *(unsigned int *)p; + break; + } + + pbytes += sprintf(pbytes, "%0*x ", field_width, val); + + for (index = size; index; index--) { + *pascii++ = isprint(*p) ? *p : '.'; + p++; + } + } + + count++; + + if (count == LINE_ENTRIES || p >= pend_str) { + /* Null terminate and print record */ + *pascii = '\0'; + printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n", + prefix, + (flags & DBGPRINT_BUFFADDR) + ? (long)pstart_va:(long)offset, + LINE_ENTRIES*(field_width+1), bytes_str, + LINE_SIZE, ascii_str); + + /* Move onto next line */ + pstart_va += (p-pstart); + pstart = p; + count = 0; + offset += LINE_SIZE; + + /* Re-init strings */ + pbytes = bytes_str; + pascii = ascii_str; + *pbytes = *pascii = '\0'; + } + } +} + +static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args) +{ + char str[256]; + + if (!is_mdev_trace(mdev, level)) + return; + + if (vsnprintf(str, 256, fmt, args) >= 256) + str[255] = 0; + + printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)), + dev_name(disk_to_dev(mdev->vdisk)), str); +} + +static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, + struct drbd_request *r) +{ +#if defined(CONFIG_LBDAF) || defined(CONFIG_LBD) +#define SECTOR_FORMAT "%Lx" +#else +#define SECTOR_FORMAT "%lx" +#endif +#define SECTOR_SHIFT 9 + + unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT); + char *faddr = (char *)(lowaddr); + char rb[sizeof(void *)*2+6] = { 0, }; + struct bio_vec *bvec; + int segno; + + const int rw = bio->bi_rw; + const int biorw = (rw & (RW_MASK|RWA_MASK)); + const int biobarrier = (rw & (1<>>", + pfx, + biorw == WRITE ? "Write" : "Read", + biobarrier ? " : B" : "", + biosync ? " : S" : "", + bio, + rb, + complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "", + bio->bi_sector << SECTOR_SHIFT, + bio->bi_size); + + if (trace_level >= TRACE_LVL_METRICS && + ((biorw == WRITE) ^ complete)) { + printk(KERN_DEBUG " ind page offset length\n"); + __bio_for_each_segment(bvec, bio, segno, 0) { + printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n", segno, + bvec->bv_page, bvec->bv_offset, bvec->bv_len); + + if (trace_level >= TRACE_LVL_ALL) { + char *bvec_buf; + unsigned long flags; + + bvec_buf = bvec_kmap_irq(bvec, &flags); + + drbd_print_buffer(" ", DBGPRINT_BUFFADDR, 1, + bvec_buf, + faddr, + (bvec->bv_len <= 0x80) + ? bvec->bv_len : 0x80); + + bvec_kunmap_irq(bvec_buf, &flags); + + if (bvec->bv_len > 0x40) + printk(KERN_DEBUG " ....\n"); + + faddr += bvec->bv_len; + } + } + } +} + +static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg) +{ + static const char *rq_event_names[] = { + [created] = "created", + [to_be_send] = "to_be_send", + [to_be_submitted] = "to_be_submitted", + [queue_for_net_write] = "queue_for_net_write", + [queue_for_net_read] = "queue_for_net_read", + [send_canceled] = "send_canceled", + [send_failed] = "send_failed", + [handed_over_to_network] = "handed_over_to_network", + [connection_lost_while_pending] = + "connection_lost_while_pending", + [recv_acked_by_peer] = "recv_acked_by_peer", + [write_acked_by_peer] = "write_acked_by_peer", + [neg_acked] = "neg_acked", + [conflict_discarded_by_peer] = "conflict_discarded_by_peer", + [barrier_acked] = "barrier_acked", + [data_received] = "data_received", + [read_completed_with_error] = "read_completed_with_error", + [read_ahead_completed_with_error] = "reada_completed_with_error", + [write_completed_with_error] = "write_completed_with_error", + [completed_ok] = "completed_ok", + }; + + struct drbd_conf *mdev = req->mdev; + + const int rw = (req->master_bio == NULL || + bio_data_dir(req->master_bio) == WRITE) ? + 'W' : 'R'; + const unsigned long s = req->rq_state; + + if (what != nothing) { + dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]); + } else { + dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n", + msg, req, rw, + s & RQ_LOCAL_PENDING ? 'p' : '-', + s & RQ_LOCAL_COMPLETED ? 'c' : '-', + s & RQ_LOCAL_OK ? 'o' : '-', + s & RQ_NET_PENDING ? 'p' : '-', + s & RQ_NET_QUEUED ? 'q' : '-', + s & RQ_NET_SENT ? 's' : '-', + s & RQ_NET_DONE ? 'd' : '-', + s & RQ_NET_OK ? 'o' : '-', + req->epoch, + (unsigned long long)req->sector, + req->size, + drbd_conn_str(mdev->state.conn)); + } +} + + +#define drbd_peer_str drbd_role_str +#define drbd_pdsk_str drbd_disk_str + +#define PSM(A) \ +do { \ + if (mask.A) { \ + int i = snprintf(p, len, " " #A "( %s )", \ + drbd_##A##_str(val.A)); \ + if (i >= len) \ + return op; \ + p += i; \ + len -= i; \ + } \ +} while (0) + +static char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val) +{ + char *op = p; + *p = '\0'; + PSM(role); + PSM(peer); + PSM(conn); + PSM(disk); + PSM(pdsk); + + return op; +} + +#define INFOP(fmt, args...) \ +do { \ + if (trace_level >= TRACE_LVL_ALL) { \ + dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt , \ + file, line, current->comm, current->pid, \ + sockname, recv ? "<<<" : ">>>" , \ + ## args); \ + } else { \ + dev_info(DEV, "%s %s " fmt, sockname, \ + recv ? "<<<" : ">>>" , \ + ## args); \ + } \ +} while (0) + +static char *_dump_block_id(u64 block_id, char *buff) +{ + if (is_syncer_block_id(block_id)) + strcpy(buff, "SyncerId"); + else + sprintf(buff, "%llx", (unsigned long long)block_id); + + return buff; +} + +static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock, + int recv, union p_polymorph *p, char *file, int line) +{ + char *sockname = sock == mdev->meta.socket ? "meta" : "data"; + int cmd = (recv == 2) ? p->header.command : be16_to_cpu(p->header.command); + char tmp[300]; + union drbd_state m, v; + + switch (cmd) { + case P_HAND_SHAKE: + INFOP("%s (protocol %u-%u)\n", cmdname(cmd), + be32_to_cpu(p->handshake.protocol_min), + be32_to_cpu(p->handshake.protocol_max)); + break; + + case P_BITMAP: /* don't report this */ + case P_COMPRESSED_BITMAP: /* don't report this */ + break; + + case P_DATA: + INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd), + (unsigned long long)be64_to_cpu(p->data.sector), + _dump_block_id(p->data.block_id, tmp), + be32_to_cpu(p->data.seq_num), + be32_to_cpu(p->data.dp_flags) + ); + break; + + case P_DATA_REPLY: + case P_RS_DATA_REPLY: + INFOP("%s (sector %llus, id %s)\n", cmdname(cmd), + (unsigned long long)be64_to_cpu(p->data.sector), + _dump_block_id(p->data.block_id, tmp) + ); + break; + + case P_RECV_ACK: + case P_WRITE_ACK: + case P_RS_WRITE_ACK: + case P_DISCARD_ACK: + case P_NEG_ACK: + case P_NEG_RS_DREPLY: + INFOP("%s (sector %llus, size %u, id %s, seq %u)\n", + cmdname(cmd), + (long long)be64_to_cpu(p->block_ack.sector), + be32_to_cpu(p->block_ack.blksize), + _dump_block_id(p->block_ack.block_id, tmp), + be32_to_cpu(p->block_ack.seq_num) + ); + break; + + case P_DATA_REQUEST: + case P_RS_DATA_REQUEST: + INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd), + (long long)be64_to_cpu(p->block_req.sector), + be32_to_cpu(p->block_req.blksize), + _dump_block_id(p->block_req.block_id, tmp) + ); + break; + + case P_BARRIER: + case P_BARRIER_ACK: + INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier); + break; + + case P_SYNC_PARAM: + case P_SYNC_PARAM89: + INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n", + cmdname(cmd), be32_to_cpu(p->rs_param_89.rate), + p->rs_param_89.verify_alg, p->rs_param_89.csums_alg); + break; + + case P_UUIDS: + INFOP("%s Curr:%016llX, Bitmap:%016llX, " + "HisSt:%016llX, HisEnd:%016llX\n", + cmdname(cmd), + (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]), + (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]), + (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]), + (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END])); + break; + + case P_SIZES: + INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, " + "max bio %x, q order %x)\n", + cmdname(cmd), + (long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)), + (long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)), + (long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)), + be32_to_cpu(p->sizes.max_segment_size), + be32_to_cpu(p->sizes.queue_order_type)); + break; + + case P_STATE: + v.i = be32_to_cpu(p->state.state); + m.i = 0xffffffff; + dump_st(tmp, sizeof(tmp), m, v); + INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp); + break; + + case P_STATE_CHG_REQ: + m.i = be32_to_cpu(p->req_state.mask); + v.i = be32_to_cpu(p->req_state.val); + dump_st(tmp, sizeof(tmp), m, v); + INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp); + break; + + case P_STATE_CHG_REPLY: + INFOP("%s (ret %x)\n", cmdname(cmd), + be32_to_cpu(p->req_state_reply.retcode)); + break; + + case P_PING: + case P_PING_ACK: + /* + * Dont trace pings at summary level + */ + if (trace_level < TRACE_LVL_ALL) + break; + /* fall through... */ + default: + INFOP("%s (%u)\n", cmdname(cmd), cmd); + break; + } +} + + +static int __init drbd_trace_init(void) +{ + int ret; + + if (trace_mask & TRACE_UNPLUG) { + ret = register_trace_drbd_unplug(probe_drbd_unplug); + WARN_ON(ret); + } + if (trace_mask & TRACE_UUID) { + ret = register_trace_drbd_uuid(probe_drbd_uuid); + WARN_ON(ret); + } + if (trace_mask & TRACE_EE) { + ret = register_trace_drbd_ee(probe_drbd_ee); + WARN_ON(ret); + } + if (trace_mask & TRACE_PACKET) { + ret = register_trace_drbd_packet(probe_drbd_packet); + WARN_ON(ret); + } + if (trace_mask & TRACE_MD_IO) { + ret = register_trace_drbd_md_io(probe_drbd_md_io); + WARN_ON(ret); + } + if (trace_mask & TRACE_EPOCH) { + ret = register_trace_drbd_epoch(probe_drbd_epoch); + WARN_ON(ret); + } + if (trace_mask & TRACE_NL) { + ret = register_trace_drbd_netlink(probe_drbd_netlink); + WARN_ON(ret); + } + if (trace_mask & TRACE_AL_EXT) { + ret = register_trace_drbd_actlog(probe_drbd_actlog); + WARN_ON(ret); + } + if (trace_mask & TRACE_RQ) { + ret = register_trace_drbd_bio(probe_drbd_bio); + WARN_ON(ret); + } + if (trace_mask & TRACE_INT_RQ) { + ret = register_trace_drbd_req(probe_drbd_req); + WARN_ON(ret); + } + if (trace_mask & TRACE_RESYNC) { + ret = register_trace__drbd_resync(probe_drbd_resync); + WARN_ON(ret); + } + return 0; +} + +module_init(drbd_trace_init); + +static void __exit drbd_trace_exit(void) +{ + if (trace_mask & TRACE_UNPLUG) + unregister_trace_drbd_unplug(probe_drbd_unplug); + if (trace_mask & TRACE_UUID) + unregister_trace_drbd_uuid(probe_drbd_uuid); + if (trace_mask & TRACE_EE) + unregister_trace_drbd_ee(probe_drbd_ee); + if (trace_mask & TRACE_PACKET) + unregister_trace_drbd_packet(probe_drbd_packet); + if (trace_mask & TRACE_MD_IO) + unregister_trace_drbd_md_io(probe_drbd_md_io); + if (trace_mask & TRACE_EPOCH) + unregister_trace_drbd_epoch(probe_drbd_epoch); + if (trace_mask & TRACE_NL) + unregister_trace_drbd_netlink(probe_drbd_netlink); + if (trace_mask & TRACE_AL_EXT) + unregister_trace_drbd_actlog(probe_drbd_actlog); + if (trace_mask & TRACE_RQ) + unregister_trace_drbd_bio(probe_drbd_bio); + if (trace_mask & TRACE_INT_RQ) + unregister_trace_drbd_req(probe_drbd_req); + if (trace_mask & TRACE_RESYNC) + unregister_trace__drbd_resync(probe_drbd_resync); + + tracepoint_synchronize_unregister(); +} + +module_exit(drbd_trace_exit); diff --git a/drivers/block/drbd/drbd_tracing.h b/drivers/block/drbd/drbd_tracing.h new file mode 100644 index 00000000000..c4531a137f6 --- /dev/null +++ b/drivers/block/drbd/drbd_tracing.h @@ -0,0 +1,87 @@ +/* + drbd_tracing.h + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#ifndef DRBD_TRACING_H +#define DRBD_TRACING_H + +#include +#include "drbd_int.h" +#include "drbd_req.h" + +enum { + TRACE_LVL_ALWAYS = 0, + TRACE_LVL_SUMMARY, + TRACE_LVL_METRICS, + TRACE_LVL_ALL, + TRACE_LVL_MAX +}; + +DECLARE_TRACE(drbd_unplug, + TP_PROTO(struct drbd_conf *mdev, char* msg), + TP_ARGS(mdev, msg)); + +DECLARE_TRACE(drbd_uuid, + TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index), + TP_ARGS(mdev, index)); + +DECLARE_TRACE(drbd_ee, + TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg), + TP_ARGS(mdev, e, msg)); + +DECLARE_TRACE(drbd_md_io, + TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev), + TP_ARGS(mdev, rw, bdev)); + +DECLARE_TRACE(drbd_epoch, + TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev), + TP_ARGS(mdev, epoch, ev)); + +DECLARE_TRACE(drbd_netlink, + TP_PROTO(void *data, int is_req), + TP_ARGS(data, is_req)); + +DECLARE_TRACE(drbd_actlog, + TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg), + TP_ARGS(mdev, sector, msg)); + +DECLARE_TRACE(drbd_bio, + TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, + struct drbd_request *r), + TP_ARGS(mdev, pfx, bio, complete, r)); + +DECLARE_TRACE(drbd_req, + TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg), + TP_ARGS(req, what, msg)); + +DECLARE_TRACE(drbd_packet, + TP_PROTO(struct drbd_conf *mdev, struct socket *sock, + int recv, union p_polymorph *p, char *file, int line), + TP_ARGS(mdev, sock, recv, p, file, line)); + +DECLARE_TRACE(_drbd_resync, + TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args), + TP_ARGS(mdev, level, fmt, args)); + +#endif diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h new file mode 100644 index 00000000000..fc824006e72 --- /dev/null +++ b/drivers/block/drbd/drbd_vli.h @@ -0,0 +1,351 @@ +/* +-*- linux-c -*- + drbd_receiver.c + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _DRBD_VLI_H +#define _DRBD_VLI_H + +/* + * At a granularity of 4KiB storage represented per bit, + * and stroage sizes of several TiB, + * and possibly small-bandwidth replication, + * the bitmap transfer time can take much too long, + * if transmitted in plain text. + * + * We try to reduce the transfered bitmap information + * by encoding runlengths of bit polarity. + * + * We never actually need to encode a "zero" (runlengths are positive). + * But then we have to store the value of the first bit. + * The first bit of information thus shall encode if the first runlength + * gives the number of set or unset bits. + * + * We assume that large areas are either completely set or unset, + * which gives good compression with any runlength method, + * even when encoding the runlength as fixed size 32bit/64bit integers. + * + * Still, there may be areas where the polarity flips every few bits, + * and encoding the runlength sequence of those areas with fix size + * integers would be much worse than plaintext. + * + * We want to encode small runlength values with minimum code length, + * while still being able to encode a Huge run of all zeros. + * + * Thus we need a Variable Length Integer encoding, VLI. + * + * For some cases, we produce more code bits than plaintext input. + * We need to send incompressible chunks as plaintext, skip over them + * and then see if the next chunk compresses better. + * + * We don't care too much about "excellent" compression ratio for large + * runlengths (all set/all clear): whether we achieve a factor of 100 + * or 1000 is not that much of an issue. + * We do not want to waste too much on short runlengths in the "noisy" + * parts of the bitmap, though. + * + * There are endless variants of VLI, we experimented with: + * * simple byte-based + * * various bit based with different code word length. + * + * To avoid yet an other configuration parameter (choice of bitmap compression + * algorithm) which was difficult to explain and tune, we just chose the one + * variant that turned out best in all test cases. + * Based on real world usage patterns, with device sizes ranging from a few GiB + * to several TiB, file server/mailserver/webserver/mysql/postgress, + * mostly idle to really busy, the all time winner (though sometimes only + * marginally better) is: + */ + +/* + * encoding is "visualised" as + * __little endian__ bitstream, least significant bit first (left most) + * + * this particular encoding is chosen so that the prefix code + * starts as unary encoding the level, then modified so that + * 10 levels can be described in 8bit, with minimal overhead + * for the smaller levels. + * + * Number of data bits follow fibonacci sequence, with the exception of the + * last level (+1 data bit, so it makes 64bit total). The only worse code when + * encoding bit polarity runlength is 1 plain bits => 2 code bits. +prefix data bits max val Nº data bits +0 x 0x2 1 +10 x 0x4 1 +110 xx 0x8 2 +1110 xxx 0x10 3 +11110 xxx xx 0x30 5 +111110 xx xxxxxx 0x130 8 +11111100 xxxxxxxx xxxxx 0x2130 13 +11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 +11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 +11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 + * maximum encodable value: 0x100000400202130 == 2**56 + some */ + +/* compression "table": + transmitted x 0.29 + as plaintext x ........................ + x ........................ + x ........................ + x 0.59 0.21........................ + x ........................................................ + x .. c ................................................... + x 0.44.. o ................................................... + x .......... d ................................................... + x .......... e ................................................... + X............. ................................................... + x.............. b ................................................... +2.0x............... i ................................................... + #X................ t ................................................... + #................. s ........................... plain bits .......... +-+----------------------------------------------------------------------- + 1 16 32 64 +*/ + +/* LEVEL: (total bits, prefix bits, prefix value), + * sorted ascending by number of total bits. + * The rest of the code table is calculated at compiletime from this. */ + +/* fibonacci data 1, 1, ... */ +#define VLI_L_1_1() do { \ + LEVEL( 2, 1, 0x00); \ + LEVEL( 3, 2, 0x01); \ + LEVEL( 5, 3, 0x03); \ + LEVEL( 7, 4, 0x07); \ + LEVEL(10, 5, 0x0f); \ + LEVEL(14, 6, 0x1f); \ + LEVEL(21, 8, 0x3f); \ + LEVEL(29, 8, 0x7f); \ + LEVEL(42, 8, 0xbf); \ + LEVEL(64, 8, 0xff); \ + } while (0) + +/* finds a suitable level to decode the least significant part of in. + * returns number of bits consumed. + * + * BUG() for bad input, as that would mean a buggy code table. */ +static inline int vli_decode_bits(u64 *out, const u64 in) +{ + u64 adj = 1; + +#define LEVEL(t,b,v) \ + do { \ + if ((in & ((1 << b) -1)) == v) { \ + *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ + return t; \ + } \ + adj += 1ULL << (t - b); \ + } while (0) + + VLI_L_1_1(); + + /* NOT REACHED, if VLI_LEVELS code table is defined properly */ + BUG(); +#undef LEVEL +} + +/* return number of code bits needed, + * or negative error number */ +static inline int __vli_encode_bits(u64 *out, const u64 in) +{ + u64 max = 0; + u64 adj = 1; + + if (in == 0) + return -EINVAL; + +#define LEVEL(t,b,v) do { \ + max += 1ULL << (t - b); \ + if (in <= max) { \ + if (out) \ + *out = ((in - adj) << b) | v; \ + return t; \ + } \ + adj = max + 1; \ + } while (0) + + VLI_L_1_1(); + + return -EOVERFLOW; +#undef LEVEL +} + +#undef VLI_L_1_1 + +/* code from here down is independend of actually used bit code */ + +/* + * Code length is determined by some unique (e.g. unary) prefix. + * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, + * not a byte stream. + */ + +/* for the bitstream, we need a cursor */ +struct bitstream_cursor { + /* the current byte */ + u8 *b; + /* the current bit within *b, nomalized: 0..7 */ + unsigned int bit; +}; + +/* initialize cursor to point to first bit of stream */ +static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) +{ + cur->b = s; + cur->bit = 0; +} + +/* advance cursor by that many bits; maximum expected input value: 64, + * but depending on VLI implementation, it may be more. */ +static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) +{ + bits += cur->bit; + cur->b = cur->b + (bits >> 3); + cur->bit = bits & 7; +} + +/* the bitstream itself knows its length */ +struct bitstream { + struct bitstream_cursor cur; + unsigned char *buf; + size_t buf_len; /* in bytes */ + + /* for input stream: + * number of trailing 0 bits for padding + * total number of valid bits in stream: buf_len * 8 - pad_bits */ + unsigned int pad_bits; +}; + +static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) +{ + bs->buf = s; + bs->buf_len = len; + bs->pad_bits = pad_bits; + bitstream_cursor_reset(&bs->cur, bs->buf); +} + +static inline void bitstream_rewind(struct bitstream *bs) +{ + bitstream_cursor_reset(&bs->cur, bs->buf); + memset(bs->buf, 0, bs->buf_len); +} + +/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. + * Ignores "pad_bits". + * Returns zero if bits == 0 (nothing to do). + * Returns number of bits used if successful. + * + * If there is not enough room left in bitstream, + * leaves bitstream unchanged and returns -ENOBUFS. + */ +static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) +{ + unsigned char *b = bs->cur.b; + unsigned int tmp; + + if (bits == 0) + return 0; + + if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) + return -ENOBUFS; + + /* paranoia: strip off hi bits; they should not be set anyways. */ + if (bits < 64) + val &= ~0ULL >> (64 - bits); + + *b++ |= (val & 0xff) << bs->cur.bit; + + for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) + *b++ |= (val >> tmp) & 0xff; + + bitstream_cursor_advance(&bs->cur, bits); + return bits; +} + +/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. + * + * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. + * + * If there are less than the requested number of valid bits left in the + * bitstream, still fetches all available bits. + * + * Returns number of actually fetched bits. + */ +static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) +{ + u64 val; + unsigned int n; + + if (bits > 64) + return -EINVAL; + + if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) + bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) + - bs->cur.bit - bs->pad_bits; + + if (bits == 0) { + *out = 0; + return 0; + } + + /* get the high bits */ + val = 0; + n = (bs->cur.bit + bits + 7) >> 3; + /* n may be at most 9, if cur.bit + bits > 64 */ + /* which means this copies at most 8 byte */ + if (n) { + memcpy(&val, bs->cur.b+1, n - 1); + val = le64_to_cpu(val) << (8 - bs->cur.bit); + } + + /* we still need the low bits */ + val |= bs->cur.b[0] >> bs->cur.bit; + + /* and mask out bits we don't want */ + val &= ~0ULL >> (64 - bits); + + bitstream_cursor_advance(&bs->cur, bits); + *out = val; + + return bits; +} + +/* encodes @in as vli into @bs; + + * return values + * > 0: number of bits successfully stored in bitstream + * -ENOBUFS @bs is full + * -EINVAL input zero (invalid) + * -EOVERFLOW input too large for this vli code (invalid) + */ +static inline int vli_encode_bits(struct bitstream *bs, u64 in) +{ + u64 code = code; + int bits = __vli_encode_bits(&code, in); + + if (bits <= 0) + return bits; + + return bitstream_put_bits(bs, code, bits); +} + +#endif diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c new file mode 100644 index 00000000000..212e9545e63 --- /dev/null +++ b/drivers/block/drbd/drbd_worker.c @@ -0,0 +1,1529 @@ +/* + drbd_worker.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 1999-2008, Philipp Reisner . + Copyright (C) 2002-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "drbd_int.h" +#include "drbd_req.h" +#include "drbd_tracing.h" + +#define SLEEP_TIME (HZ/10) + +static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel); + + + +/* defined here: + drbd_md_io_complete + drbd_endio_write_sec + drbd_endio_read_sec + drbd_endio_pri + + * more endio handlers: + atodb_endio in drbd_actlog.c + drbd_bm_async_io_complete in drbd_bitmap.c + + * For all these callbacks, note the following: + * The callbacks will be called in irq context by the IDE drivers, + * and in Softirqs/Tasklets/BH context by the SCSI drivers. + * Try to get the locking right :) + * + */ + + +/* About the global_state_lock + Each state transition on an device holds a read lock. In case we have + to evaluate the sync after dependencies, we grab a write lock, because + we need stable states on all devices for that. */ +rwlock_t global_state_lock; + +/* used for synchronous meta data and bitmap IO + * submitted by drbd_md_sync_page_io() + */ +void drbd_md_io_complete(struct bio *bio, int error) +{ + struct drbd_md_io *md_io; + + md_io = (struct drbd_md_io *)bio->bi_private; + md_io->error = error; + + trace_drbd_bio(md_io->mdev, "Md", bio, 1, NULL); + + complete(&md_io->event); +} + +/* reads on behalf of the partner, + * "submitted" by the receiver + */ +void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) +{ + unsigned long flags = 0; + struct drbd_epoch_entry *e = NULL; + struct drbd_conf *mdev; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + e = bio->bi_private; + mdev = e->mdev; + + if (error) + dev_warn(DEV, "read: error=%d s=%llus\n", error, + (unsigned long long)e->sector); + if (!error && !uptodate) { + dev_warn(DEV, "read: setting error to -EIO s=%llus\n", + (unsigned long long)e->sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + D_ASSERT(e->block_id != ID_VACANT); + + trace_drbd_bio(mdev, "Sec", bio, 1, NULL); + + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->read_cnt += e->size >> 9; + list_del(&e->w.list); + if (list_empty(&mdev->read_ee)) + wake_up(&mdev->ee_wait); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + drbd_chk_io_error(mdev, error, FALSE); + drbd_queue_work(&mdev->data.work, &e->w); + put_ldev(mdev); + + trace_drbd_ee(mdev, e, "read completed"); +} + +/* writes on behalf of the partner, or resync writes, + * "submitted" by the receiver. + */ +void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) +{ + unsigned long flags = 0; + struct drbd_epoch_entry *e = NULL; + struct drbd_conf *mdev; + sector_t e_sector; + int do_wake; + int is_syncer_req; + int do_al_complete_io; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + int is_barrier = bio_rw_flagged(bio, BIO_RW_BARRIER); + + e = bio->bi_private; + mdev = e->mdev; + + if (error) + dev_warn(DEV, "write: error=%d s=%llus\n", error, + (unsigned long long)e->sector); + if (!error && !uptodate) { + dev_warn(DEV, "write: setting error to -EIO s=%llus\n", + (unsigned long long)e->sector); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + /* error == -ENOTSUPP would be a better test, + * alas it is not reliable */ + if (error && is_barrier && e->flags & EE_IS_BARRIER) { + drbd_bump_write_ordering(mdev, WO_bdev_flush); + spin_lock_irqsave(&mdev->req_lock, flags); + list_del(&e->w.list); + e->w.cb = w_e_reissue; + /* put_ldev actually happens below, once we come here again. */ + __release(local); + spin_unlock_irqrestore(&mdev->req_lock, flags); + drbd_queue_work(&mdev->data.work, &e->w); + return; + } + + D_ASSERT(e->block_id != ID_VACANT); + + trace_drbd_bio(mdev, "Sec", bio, 1, NULL); + + spin_lock_irqsave(&mdev->req_lock, flags); + mdev->writ_cnt += e->size >> 9; + is_syncer_req = is_syncer_block_id(e->block_id); + + /* after we moved e to done_ee, + * we may no longer access it, + * it may be freed/reused already! + * (as soon as we release the req_lock) */ + e_sector = e->sector; + do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO; + + list_del(&e->w.list); /* has been on active_ee or sync_ee */ + list_add_tail(&e->w.list, &mdev->done_ee); + + trace_drbd_ee(mdev, e, "write completed"); + + /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, + * neither did we wake possibly waiting conflicting requests. + * done from "drbd_process_done_ee" within the appropriate w.cb + * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ + + do_wake = is_syncer_req + ? list_empty(&mdev->sync_ee) + : list_empty(&mdev->active_ee); + + if (error) + __drbd_chk_io_error(mdev, FALSE); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (is_syncer_req) + drbd_rs_complete_io(mdev, e_sector); + + if (do_wake) + wake_up(&mdev->ee_wait); + + if (do_al_complete_io) + drbd_al_complete_io(mdev, e_sector); + + wake_asender(mdev); + put_ldev(mdev); + +} + +/* read, readA or write requests on R_PRIMARY coming from drbd_make_request + */ +void drbd_endio_pri(struct bio *bio, int error) +{ + unsigned long flags; + struct drbd_request *req = bio->bi_private; + struct drbd_conf *mdev = req->mdev; + struct bio_and_error m; + enum drbd_req_event what; + int uptodate = bio_flagged(bio, BIO_UPTODATE); + + if (error) + dev_warn(DEV, "p %s: error=%d\n", + bio_data_dir(bio) == WRITE ? "write" : "read", error); + if (!error && !uptodate) { + dev_warn(DEV, "p %s: setting error to -EIO\n", + bio_data_dir(bio) == WRITE ? "write" : "read"); + /* strange behavior of some lower level drivers... + * fail the request by clearing the uptodate flag, + * but do not return any error?! */ + error = -EIO; + } + + trace_drbd_bio(mdev, "Pri", bio, 1, NULL); + + /* to avoid recursion in __req_mod */ + if (unlikely(error)) { + what = (bio_data_dir(bio) == WRITE) + ? write_completed_with_error + : (bio_rw(bio) == READA) + ? read_completed_with_error + : read_ahead_completed_with_error; + } else + what = completed_ok; + + bio_put(req->private_bio); + req->private_bio = ERR_PTR(error); + + spin_lock_irqsave(&mdev->req_lock, flags); + __req_mod(req, what, &m); + spin_unlock_irqrestore(&mdev->req_lock, flags); + + if (m.bio) + complete_master_bio(mdev, &m); +} + +int w_io_error(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + + /* NOTE: mdev->ldev can be NULL by the time we get here! */ + /* D_ASSERT(mdev->ldev->dc.on_io_error != EP_PASS_ON); */ + + /* the only way this callback is scheduled is from _req_may_be_done, + * when it is done and had a local write error, see comments there */ + drbd_req_free(req); + + return TRUE; +} + +int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + + /* We should not detach for read io-error, + * but try to WRITE the P_DATA_REPLY to the failed location, + * to give the disk the chance to relocate that block */ + + spin_lock_irq(&mdev->req_lock); + if (cancel || + mdev->state.conn < C_CONNECTED || + mdev->state.pdsk <= D_INCONSISTENT) { + _req_mod(req, send_canceled); + spin_unlock_irq(&mdev->req_lock); + dev_alert(DEV, "WE ARE LOST. Local IO failure, no peer.\n"); + return 1; + } + spin_unlock_irq(&mdev->req_lock); + + return w_send_read_req(mdev, w, 0); +} + +int w_resync_inactive(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + ERR_IF(cancel) return 1; + dev_err(DEV, "resync inactive, but callback triggered??\n"); + return 1; /* Simply ignore this! */ +} + +void drbd_csum(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *bio, void *digest) +{ + struct hash_desc desc; + struct scatterlist sg; + struct bio_vec *bvec; + int i; + + desc.tfm = tfm; + desc.flags = 0; + + sg_init_table(&sg, 1); + crypto_hash_init(&desc); + + __bio_for_each_segment(bvec, bio, i, 0) { + sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); + crypto_hash_update(&desc, &sg, sg.length); + } + crypto_hash_final(&desc, digest); +} + +static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int digest_size; + void *digest; + int ok; + + D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + return 1; + } + + if (likely(drbd_bio_uptodate(e->private_bio))) { + digest_size = crypto_hash_digestsize(mdev->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + + inc_rs_pending(mdev); + ok = drbd_send_drequest_csum(mdev, + e->sector, + e->size, + digest, + digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + dev_err(DEV, "kmalloc() of digest failed.\n"); + ok = 0; + } + } else + ok = 1; + + drbd_free_ee(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); + return ok; +} + +#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN) + +static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size) +{ + struct drbd_epoch_entry *e; + + if (!get_ldev(mdev)) + return 0; + + /* GFP_TRY, because if there is no memory available right now, this may + * be rescheduled for later. It is "only" background resync, after all. */ + e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY); + if (!e) { + put_ldev(mdev); + return 2; + } + + spin_lock_irq(&mdev->req_lock); + list_add(&e->w.list, &mdev->read_ee); + spin_unlock_irq(&mdev->req_lock); + + e->private_bio->bi_end_io = drbd_endio_read_sec; + e->private_bio->bi_rw = READ; + e->w.cb = w_e_send_csum; + + mdev->read_cnt += size >> 9; + drbd_generic_make_request(mdev, DRBD_FAULT_RS_RD, e->private_bio); + + return 1; +} + +void resync_timer_fn(unsigned long data) +{ + unsigned long flags; + struct drbd_conf *mdev = (struct drbd_conf *) data; + int queue; + + spin_lock_irqsave(&mdev->req_lock, flags); + + if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) { + queue = 1; + if (mdev->state.conn == C_VERIFY_S) + mdev->resync_work.cb = w_make_ov_request; + else + mdev->resync_work.cb = w_make_resync_request; + } else { + queue = 0; + mdev->resync_work.cb = w_resync_inactive; + } + + spin_unlock_irqrestore(&mdev->req_lock, flags); + + /* harmless race: list_empty outside data.work.q_lock */ + if (list_empty(&mdev->resync_work.list) && queue) + drbd_queue_work(&mdev->data.work, &mdev->resync_work); +} + +int w_make_resync_request(struct drbd_conf *mdev, + struct drbd_work *w, int cancel) +{ + unsigned long bit; + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + int max_segment_size = queue_max_segment_size(mdev->rq_queue); + int number, i, size, pe, mx; + int align, queued, sndbuf; + + if (unlikely(cancel)) + return 1; + + if (unlikely(mdev->state.conn < C_CONNECTED)) { + dev_err(DEV, "Confused in w_make_resync_request()! cstate < Connected"); + return 0; + } + + if (mdev->state.conn != C_SYNC_TARGET) + dev_err(DEV, "%s in w_make_resync_request\n", + drbd_conn_str(mdev->state.conn)); + + if (!get_ldev(mdev)) { + /* Since we only need to access mdev->rsync a + get_ldev_if_state(mdev,D_FAILED) would be sufficient, but + to continue resync with a broken disk makes no sense at + all */ + dev_err(DEV, "Disk broke down during resync!\n"); + mdev->resync_work.cb = w_resync_inactive; + return 1; + } + + number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + pe = atomic_read(&mdev->rs_pending_cnt); + + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket) + mx = mdev->data.socket->sk->sk_rcvbuf / sizeof(struct p_block_req); + else + mx = 1; + mutex_unlock(&mdev->data.mutex); + + /* For resync rates >160MB/sec, allow more pending RS requests */ + if (number > mx) + mx = number; + + /* Limit the number of pending RS requests to no more than the peer's receive buffer */ + if ((pe + number) > mx) { + number = mx - pe; + } + + for (i = 0; i < number; i++) { + /* Stop generating RS requests, when half of the send buffer is filled */ + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket) { + queued = mdev->data.socket->sk->sk_wmem_queued; + sndbuf = mdev->data.socket->sk->sk_sndbuf; + } else { + queued = 1; + sndbuf = 0; + } + mutex_unlock(&mdev->data.mutex); + if (queued > sndbuf / 2) + goto requeue; + +next_sector: + size = BM_BLOCK_SIZE; + bit = drbd_bm_find_next(mdev, mdev->bm_resync_fo); + + if (bit == -1UL) { + mdev->bm_resync_fo = drbd_bm_bits(mdev); + mdev->resync_work.cb = w_resync_inactive; + put_ldev(mdev); + return 1; + } + + sector = BM_BIT_TO_SECT(bit); + + if (drbd_try_rs_begin_io(mdev, sector)) { + mdev->bm_resync_fo = bit; + goto requeue; + } + mdev->bm_resync_fo = bit + 1; + + if (unlikely(drbd_bm_test_bit(mdev, bit) == 0)) { + drbd_rs_complete_io(mdev, sector); + goto next_sector; + } + +#if DRBD_MAX_SEGMENT_SIZE > BM_BLOCK_SIZE + /* try to find some adjacent bits. + * we stop if we have already the maximum req size. + * + * Additionally always align bigger requests, in order to + * be prepared for all stripe sizes of software RAIDs. + * + * we _do_ care about the agreed-upon q->max_segment_size + * here, as splitting up the requests on the other side is more + * difficult. the consequence is, that on lvm and md and other + * "indirect" devices, this is dead code, since + * q->max_segment_size will be PAGE_SIZE. + */ + align = 1; + for (;;) { + if (size + BM_BLOCK_SIZE > max_segment_size) + break; + + /* Be always aligned */ + if (sector & ((1<<(align+3))-1)) + break; + + /* do not cross extent boundaries */ + if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) + break; + /* now, is it actually dirty, after all? + * caution, drbd_bm_test_bit is tri-state for some + * obscure reason; ( b == 0 ) would get the out-of-band + * only accidentally right because of the "oddly sized" + * adjustment below */ + if (drbd_bm_test_bit(mdev, bit+1) != 1) + break; + bit++; + size += BM_BLOCK_SIZE; + if ((BM_BLOCK_SIZE << align) <= size) + align++; + i++; + } + /* if we merged some, + * reset the offset to start the next drbd_bm_find_next from */ + if (size > BM_BLOCK_SIZE) + mdev->bm_resync_fo = bit + 1; +#endif + + /* adjust very last sectors, in case we are oddly sized */ + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) { + switch (read_for_csum(mdev, sector, size)) { + case 0: /* Disk failure*/ + put_ldev(mdev); + return 0; + case 2: /* Allocation failed */ + drbd_rs_complete_io(mdev, sector); + mdev->bm_resync_fo = BM_SECT_TO_BIT(sector); + goto requeue; + /* case 1: everything ok */ + } + } else { + inc_rs_pending(mdev); + if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST, + sector, size, ID_SYNCER)) { + dev_err(DEV, "drbd_send_drequest() failed, aborting...\n"); + dec_rs_pending(mdev); + put_ldev(mdev); + return 0; + } + } + } + + if (mdev->bm_resync_fo >= drbd_bm_bits(mdev)) { + /* last syncer _request_ was sent, + * but the P_RS_DATA_REPLY not yet received. sync will end (and + * next sync group will resume), as soon as we receive the last + * resync data block, and the last bit is cleared. + * until then resync "work" is "inactive" ... + */ + mdev->resync_work.cb = w_resync_inactive; + put_ldev(mdev); + return 1; + } + + requeue: + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + put_ldev(mdev); + return 1; +} + +static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + int number, i, size; + sector_t sector; + const sector_t capacity = drbd_get_capacity(mdev->this_bdev); + + if (unlikely(cancel)) + return 1; + + if (unlikely(mdev->state.conn < C_CONNECTED)) { + dev_err(DEV, "Confused in w_make_ov_request()! cstate < Connected"); + return 0; + } + + number = SLEEP_TIME*mdev->sync_conf.rate / ((BM_BLOCK_SIZE/1024)*HZ); + if (atomic_read(&mdev->rs_pending_cnt) > number) + goto requeue; + + number -= atomic_read(&mdev->rs_pending_cnt); + + sector = mdev->ov_position; + for (i = 0; i < number; i++) { + if (sector >= capacity) { + mdev->resync_work.cb = w_resync_inactive; + return 1; + } + + size = BM_BLOCK_SIZE; + + if (drbd_try_rs_begin_io(mdev, sector)) { + mdev->ov_position = sector; + goto requeue; + } + + if (sector + (size>>9) > capacity) + size = (capacity-sector)<<9; + + inc_rs_pending(mdev); + if (!drbd_send_ov_request(mdev, sector, size)) { + dec_rs_pending(mdev); + return 0; + } + sector += BM_SECT_PER_BIT; + } + mdev->ov_position = sector; + + requeue: + mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME); + return 1; +} + + +int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + kfree(w); + ov_oos_print(mdev); + drbd_resync_finished(mdev); + + return 1; +} + +static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + kfree(w); + + drbd_resync_finished(mdev); + + return 1; +} + +int drbd_resync_finished(struct drbd_conf *mdev) +{ + unsigned long db, dt, dbdt; + unsigned long n_oos; + union drbd_state os, ns; + struct drbd_work *w; + char *khelper_cmd = NULL; + + /* Remove all elements from the resync LRU. Since future actions + * might set bits in the (main) bitmap, then the entries in the + * resync LRU would be wrong. */ + if (drbd_rs_del_all(mdev)) { + /* In case this is not possible now, most probably because + * there are P_RS_DATA_REPLY Packets lingering on the worker's + * queue (or even the read operations for those packets + * is not finished by now). Retry in 100ms. */ + + drbd_kick_lo(mdev); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ / 10); + w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC); + if (w) { + w->cb = w_resync_finished; + drbd_queue_work(&mdev->data.work, w); + return 1; + } + dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n"); + } + + dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ; + if (dt <= 0) + dt = 1; + db = mdev->rs_total; + dbdt = Bit2KB(db/dt); + mdev->rs_paused /= HZ; + + if (!get_ldev(mdev)) + goto out; + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + + /* This protects us against multiple calls (that can happen in the presence + of application IO), and against connectivity loss just before we arrive here. */ + if (os.conn <= C_CONNECTED) + goto out_unlock; + + ns = os; + ns.conn = C_CONNECTED; + + dev_info(DEV, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n", + (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) ? + "Online verify " : "Resync", + dt + mdev->rs_paused, mdev->rs_paused, dbdt); + + n_oos = drbd_bm_total_weight(mdev); + + if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) { + if (n_oos) { + dev_alert(DEV, "Online verify found %lu %dk block out of sync!\n", + n_oos, Bit2KB(1)); + khelper_cmd = "out-of-sync"; + } + } else { + D_ASSERT((n_oos - mdev->rs_failed) == 0); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) + khelper_cmd = "after-resync-target"; + + if (mdev->csums_tfm && mdev->rs_total) { + const unsigned long s = mdev->rs_same_csum; + const unsigned long t = mdev->rs_total; + const int ratio = + (t == 0) ? 0 : + (t < 100000) ? ((s*100)/t) : (s/(t/100)); + dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " + "transferred %luK total %luK\n", + ratio, + Bit2KB(mdev->rs_same_csum), + Bit2KB(mdev->rs_total - mdev->rs_same_csum), + Bit2KB(mdev->rs_total)); + } + } + + if (mdev->rs_failed) { + dev_info(DEV, " %lu failed blocks\n", mdev->rs_failed); + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + ns.disk = D_INCONSISTENT; + ns.pdsk = D_UP_TO_DATE; + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_INCONSISTENT; + } + } else { + ns.disk = D_UP_TO_DATE; + ns.pdsk = D_UP_TO_DATE; + + if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) { + if (mdev->p_uuid) { + int i; + for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++) + _drbd_uuid_set(mdev, i, mdev->p_uuid[i]); + drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_CURRENT]); + _drbd_uuid_set(mdev, UI_CURRENT, mdev->p_uuid[UI_CURRENT]); + } else { + dev_err(DEV, "mdev->p_uuid is NULL! BUG\n"); + } + } + + drbd_uuid_set_bm(mdev, 0UL); + + if (mdev->p_uuid) { + /* Now the two UUID sets are equal, update what we + * know of the peer. */ + int i; + for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++) + mdev->p_uuid[i] = mdev->ldev->md.uuid[i]; + } + } + + _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); +out_unlock: + spin_unlock_irq(&mdev->req_lock); + put_ldev(mdev); +out: + mdev->rs_total = 0; + mdev->rs_failed = 0; + mdev->rs_paused = 0; + mdev->ov_start_sector = 0; + + if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) { + dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n"); + drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished"); + } + + if (khelper_cmd) + drbd_khelper(mdev, khelper_cmd); + + return 1; +} + +/* helper */ +static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e) +{ + if (drbd_bio_has_active_page(e->private_bio)) { + /* This might happen if sendpage() has not finished */ + spin_lock_irq(&mdev->req_lock); + list_add_tail(&e->w.list, &mdev->net_ee); + spin_unlock_irq(&mdev->req_lock); + } else + drbd_free_ee(mdev, e); +} + +/** + * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int ok; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + if (likely(drbd_bio_uptodate(e->private_bio))) { + ok = drbd_send_block(mdev, P_DATA_REPLY, e); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegDReply. sector=%llus.\n", + (unsigned long long)e->sector); + + ok = drbd_send_ack(mdev, P_NEG_DREPLY, e); + } + + dec_unacked(mdev); + + move_to_net_ee_or_free(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_block() failed\n"); + return ok; +} + +/** + * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int ok; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + if (get_ldev_if_state(mdev, D_FAILED)) { + drbd_rs_complete_io(mdev, e->sector); + put_ldev(mdev); + } + + if (likely(drbd_bio_uptodate(e->private_bio))) { + if (likely(mdev->state.pdsk >= D_INCONSISTENT)) { + inc_rs_pending(mdev); + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Not sending RSDataReply, " + "partner DISKLESS!\n"); + ok = 1; + } + } else { + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegRSDReply. sector %llus.\n", + (unsigned long long)e->sector); + + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + + /* update resync data with failure */ + drbd_rs_failed_io(mdev, e->sector, e->size); + } + + dec_unacked(mdev); + + move_to_net_ee_or_free(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_block() failed\n"); + return ok; +} + +int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct digest_info *di; + int digest_size; + void *digest = NULL; + int ok, eq = 0; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + drbd_rs_complete_io(mdev, e->sector); + + di = (struct digest_info *)(unsigned long)e->block_id; + + if (likely(drbd_bio_uptodate(e->private_bio))) { + /* quick hack to try to avoid a race against reconfiguration. + * a real fix would be much more involved, + * introducing more locking mechanisms */ + if (mdev->csums_tfm) { + digest_size = crypto_hash_digestsize(mdev->csums_tfm); + D_ASSERT(digest_size == di->digest_size); + digest = kmalloc(digest_size, GFP_NOIO); + } + if (digest) { + drbd_csum(mdev, mdev->csums_tfm, e->private_bio, digest); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + + if (eq) { + drbd_set_in_sync(mdev, e->sector, e->size); + mdev->rs_same_csum++; + ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e); + } else { + inc_rs_pending(mdev); + e->block_id = ID_SYNCER; + ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e); + } + } else { + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); + } + + dec_unacked(mdev); + + kfree(di); + + move_to_net_ee_or_free(mdev, e); + + if (unlikely(!ok)) + dev_err(DEV, "drbd_send_block/ack() failed\n"); + return ok; +} + +int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + int digest_size; + void *digest; + int ok = 1; + + if (unlikely(cancel)) + goto out; + + if (unlikely(!drbd_bio_uptodate(e->private_bio))) + goto out; + + digest_size = crypto_hash_digestsize(mdev->verify_tfm); + /* FIXME if this allocation fails, online verify will not terminate! */ + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + inc_rs_pending(mdev); + ok = drbd_send_drequest_csum(mdev, e->sector, e->size, + digest, digest_size, P_OV_REPLY); + if (!ok) + dec_rs_pending(mdev); + kfree(digest); + } + +out: + drbd_free_ee(mdev, e); + + dec_unacked(mdev); + + return ok; +} + +void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size) +{ + if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) { + mdev->ov_last_oos_size += size>>9; + } else { + mdev->ov_last_oos_start = sector; + mdev->ov_last_oos_size = size>>9; + } + drbd_set_out_of_sync(mdev, sector, size); + set_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags); +} + +int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + struct digest_info *di; + int digest_size; + void *digest; + int ok, eq = 0; + + if (unlikely(cancel)) { + drbd_free_ee(mdev, e); + dec_unacked(mdev); + return 1; + } + + /* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all + * the resync lru has been cleaned up already */ + drbd_rs_complete_io(mdev, e->sector); + + di = (struct digest_info *)(unsigned long)e->block_id; + + if (likely(drbd_bio_uptodate(e->private_bio))) { + digest_size = crypto_hash_digestsize(mdev->verify_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + drbd_csum(mdev, mdev->verify_tfm, e->private_bio, digest); + + D_ASSERT(digest_size == di->digest_size); + eq = !memcmp(digest, di->digest, digest_size); + kfree(digest); + } + } else { + ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e); + if (__ratelimit(&drbd_ratelimit_state)) + dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n"); + } + + dec_unacked(mdev); + + kfree(di); + + if (!eq) + drbd_ov_oos_found(mdev, e->sector, e->size); + else + ov_oos_print(mdev); + + ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, + eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); + + drbd_free_ee(mdev, e); + + if (--mdev->ov_left == 0) { + ov_oos_print(mdev); + drbd_resync_finished(mdev); + } + + return ok; +} + +int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w); + complete(&b->done); + return 1; +} + +int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w); + struct p_barrier *p = &mdev->data.sbuf.barrier; + int ok = 1; + + /* really avoid racing with tl_clear. w.cb may have been referenced + * just before it was reassigned and re-queued, so double check that. + * actually, this race was harmless, since we only try to send the + * barrier packet here, and otherwise do nothing with the object. + * but compare with the head of w_clear_epoch */ + spin_lock_irq(&mdev->req_lock); + if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED) + cancel = 1; + spin_unlock_irq(&mdev->req_lock); + if (cancel) + return 1; + + if (!drbd_get_data_sock(mdev)) + return 0; + p->barrier = b->br_number; + /* inc_ap_pending was done where this was queued. + * dec_ap_pending will be done in got_BarrierAck + * or (on connection loss) in w_clear_epoch. */ + ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER, + (struct p_header *)p, sizeof(*p), 0); + drbd_put_data_sock(mdev); + + return ok; +} + +int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + if (cancel) + return 1; + return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE); +} + +/** + * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + int ok; + + if (unlikely(cancel)) { + req_mod(req, send_canceled); + return 1; + } + + ok = drbd_send_dblock(mdev, req); + req_mod(req, ok ? handed_over_to_network : send_failed); + + return ok; +} + +/** + * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet + * @mdev: DRBD device. + * @w: work object. + * @cancel: The connection will be closed anyways + */ +int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +{ + struct drbd_request *req = container_of(w, struct drbd_request, w); + int ok; + + if (unlikely(cancel)) { + req_mod(req, send_canceled); + return 1; + } + + ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size, + (unsigned long)req); + + if (!ok) { + /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send(); + * so this is probably redundant */ + if (mdev->state.conn >= C_CONNECTED) + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + } + req_mod(req, ok ? handed_over_to_network : send_failed); + + return ok; +} + +static int _drbd_may_sync_now(struct drbd_conf *mdev) +{ + struct drbd_conf *odev = mdev; + + while (1) { + if (odev->sync_conf.after == -1) + return 1; + odev = minor_to_mdev(odev->sync_conf.after); + ERR_IF(!odev) return 1; + if ((odev->state.conn >= C_SYNC_SOURCE && + odev->state.conn <= C_PAUSED_SYNC_T) || + odev->state.aftr_isp || odev->state.peer_isp || + odev->state.user_isp) + return 0; + } +} + +/** + * _drbd_pause_after() - Pause resync on all devices that may not resync now + * @mdev: DRBD device. + * + * Called from process context only (admin command and after_state_ch). + */ +static int _drbd_pause_after(struct drbd_conf *mdev) +{ + struct drbd_conf *odev; + int i, rv = 0; + + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev) + continue; + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (!_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) + != SS_NOTHING_TO_DO); + } + + return rv; +} + +/** + * _drbd_resume_next() - Resume resync on all devices that may resync now + * @mdev: DRBD device. + * + * Called from process context only (admin command and worker). + */ +static int _drbd_resume_next(struct drbd_conf *mdev) +{ + struct drbd_conf *odev; + int i, rv = 0; + + for (i = 0; i < minor_count; i++) { + odev = minor_to_mdev(i); + if (!odev) + continue; + if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) + continue; + if (odev->state.aftr_isp) { + if (_drbd_may_sync_now(odev)) + rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) + != SS_NOTHING_TO_DO) ; + } + } + return rv; +} + +void resume_next_sg(struct drbd_conf *mdev) +{ + write_lock_irq(&global_state_lock); + _drbd_resume_next(mdev); + write_unlock_irq(&global_state_lock); +} + +void suspend_other_sg(struct drbd_conf *mdev) +{ + write_lock_irq(&global_state_lock); + _drbd_pause_after(mdev); + write_unlock_irq(&global_state_lock); +} + +static int sync_after_error(struct drbd_conf *mdev, int o_minor) +{ + struct drbd_conf *odev; + + if (o_minor == -1) + return NO_ERROR; + if (o_minor < -1 || minor_to_mdev(o_minor) == NULL) + return ERR_SYNC_AFTER; + + /* check for loops */ + odev = minor_to_mdev(o_minor); + while (1) { + if (odev == mdev) + return ERR_SYNC_AFTER_CYCLE; + + /* dependency chain ends here, no cycles. */ + if (odev->sync_conf.after == -1) + return NO_ERROR; + + /* follow the dependency chain */ + odev = minor_to_mdev(odev->sync_conf.after); + } +} + +int drbd_alter_sa(struct drbd_conf *mdev, int na) +{ + int changes; + int retcode; + + write_lock_irq(&global_state_lock); + retcode = sync_after_error(mdev, na); + if (retcode == NO_ERROR) { + mdev->sync_conf.after = na; + do { + changes = _drbd_pause_after(mdev); + changes |= _drbd_resume_next(mdev); + } while (changes); + } + write_unlock_irq(&global_state_lock); + return retcode; +} + +/** + * drbd_start_resync() - Start the resync process + * @mdev: DRBD device. + * @side: Either C_SYNC_SOURCE or C_SYNC_TARGET + * + * This function might bring you directly into one of the + * C_PAUSED_SYNC_* states. + */ +void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) +{ + union drbd_state ns; + int r; + + if (mdev->state.conn >= C_SYNC_SOURCE) { + dev_err(DEV, "Resync already running!\n"); + return; + } + + trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n", + side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource"); + + /* In case a previous resync run was aborted by an IO error/detach on the peer. */ + drbd_rs_cancel_all(mdev); + + if (side == C_SYNC_TARGET) { + /* Since application IO was locked out during C_WF_BITMAP_T and + C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET + we check that we might make the data inconsistent. */ + r = drbd_khelper(mdev, "before-resync-target"); + r = (r >> 8) & 0xff; + if (r > 0) { + dev_info(DEV, "before-resync-target handler returned %d, " + "dropping connection.\n", r); + drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); + return; + } + } + + drbd_state_lock(mdev); + + if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + drbd_state_unlock(mdev); + return; + } + + if (side == C_SYNC_TARGET) { + mdev->bm_resync_fo = 0; + } else /* side == C_SYNC_SOURCE */ { + u64 uuid; + + get_random_bytes(&uuid, sizeof(u64)); + drbd_uuid_set(mdev, UI_BITMAP, uuid); + drbd_send_sync_uuid(mdev, uuid); + + D_ASSERT(mdev->state.disk == D_UP_TO_DATE); + } + + write_lock_irq(&global_state_lock); + ns = mdev->state; + + ns.aftr_isp = !_drbd_may_sync_now(mdev); + + ns.conn = side; + + if (side == C_SYNC_TARGET) + ns.disk = D_INCONSISTENT; + else /* side == C_SYNC_SOURCE */ + ns.pdsk = D_INCONSISTENT; + + r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + ns = mdev->state; + + if (ns.conn < C_CONNECTED) + r = SS_UNKNOWN_ERROR; + + if (r == SS_SUCCESS) { + mdev->rs_total = + mdev->rs_mark_left = drbd_bm_total_weight(mdev); + mdev->rs_failed = 0; + mdev->rs_paused = 0; + mdev->rs_start = + mdev->rs_mark_time = jiffies; + mdev->rs_same_csum = 0; + _drbd_pause_after(mdev); + } + write_unlock_irq(&global_state_lock); + drbd_state_unlock(mdev); + put_ldev(mdev); + + if (r == SS_SUCCESS) { + dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n", + drbd_conn_str(ns.conn), + (unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10), + (unsigned long) mdev->rs_total); + + if (mdev->rs_total == 0) { + /* Peer still reachable? Beware of failing before-resync-target handlers! */ + request_ping(mdev); + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(mdev->net_conf->ping_timeo*HZ/9); /* 9 instead 10 */ + drbd_resync_finished(mdev); + return; + } + + /* ns.conn may already be != mdev->state.conn, + * we may have been paused in between, or become paused until + * the timer triggers. + * No matter, that is handled in resync_timer_fn() */ + if (ns.conn == C_SYNC_TARGET) + mod_timer(&mdev->resync_timer, jiffies); + + drbd_md_sync(mdev); + } +} + +int drbd_worker(struct drbd_thread *thi) +{ + struct drbd_conf *mdev = thi->mdev; + struct drbd_work *w = NULL; + LIST_HEAD(work_list); + int intr = 0, i; + + sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev)); + + while (get_t_state(thi) == Running) { + drbd_thread_current_set_cpu(mdev); + + if (down_trylock(&mdev->data.work.s)) { + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket && !mdev->net_conf->no_cork) + drbd_tcp_uncork(mdev->data.socket); + mutex_unlock(&mdev->data.mutex); + + intr = down_interruptible(&mdev->data.work.s); + + mutex_lock(&mdev->data.mutex); + if (mdev->data.socket && !mdev->net_conf->no_cork) + drbd_tcp_cork(mdev->data.socket); + mutex_unlock(&mdev->data.mutex); + } + + if (intr) { + D_ASSERT(intr == -EINTR); + flush_signals(current); + ERR_IF (get_t_state(thi) == Running) + continue; + break; + } + + if (get_t_state(thi) != Running) + break; + /* With this break, we have done a down() but not consumed + the entry from the list. The cleanup code takes care of + this... */ + + w = NULL; + spin_lock_irq(&mdev->data.work.q_lock); + ERR_IF(list_empty(&mdev->data.work.q)) { + /* something terribly wrong in our logic. + * we were able to down() the semaphore, + * but the list is empty... doh. + * + * what is the best thing to do now? + * try again from scratch, restarting the receiver, + * asender, whatnot? could break even more ugly, + * e.g. when we are primary, but no good local data. + * + * I'll try to get away just starting over this loop. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + continue; + } + w = list_entry(mdev->data.work.q.next, struct drbd_work, list); + list_del_init(&w->list); + spin_unlock_irq(&mdev->data.work.q_lock); + + if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) { + /* dev_warn(DEV, "worker: a callback failed! \n"); */ + if (mdev->state.conn >= C_CONNECTED) + drbd_force_state(mdev, + NS(conn, C_NETWORK_FAILURE)); + } + } + D_ASSERT(test_bit(DEVICE_DYING, &mdev->flags)); + D_ASSERT(test_bit(CONFIG_PENDING, &mdev->flags)); + + spin_lock_irq(&mdev->data.work.q_lock); + i = 0; + while (!list_empty(&mdev->data.work.q)) { + list_splice_init(&mdev->data.work.q, &work_list); + spin_unlock_irq(&mdev->data.work.q_lock); + + while (!list_empty(&work_list)) { + w = list_entry(work_list.next, struct drbd_work, list); + list_del_init(&w->list); + w->cb(mdev, w, 1); + i++; /* dead debugging code */ + } + + spin_lock_irq(&mdev->data.work.q_lock); + } + sema_init(&mdev->data.work.s, 0); + /* DANGEROUS race: if someone did queue his work within the spinlock, + * but up() ed outside the spinlock, we could get an up() on the + * semaphore without corresponding list entry. + * So don't do that. + */ + spin_unlock_irq(&mdev->data.work.q_lock); + + D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE); + /* _drbd_set_state only uses stop_nowait. + * wait here for the Exiting receiver. */ + drbd_thread_stop(&mdev->receiver); + drbd_mdev_cleanup(mdev); + + dev_info(DEV, "worker terminated\n"); + + clear_bit(DEVICE_DYING, &mdev->flags); + clear_bit(CONFIG_PENDING, &mdev->flags); + wake_up(&mdev->state_wait); + + return 0; +} diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h new file mode 100644 index 00000000000..f93fa111ce5 --- /dev/null +++ b/drivers/block/drbd/drbd_wrappers.h @@ -0,0 +1,91 @@ +#ifndef _DRBD_WRAPPERS_H +#define _DRBD_WRAPPERS_H + +#include +#include + +/* see get_sb_bdev and bd_claim */ +extern char *drbd_sec_holder; + +/* sets the number of 512 byte sectors of our virtual device */ +static inline void drbd_set_my_capacity(struct drbd_conf *mdev, + sector_t size) +{ + /* set_capacity(mdev->this_bdev->bd_disk, size); */ + set_capacity(mdev->vdisk, size); + mdev->this_bdev->bd_inode->i_size = (loff_t)size << 9; +} + +#define drbd_bio_uptodate(bio) bio_flagged(bio, BIO_UPTODATE) + +static inline int drbd_bio_has_active_page(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + if (page_count(bvec->bv_page) > 1) + return 1; + } + + return 0; +} + +/* bi_end_io handlers */ +extern void drbd_md_io_complete(struct bio *bio, int error); +extern void drbd_endio_read_sec(struct bio *bio, int error); +extern void drbd_endio_write_sec(struct bio *bio, int error); +extern void drbd_endio_pri(struct bio *bio, int error); + +/* + * used to submit our private bio + */ +static inline void drbd_generic_make_request(struct drbd_conf *mdev, + int fault_type, struct bio *bio) +{ + __release(local); + if (!bio->bi_bdev) { + printk(KERN_ERR "drbd%d: drbd_generic_make_request: " + "bio->bi_bdev == NULL\n", + mdev_to_minor(mdev)); + dump_stack(); + bio_endio(bio, -ENODEV); + return; + } + + if (FAULT_ACTIVE(mdev, fault_type)) + bio_endio(bio, -EIO); + else + generic_make_request(bio); +} + +static inline void drbd_plug_device(struct drbd_conf *mdev) +{ + struct request_queue *q; + q = bdev_get_queue(mdev->this_bdev); + + spin_lock_irq(q->queue_lock); + +/* XXX the check on !blk_queue_plugged is redundant, + * implicitly checked in blk_plug_device */ + + if (!blk_queue_plugged(q)) { + blk_plug_device(q); + del_timer(&q->unplug_timer); + /* unplugging should not happen automatically... */ + } + spin_unlock_irq(q->queue_lock); +} + +static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm) +{ + return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK) + == CRYPTO_ALG_TYPE_HASH; +} + +#ifndef __CHECKER__ +# undef __cond_lock +# define __cond_lock(x,c) (c) +#endif + +#endif diff --git a/include/linux/drbd.h b/include/linux/drbd.h new file mode 100644 index 00000000000..69dc711f37b --- /dev/null +++ b/include/linux/drbd.h @@ -0,0 +1,349 @@ +/* + drbd.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2001-2008, Philipp Reisner . + Copyright (C) 2001-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#ifndef DRBD_H +#define DRBD_H +#include +#include + +#ifdef __KERNEL__ +#include +#include +#else +#include +#include +#include + +/* Altough the Linux source code makes a difference between + generic endianness and the bitfields' endianness, there is no + architecture as of Linux-2.6.24-rc4 where the bitfileds' endianness + does not match the generic endianness. */ + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define __LITTLE_ENDIAN_BITFIELD +#elif __BYTE_ORDER == __BIG_ENDIAN +#define __BIG_ENDIAN_BITFIELD +#else +# error "sorry, weird endianness on this box" +#endif + +#endif + + +extern const char *drbd_buildtag(void); +#define REL_VERSION "8.3.3rc2" +#define API_VERSION 88 +#define PRO_VERSION_MIN 86 +#define PRO_VERSION_MAX 91 + + +enum drbd_io_error_p { + EP_PASS_ON, /* FIXME should the better be named "Ignore"? */ + EP_CALL_HELPER, + EP_DETACH +}; + +enum drbd_fencing_p { + FP_DONT_CARE, + FP_RESOURCE, + FP_STONITH +}; + +enum drbd_disconnect_p { + DP_RECONNECT, + DP_DROP_NET_CONF, + DP_FREEZE_IO +}; + +enum drbd_after_sb_p { + ASB_DISCONNECT, + ASB_DISCARD_YOUNGER_PRI, + ASB_DISCARD_OLDER_PRI, + ASB_DISCARD_ZERO_CHG, + ASB_DISCARD_LEAST_CHG, + ASB_DISCARD_LOCAL, + ASB_DISCARD_REMOTE, + ASB_CONSENSUS, + ASB_DISCARD_SECONDARY, + ASB_CALL_HELPER, + ASB_VIOLENTLY +}; + +/* KEEP the order, do not delete or insert. Only append. */ +enum drbd_ret_codes { + ERR_CODE_BASE = 100, + NO_ERROR = 101, + ERR_LOCAL_ADDR = 102, + ERR_PEER_ADDR = 103, + ERR_OPEN_DISK = 104, + ERR_OPEN_MD_DISK = 105, + ERR_DISK_NOT_BDEV = 107, + ERR_MD_NOT_BDEV = 108, + ERR_DISK_TO_SMALL = 111, + ERR_MD_DISK_TO_SMALL = 112, + ERR_BDCLAIM_DISK = 114, + ERR_BDCLAIM_MD_DISK = 115, + ERR_MD_IDX_INVALID = 116, + ERR_IO_MD_DISK = 118, + ERR_MD_INVALID = 119, + ERR_AUTH_ALG = 120, + ERR_AUTH_ALG_ND = 121, + ERR_NOMEM = 122, + ERR_DISCARD = 123, + ERR_DISK_CONFIGURED = 124, + ERR_NET_CONFIGURED = 125, + ERR_MANDATORY_TAG = 126, + ERR_MINOR_INVALID = 127, + ERR_INTR = 129, /* EINTR */ + ERR_RESIZE_RESYNC = 130, + ERR_NO_PRIMARY = 131, + ERR_SYNC_AFTER = 132, + ERR_SYNC_AFTER_CYCLE = 133, + ERR_PAUSE_IS_SET = 134, + ERR_PAUSE_IS_CLEAR = 135, + ERR_PACKET_NR = 137, + ERR_NO_DISK = 138, + ERR_NOT_PROTO_C = 139, + ERR_NOMEM_BITMAP = 140, + ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */ + ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */ + ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */ + ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */ + ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */ + ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */ + ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */ + ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */ + ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */ + ERR_DATA_NOT_CURRENT = 150, + ERR_CONNECTED = 151, /* DRBD 8.3 only */ + + /* insert new ones above this line */ + AFTER_LAST_ERR_CODE +}; + +#define DRBD_PROT_A 1 +#define DRBD_PROT_B 2 +#define DRBD_PROT_C 3 + +enum drbd_role { + R_UNKNOWN = 0, + R_PRIMARY = 1, /* role */ + R_SECONDARY = 2, /* role */ + R_MASK = 3, +}; + +/* The order of these constants is important. + * The lower ones (=C_WF_REPORT_PARAMS ==> There is a socket + */ +enum drbd_conns { + C_STANDALONE, + C_DISCONNECTING, /* Temporal state on the way to StandAlone. */ + C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */ + + /* These temporal states are all used on the way + * from >= C_CONNECTED to Unconnected. + * The 'disconnect reason' states + * I do not allow to change beween them. */ + C_TIMEOUT, + C_BROKEN_PIPE, + C_NETWORK_FAILURE, + C_PROTOCOL_ERROR, + C_TEAR_DOWN, + + C_WF_CONNECTION, + C_WF_REPORT_PARAMS, /* we have a socket */ + C_CONNECTED, /* we have introduced each other */ + C_STARTING_SYNC_S, /* starting full sync by admin request. */ + C_STARTING_SYNC_T, /* stariing full sync by admin request. */ + C_WF_BITMAP_S, + C_WF_BITMAP_T, + C_WF_SYNC_UUID, + + /* All SyncStates are tested with this comparison + * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */ + C_SYNC_SOURCE, + C_SYNC_TARGET, + C_VERIFY_S, + C_VERIFY_T, + C_PAUSED_SYNC_S, + C_PAUSED_SYNC_T, + C_MASK = 31 +}; + +enum drbd_disk_state { + D_DISKLESS, + D_ATTACHING, /* In the process of reading the meta-data */ + D_FAILED, /* Becomes D_DISKLESS as soon as we told it the peer */ + /* when >= D_FAILED it is legal to access mdev->bc */ + D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */ + D_INCONSISTENT, + D_OUTDATED, + D_UNKNOWN, /* Only used for the peer, never for myself */ + D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */ + D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */ + D_MASK = 15 +}; + +union drbd_state { +/* According to gcc's docs is the ... + * The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1). + * Determined by ABI. + * pointed out by Maxim Uvarov q + * even though we transmit as "cpu_to_be32(state)", + * the offsets of the bitfields still need to be swapped + * on different endianess. + */ + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned susp:1 ; /* 2/2 IO suspended no/yes */ + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned peer_isp:1 ; + unsigned user_isp:1 ; + unsigned _pad:11; /* 0 unused */ +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned _pad:11; /* 0 unused */ + unsigned user_isp:1 ; + unsigned peer_isp:1 ; + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned susp:1 ; /* 2/2 IO suspended no/yes */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ +#else +# error "this endianess is not supported" +#endif + }; + unsigned int i; +}; + +enum drbd_state_ret_codes { + SS_CW_NO_NEED = 4, + SS_CW_SUCCESS = 3, + SS_NOTHING_TO_DO = 2, + SS_SUCCESS = 1, + SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */ + SS_TWO_PRIMARIES = -1, + SS_NO_UP_TO_DATE_DISK = -2, + SS_NO_LOCAL_DISK = -4, + SS_NO_REMOTE_DISK = -5, + SS_CONNECTED_OUTDATES = -6, + SS_PRIMARY_NOP = -7, + SS_RESYNC_RUNNING = -8, + SS_ALREADY_STANDALONE = -9, + SS_CW_FAILED_BY_PEER = -10, + SS_IS_DISKLESS = -11, + SS_DEVICE_IN_USE = -12, + SS_NO_NET_CONFIG = -13, + SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */ + SS_NEED_CONNECTION = -15, /* drbd-8.2 only */ + SS_LOWER_THAN_OUTDATED = -16, + SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ + SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ + SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ + SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ +}; + +/* from drbd_strings.c */ +extern const char *drbd_conn_str(enum drbd_conns); +extern const char *drbd_role_str(enum drbd_role); +extern const char *drbd_disk_str(enum drbd_disk_state); +extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes); + +#define SHARED_SECRET_MAX 64 + +#define MDF_CONSISTENT (1 << 0) +#define MDF_PRIMARY_IND (1 << 1) +#define MDF_CONNECTED_IND (1 << 2) +#define MDF_FULL_SYNC (1 << 3) +#define MDF_WAS_UP_TO_DATE (1 << 4) +#define MDF_PEER_OUT_DATED (1 << 5) +#define MDF_CRASHED_PRIMARY (1 << 6) + +enum drbd_uuid_index { + UI_CURRENT, + UI_BITMAP, + UI_HISTORY_START, + UI_HISTORY_END, + UI_SIZE, /* nl-packet: number of dirty bits */ + UI_FLAGS, /* nl-packet: flags */ + UI_EXTENDED_SIZE /* Everything. */ +}; + +enum drbd_timeout_flag { + UT_DEFAULT = 0, + UT_DEGRADED = 1, + UT_PEER_OUTDATED = 2, +}; + +#define UUID_JUST_CREATED ((__u64)4) + +#define DRBD_MAGIC 0x83740267 +#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) + +/* these are of type "int" */ +#define DRBD_MD_INDEX_INTERNAL -1 +#define DRBD_MD_INDEX_FLEX_EXT -2 +#define DRBD_MD_INDEX_FLEX_INT -3 + +/* Start of the new netlink/connector stuff */ + +#define DRBD_NL_CREATE_DEVICE 0x01 +#define DRBD_NL_SET_DEFAULTS 0x02 + +/* The following line should be moved over to linux/connector.h + * when the time comes */ +#ifndef CN_IDX_DRBD +# define CN_IDX_DRBD 0x4 +/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */ +#endif +#define CN_VAL_DRBD 0x1 + +/* For searching a vacant cn_idx value */ +#define CN_IDX_STEP 6977 + +struct drbd_nl_cfg_req { + int packet_type; + unsigned int drbd_minor; + int flags; + unsigned short tag_list[]; +}; + +struct drbd_nl_cfg_reply { + int packet_type; + unsigned int minor; + int ret_code; /* enum ret_code or set_st_err_t */ + unsigned short tag_list[]; /* only used with get_* calls */ +}; + +#endif diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h new file mode 100644 index 00000000000..9d067ce4696 --- /dev/null +++ b/include/linux/drbd_limits.h @@ -0,0 +1,137 @@ +/* + drbd_limits.h + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. +*/ + +/* + * Our current limitations. + * Some of them are hard limits, + * some of them are arbitrary range limits, that make it easier to provide + * feedback about nonsense settings for certain configurable values. + */ + +#ifndef DRBD_LIMITS_H +#define DRBD_LIMITS_H 1 + +#define DEBUG_RANGE_CHECK 0 + +#define DRBD_MINOR_COUNT_MIN 1 +#define DRBD_MINOR_COUNT_MAX 255 + +#define DRBD_DIALOG_REFRESH_MIN 0 +#define DRBD_DIALOG_REFRESH_MAX 600 + +/* valid port number */ +#define DRBD_PORT_MIN 1 +#define DRBD_PORT_MAX 0xffff + +/* startup { */ + /* if you want more than 3.4 days, disable */ +#define DRBD_WFC_TIMEOUT_MIN 0 +#define DRBD_WFC_TIMEOUT_MAX 300000 +#define DRBD_WFC_TIMEOUT_DEF 0 + +#define DRBD_DEGR_WFC_TIMEOUT_MIN 0 +#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 +#define DRBD_DEGR_WFC_TIMEOUT_DEF 0 + +#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 +#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 +#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 +/* }*/ + +/* net { */ + /* timeout, unit centi seconds + * more than one minute timeout is not usefull */ +#define DRBD_TIMEOUT_MIN 1 +#define DRBD_TIMEOUT_MAX 600 +#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ + + /* active connection retries when C_WF_CONNECTION */ +#define DRBD_CONNECT_INT_MIN 1 +#define DRBD_CONNECT_INT_MAX 120 +#define DRBD_CONNECT_INT_DEF 10 /* seconds */ + + /* keep-alive probes when idle */ +#define DRBD_PING_INT_MIN 1 +#define DRBD_PING_INT_MAX 120 +#define DRBD_PING_INT_DEF 10 + + /* timeout for the ping packets.*/ +#define DRBD_PING_TIMEO_MIN 1 +#define DRBD_PING_TIMEO_MAX 100 +#define DRBD_PING_TIMEO_DEF 5 + + /* max number of write requests between write barriers */ +#define DRBD_MAX_EPOCH_SIZE_MIN 1 +#define DRBD_MAX_EPOCH_SIZE_MAX 20000 +#define DRBD_MAX_EPOCH_SIZE_DEF 2048 + + /* I don't think that a tcp send buffer of more than 10M is usefull */ +#define DRBD_SNDBUF_SIZE_MIN 0 +#define DRBD_SNDBUF_SIZE_MAX (10<<20) +#define DRBD_SNDBUF_SIZE_DEF (2*65535) + +#define DRBD_RCVBUF_SIZE_MIN 0 +#define DRBD_RCVBUF_SIZE_MAX (10<<20) +#define DRBD_RCVBUF_SIZE_DEF (2*65535) + + /* @4k PageSize -> 128kB - 512MB */ +#define DRBD_MAX_BUFFERS_MIN 32 +#define DRBD_MAX_BUFFERS_MAX 131072 +#define DRBD_MAX_BUFFERS_DEF 2048 + + /* @4k PageSize -> 4kB - 512MB */ +#define DRBD_UNPLUG_WATERMARK_MIN 1 +#define DRBD_UNPLUG_WATERMARK_MAX 131072 +#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) + + /* 0 is disabled. + * 200 should be more than enough even for very short timeouts */ +#define DRBD_KO_COUNT_MIN 0 +#define DRBD_KO_COUNT_MAX 200 +#define DRBD_KO_COUNT_DEF 0 +/* } */ + +/* syncer { */ + /* FIXME allow rate to be zero? */ +#define DRBD_RATE_MIN 1 +/* channel bonding 10 GbE, or other hardware */ +#define DRBD_RATE_MAX (4 << 20) +#define DRBD_RATE_DEF 250 /* kb/second */ + + /* less than 7 would hit performance unneccessarily. + * 3833 is the largest prime that still does fit + * into 64 sectors of activity log */ +#define DRBD_AL_EXTENTS_MIN 7 +#define DRBD_AL_EXTENTS_MAX 3833 +#define DRBD_AL_EXTENTS_DEF 127 + +#define DRBD_AFTER_MIN -1 +#define DRBD_AFTER_MAX 255 +#define DRBD_AFTER_DEF -1 + +/* } */ + +/* drbdsetup XY resize -d Z + * you are free to reduce the device size to nothing, if you want to. + * the upper limit with 64bit kernel, enough ram and flexible meta data + * is 16 TB, currently. */ +/* DRBD_MAX_SECTORS */ +#define DRBD_DISK_SIZE_SECT_MIN 0 +#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30)) +#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */ + +#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON +#define DRBD_FENCING_DEF FP_DONT_CARE +#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT +#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT +#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT +#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT + +#define DRBD_MAX_BIO_BVECS_MIN 0 +#define DRBD_MAX_BIO_BVECS_MAX 128 +#define DRBD_MAX_BIO_BVECS_DEF 0 + +#undef RANGE +#endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h new file mode 100644 index 00000000000..db5721ad50d --- /dev/null +++ b/include/linux/drbd_nl.h @@ -0,0 +1,137 @@ +/* + PAKET( name, + TYPE ( pn, pr, member ) + ... + ) + + You may never reissue one of the pn arguments +*/ + +#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) +#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" +#endif + +NL_PACKET(primary, 1, + NL_BIT( 1, T_MAY_IGNORE, overwrite_peer) +) + +NL_PACKET(secondary, 2, ) + +NL_PACKET(disk_conf, 3, + NL_INT64( 2, T_MAY_IGNORE, disk_size) + NL_STRING( 3, T_MANDATORY, backing_dev, 128) + NL_STRING( 4, T_MANDATORY, meta_dev, 128) + NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) + NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) + NL_INTEGER( 7, T_MAY_IGNORE, fencing) + NL_BIT( 37, T_MAY_IGNORE, use_bmbv) + NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) + NL_BIT( 54, T_MAY_IGNORE, no_md_flush) + /* 55 max_bio_size was available in 8.2.6rc2 */ + NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) + NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) + NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) +) + +NL_PACKET(detach, 4, ) + +NL_PACKET(net_conf, 5, + NL_STRING( 8, T_MANDATORY, my_addr, 128) + NL_STRING( 9, T_MANDATORY, peer_addr, 128) + NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) + NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) + NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) + NL_INTEGER( 14, T_MAY_IGNORE, timeout) + NL_INTEGER( 15, T_MANDATORY, wire_protocol) + NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) + NL_INTEGER( 17, T_MAY_IGNORE, ping_int) + NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) + NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) + NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) + NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) + NL_INTEGER( 22, T_MAY_IGNORE, ko_count) + NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) + NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) + NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) + NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) + NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) + NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) + /* 59 addr_family was available in GIT, never released */ + NL_BIT( 60, T_MANDATORY, mind_af) + NL_BIT( 27, T_MAY_IGNORE, want_lose) + NL_BIT( 28, T_MAY_IGNORE, two_primaries) + NL_BIT( 41, T_MAY_IGNORE, always_asbp) + NL_BIT( 61, T_MAY_IGNORE, no_cork) + NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) +) + +NL_PACKET(disconnect, 6, ) + +NL_PACKET(resize, 7, + NL_INT64( 29, T_MAY_IGNORE, resize_size) +) + +NL_PACKET(syncer_conf, 8, + NL_INTEGER( 30, T_MAY_IGNORE, rate) + NL_INTEGER( 31, T_MAY_IGNORE, after) + NL_INTEGER( 32, T_MAY_IGNORE, al_extents) + NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) + NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) + NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) + NL_BIT( 65, T_MAY_IGNORE, use_rle) +) + +NL_PACKET(invalidate, 9, ) +NL_PACKET(invalidate_peer, 10, ) +NL_PACKET(pause_sync, 11, ) +NL_PACKET(resume_sync, 12, ) +NL_PACKET(suspend_io, 13, ) +NL_PACKET(resume_io, 14, ) +NL_PACKET(outdate, 15, ) +NL_PACKET(get_config, 16, ) +NL_PACKET(get_state, 17, + NL_INTEGER( 33, T_MAY_IGNORE, state_i) +) + +NL_PACKET(get_uuids, 18, + NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) + NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) +) + +NL_PACKET(get_timeout_flag, 19, + NL_BIT( 36, T_MAY_IGNORE, use_degraded) +) + +NL_PACKET(call_helper, 20, + NL_STRING( 38, T_MAY_IGNORE, helper, 32) +) + +/* Tag nr 42 already allocated in drbd-8.1 development. */ + +NL_PACKET(sync_progress, 23, + NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) +) + +NL_PACKET(dump_ee, 24, + NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) + NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) + NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) + NL_INT64( 48, T_MAY_IGNORE, ee_sector) + NL_INT64( 49, T_MAY_IGNORE, ee_block_id) + NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) +) + +NL_PACKET(start_ov, 25, + NL_INT64( 66, T_MAY_IGNORE, start_sector) +) + +NL_PACKET(new_c_uuid, 26, + NL_BIT( 63, T_MANDATORY, clear_bm) +) + +#undef NL_PACKET +#undef NL_INTEGER +#undef NL_INT64 +#undef NL_BIT +#undef NL_STRING + diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h new file mode 100644 index 00000000000..fcdff8410e9 --- /dev/null +++ b/include/linux/drbd_tag_magic.h @@ -0,0 +1,83 @@ +#ifndef DRBD_TAG_MAGIC_H +#define DRBD_TAG_MAGIC_H + +#define TT_END 0 +#define TT_REMOVED 0xE000 + +/* declare packet_type enums */ +enum packet_types { +#define NL_PACKET(name, number, fields) P_ ## name = number, +#define NL_INTEGER(pn, pr, member) +#define NL_INT64(pn, pr, member) +#define NL_BIT(pn, pr, member) +#define NL_STRING(pn, pr, member, len) +#include "drbd_nl.h" + P_nl_after_last_packet, +}; + +/* These struct are used to deduce the size of the tag lists: */ +#define NL_PACKET(name, number, fields) \ + struct name ## _tag_len_struct { fields }; +#define NL_INTEGER(pn, pr, member) \ + int member; int tag_and_len ## member; +#define NL_INT64(pn, pr, member) \ + __u64 member; int tag_and_len ## member; +#define NL_BIT(pn, pr, member) \ + unsigned char member:1; int tag_and_len ## member; +#define NL_STRING(pn, pr, member, len) \ + unsigned char member[len]; int member ## _len; \ + int tag_and_len ## member; +#include "linux/drbd_nl.h" + +/* declate tag-list-sizes */ +static const int tag_list_sizes[] = { +#define NL_PACKET(name, number, fields) 2 fields , +#define NL_INTEGER(pn, pr, member) + 4 + 4 +#define NL_INT64(pn, pr, member) + 4 + 8 +#define NL_BIT(pn, pr, member) + 4 + 1 +#define NL_STRING(pn, pr, member, len) + 4 + (len) +#include "drbd_nl.h" +}; + +/* The two highest bits are used for the tag type */ +#define TT_MASK 0xC000 +#define TT_INTEGER 0x0000 +#define TT_INT64 0x4000 +#define TT_BIT 0x8000 +#define TT_STRING 0xC000 +/* The next bit indicates if processing of the tag is mandatory */ +#define T_MANDATORY 0x2000 +#define T_MAY_IGNORE 0x0000 +#define TN_MASK 0x1fff +/* The remaining 13 bits are used to enumerate the tags */ + +#define tag_type(T) ((T) & TT_MASK) +#define tag_number(T) ((T) & TN_MASK) + +/* declare tag enums */ +#define NL_PACKET(name, number, fields) fields +enum drbd_tags { +#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , +#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , +#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , +#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , +#include "drbd_nl.h" +}; + +struct tag { + const char *name; + int type_n_flags; + int max_len; +}; + +/* declare tag names */ +#define NL_PACKET(name, number, fields) fields +static const struct tag tag_descriptions[] = { +#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, +#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, +#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, +#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, +#include "drbd_nl.h" +}; + +#endif diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h new file mode 100644 index 00000000000..3a2b2d9b047 --- /dev/null +++ b/include/linux/lru_cache.h @@ -0,0 +1,294 @@ +/* + lru_cache.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#ifndef LRU_CACHE_H +#define LRU_CACHE_H + +#include +#include +#include +#include /* for memset */ +#include + +/* +This header file (and its .c file; kernel-doc of functions see there) + define a helper framework to easily keep track of index:label associations, + and changes to an "active set" of objects, as well as pending transactions, + to persistently record those changes. + + We use an LRU policy if it is necessary to "cool down" a region currently in + the active set before we can "heat" a previously unused region. + + Because of this later property, it is called "lru_cache". + As it actually Tracks Objects in an Active SeT, we could also call it + toast (incidentally that is what may happen to the data on the + backend storage uppon next resync, if we don't get it right). + +What for? + +We replicate IO (more or less synchronously) to local and remote disk. + +For crash recovery after replication node failure, + we need to resync all regions that have been target of in-flight WRITE IO + (in use, or "hot", regions), as we don't know wether or not those WRITEs have + made it to stable storage. + + To avoid a "full resync", we need to persistently track these regions. + + This is known as "write intent log", and can be implemented as on-disk + (coarse or fine grained) bitmap, or other meta data. + + To avoid the overhead of frequent extra writes to this meta data area, + usually the condition is softened to regions that _may_ have been target of + in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent + bitmap, trading frequency of meta data transactions against amount of + (possibly unneccessary) resync traffic. + + If we set a hard limit on the area that may be "hot" at any given time, we + limit the amount of resync traffic needed for crash recovery. + +For recovery after replication link failure, + we need to resync all blocks that have been changed on the other replica + in the mean time, or, if both replica have been changed independently [*], + all blocks that have been changed on either replica in the mean time. + [*] usually as a result of a cluster split-brain and insufficient protection. + but there are valid use cases to do this on purpose. + + Tracking those blocks can be implemented as "dirty bitmap". + Having it fine-grained reduces the amount of resync traffic. + It should also be persistent, to allow for reboots (or crashes) + while the replication link is down. + +There are various possible implementations for persistently storing +write intent log information, three of which are mentioned here. + +"Chunk dirtying" + The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well. + To reduce the frequency of bitmap updates for write-intent log purposes, + one could dirty "chunks" (of some size) at a time of the (fine grained) + on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as + possible, flushing it to disk again when a previously "hot" (and on-disk + dirtied as full chunk) area "cools down" again (no IO in flight anymore, + and none expected in the near future either). + +"Explicit (coarse) write intent bitmap" + An other implementation could chose a (probably coarse) explicit bitmap, + for write-intent log purposes, additionally to the fine grained dirty bitmap. + +"Activity log" + Yet an other implementation may keep track of the hot regions, by starting + with an empty set, and writing down a journal of region numbers that have + become "hot", or have "cooled down" again. + + To be able to use a ring buffer for this journal of changes to the active + set, we not only record the actual changes to that set, but also record the + not changing members of the set in a round robin fashion. To do so, we use a + fixed (but configurable) number of slots which we can identify by index, and + associate region numbers (labels) with these indices. + For each transaction recording a change to the active set, we record the + change itself (index: -old_label, +new_label), and which index is associated + with which label (index: current_label) within a certain sliding window that + is moved further over the available indices with each such transaction. + + Thus, for crash recovery, if the ringbuffer is sufficiently large, we can + accurately reconstruct the active set. + + Sufficiently large depends only on maximum number of active objects, and the + size of the sliding window recording "index: current_label" associations within + each transaction. + + This is what we call the "activity log". + + Currently we need one activity log transaction per single label change, which + does not give much benefit over the "dirty chunks of bitmap" approach, other + than potentially less seeks. + + We plan to change the transaction format to support multiple changes per + transaction, which then would reduce several (disjoint, "random") updates to + the bitmap into one transaction to the activity log ring buffer. +*/ + +/* this defines an element in a tracked set + * .colision is for hash table lookup. + * When we process a new IO request, we know its sector, thus can deduce the + * region number (label) easily. To do the label -> object lookup without a + * full list walk, we use a simple hash table. + * + * .list is on one of three lists: + * in_use: currently in use (refcnt > 0, lc_number != LC_FREE) + * lru: unused but ready to be reused or recycled + * (ts_refcnt == 0, lc_number != LC_FREE), + * free: unused but ready to be recycled + * (ts_refcnt == 0, lc_number == LC_FREE), + * + * an element is said to be "in the active set", + * if either on "in_use" or "lru", i.e. lc_number != LC_FREE. + * + * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache + * (total memory usage 2 pages), and up to 3833 elements on the act_log + * lru_cache, totalling ~215 kB for 64bit architechture, ~53 pages. + * + * We usually do not actually free these objects again, but only "recycle" + * them, as the change "index: -old_label, +LC_FREE" would need a transaction + * as well. Which also means that using a kmem_cache to allocate the objects + * from wastes some resources. + * But it avoids high order page allocations in kmalloc. + */ +struct lc_element { + struct hlist_node colision; + struct list_head list; /* LRU list or free list */ + unsigned refcnt; + /* back "pointer" into ts_cache->element[index], + * for paranoia, and for "ts_element_to_index" */ + unsigned lc_index; + /* if we want to track a larger set of objects, + * it needs to become arch independend u64 */ + unsigned lc_number; + + /* special label when on free list */ +#define LC_FREE (~0U) +}; + +struct lru_cache { + /* the least recently used item is kept at lru->prev */ + struct list_head lru; + struct list_head free; + struct list_head in_use; + + /* the pre-created kmem cache to allocate the objects from */ + struct kmem_cache *lc_cache; + + /* size of tracked objects, used to memset(,0,) them in lc_reset */ + size_t element_size; + /* offset of struct lc_element member in the tracked object */ + size_t element_off; + + /* number of elements (indices) */ + unsigned int nr_elements; + /* Arbitrary limit on maximum tracked objects. Practical limit is much + * lower due to allocation failures, probably. For typical use cases, + * nr_elements should be a few thousand at most. + * This also limits the maximum value of ts_element.ts_index, allowing the + * 8 high bits of .ts_index to be overloaded with flags in the future. */ +#define LC_MAX_ACTIVE (1<<24) + + /* statistics */ + unsigned used; /* number of lelements currently on in_use list */ + unsigned long hits, misses, starving, dirty, changed; + + /* see below: flag-bits for lru_cache */ + unsigned long flags; + + /* when changing the label of an index element */ + unsigned int new_number; + + /* for paranoia when changing the label of an index element */ + struct lc_element *changing_element; + + void *lc_private; + const char *name; + + /* nr_elements there */ + struct hlist_head *lc_slot; + struct lc_element **lc_element; +}; + + +/* flag-bits for lru_cache */ +enum { + /* debugging aid, to catch concurrent access early. + * user needs to guarantee exclusive access by proper locking! */ + __LC_PARANOIA, + /* if we need to change the set, but currently there is a changing + * transaction pending, we are "dirty", and must deferr further + * changing requests */ + __LC_DIRTY, + /* if we need to change the set, but currently there is no free nor + * unused element available, we are "starving", and must not give out + * further references, to guarantee that eventually some refcnt will + * drop to zero and we will be able to make progress again, changing + * the set, writing the transaction. + * if the statistics say we are frequently starving, + * nr_elements is too small. */ + __LC_STARVING, +}; +#define LC_PARANOIA (1<<__LC_PARANOIA) +#define LC_DIRTY (1<<__LC_DIRTY) +#define LC_STARVING (1<<__LC_STARVING) + +extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned e_count, size_t e_size, size_t e_off); +extern void lc_reset(struct lru_cache *lc); +extern void lc_destroy(struct lru_cache *lc); +extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); +extern void lc_del(struct lru_cache *lc, struct lc_element *element); + +extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); +extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); +extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); +extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); +extern void lc_changed(struct lru_cache *lc, struct lc_element *e); + +struct seq_file; +extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); + +extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, + void (*detail) (struct seq_file *, struct lc_element *)); + +/** + * lc_try_lock - can be used to stop lc_get() from changing the tracked set + * @lc: the lru cache to operate on + * + * Note that the reference counts and order on the active and lru lists may + * still change. Returns true if we aquired the lock. + */ +static inline int lc_try_lock(struct lru_cache *lc) +{ + return !test_and_set_bit(__LC_DIRTY, &lc->flags); +} + +/** + * lc_unlock - unlock @lc, allow lc_get() to change the set again + * @lc: the lru cache to operate on + */ +static inline void lc_unlock(struct lru_cache *lc) +{ + clear_bit(__LC_DIRTY, &lc->flags); + smp_mb__after_clear_bit(); +} + +static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e = lc_find(lc, enr); + return e && e->refcnt; +} + +#define lc_entry(ptr, type, member) \ + container_of(ptr, type, member) + +extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i); +extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e); + +#endif diff --git a/lib/Kconfig b/lib/Kconfig index bb1326d3839..1cfe51628e1 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -200,4 +200,7 @@ config NLATTR config GENERIC_ATOMIC64 bool +config LRU_CACHE + tristate + endmenu diff --git a/lib/Makefile b/lib/Makefile index 2e78277eff9..347ad8db29d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -91,6 +91,8 @@ obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o obj-$(CONFIG_NLATTR) += nlattr.o +obj-$(CONFIG_LRU_CACHE) += lru_cache.o + obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o obj-$(CONFIG_GENERIC_CSUM) += checksum.o diff --git a/lib/lru_cache.c b/lib/lru_cache.c new file mode 100644 index 00000000000..270de9d31b8 --- /dev/null +++ b/lib/lru_cache.c @@ -0,0 +1,560 @@ +/* + lru_cache.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#include +#include +#include +#include /* for memset */ +#include /* for seq_printf */ +#include + +MODULE_AUTHOR("Philipp Reisner , " + "Lars Ellenberg "); +MODULE_DESCRIPTION("lru_cache - Track sets of hot objects"); +MODULE_LICENSE("GPL"); + +/* this is developers aid only. + * it catches concurrent access (lack of locking on the users part) */ +#define PARANOIA_ENTRY() do { \ + BUG_ON(!lc); \ + BUG_ON(!lc->nr_elements); \ + BUG_ON(test_and_set_bit(__LC_PARANOIA, &lc->flags)); \ +} while (0) + +#define RETURN(x...) do { \ + clear_bit(__LC_PARANOIA, &lc->flags); \ + smp_mb__after_clear_bit(); return x ; } while (0) + +/* BUG() if e is not one of the elements tracked by lc */ +#define PARANOIA_LC_ELEMENT(lc, e) do { \ + struct lru_cache *lc_ = (lc); \ + struct lc_element *e_ = (e); \ + unsigned i = e_->lc_index; \ + BUG_ON(i >= lc_->nr_elements); \ + BUG_ON(lc_->lc_element[i] != e_); } while (0) + +/** + * lc_create - prepares to track objects in an active set + * @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details + * @e_count: number of elements allowed to be active simultaneously + * @e_size: size of the tracked objects + * @e_off: offset to the &struct lc_element member in a tracked object + * + * Returns a pointer to a newly initialized struct lru_cache on success, + * or NULL on (allocation) failure. + */ +struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned e_count, size_t e_size, size_t e_off) +{ + struct hlist_head *slot = NULL; + struct lc_element **element = NULL; + struct lru_cache *lc; + struct lc_element *e; + unsigned cache_obj_size = kmem_cache_size(cache); + unsigned i; + + WARN_ON(cache_obj_size < e_size); + if (cache_obj_size < e_size) + return NULL; + + /* e_count too big; would probably fail the allocation below anyways. + * for typical use cases, e_count should be few thousand at most. */ + if (e_count > LC_MAX_ACTIVE) + return NULL; + + slot = kzalloc(e_count * sizeof(struct hlist_head*), GFP_KERNEL); + if (!slot) + goto out_fail; + element = kzalloc(e_count * sizeof(struct lc_element *), GFP_KERNEL); + if (!element) + goto out_fail; + + lc = kzalloc(sizeof(*lc), GFP_KERNEL); + if (!lc) + goto out_fail; + + INIT_LIST_HEAD(&lc->in_use); + INIT_LIST_HEAD(&lc->lru); + INIT_LIST_HEAD(&lc->free); + + lc->name = name; + lc->element_size = e_size; + lc->element_off = e_off; + lc->nr_elements = e_count; + lc->new_number = LC_FREE; + lc->lc_cache = cache; + lc->lc_element = element; + lc->lc_slot = slot; + + /* preallocate all objects */ + for (i = 0; i < e_count; i++) { + void *p = kmem_cache_alloc(cache, GFP_KERNEL); + if (!p) + break; + memset(p, 0, lc->element_size); + e = p + e_off; + e->lc_index = i; + e->lc_number = LC_FREE; + list_add(&e->list, &lc->free); + element[i] = e; + } + if (i == e_count) + return lc; + + /* else: could not allocate all elements, give up */ + for (i--; i; i--) { + void *p = element[i]; + kmem_cache_free(cache, p - e_off); + } + kfree(lc); +out_fail: + kfree(element); + kfree(slot); + return NULL; +} + +void lc_free_by_index(struct lru_cache *lc, unsigned i) +{ + void *p = lc->lc_element[i]; + WARN_ON(!p); + if (p) { + p -= lc->element_off; + kmem_cache_free(lc->lc_cache, p); + } +} + +/** + * lc_destroy - frees memory allocated by lc_create() + * @lc: the lru cache to destroy + */ +void lc_destroy(struct lru_cache *lc) +{ + unsigned i; + if (!lc) + return; + for (i = 0; i < lc->nr_elements; i++) + lc_free_by_index(lc, i); + kfree(lc->lc_element); + kfree(lc->lc_slot); + kfree(lc); +} + +/** + * lc_reset - does a full reset for @lc and the hash table slots. + * @lc: the lru cache to operate on + * + * It is roughly the equivalent of re-allocating a fresh lru_cache object, + * basically a short cut to lc_destroy(lc); lc = lc_create(...); + */ +void lc_reset(struct lru_cache *lc) +{ + unsigned i; + + INIT_LIST_HEAD(&lc->in_use); + INIT_LIST_HEAD(&lc->lru); + INIT_LIST_HEAD(&lc->free); + lc->used = 0; + lc->hits = 0; + lc->misses = 0; + lc->starving = 0; + lc->dirty = 0; + lc->changed = 0; + lc->flags = 0; + lc->changing_element = NULL; + lc->new_number = LC_FREE; + memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements); + + for (i = 0; i < lc->nr_elements; i++) { + struct lc_element *e = lc->lc_element[i]; + void *p = e; + p -= lc->element_off; + memset(p, 0, lc->element_size); + /* re-init it */ + e->lc_index = i; + e->lc_number = LC_FREE; + list_add(&e->list, &lc->free); + } +} + +/** + * lc_seq_printf_stats - print stats about @lc into @seq + * @seq: the seq_file to print into + * @lc: the lru cache to print statistics of + */ +size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) +{ + /* NOTE: + * total calls to lc_get are + * (starving + hits + misses) + * misses include "dirty" count (update from an other thread in + * progress) and "changed", when this in fact lead to an successful + * update of the cache. + */ + return seq_printf(seq, "\t%s: used:%u/%u " + "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n", + lc->name, lc->used, lc->nr_elements, + lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed); +} + +static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) +{ + return lc->lc_slot + (enr % lc->nr_elements); +} + + +/** + * lc_find - find element by label, if present in the hash table + * @lc: The lru_cache object + * @enr: element number + * + * Returns the pointer to an element, if the element with the requested + * "label" or element number is present in the hash table, + * or NULL if not found. Does not change the refcnt. + */ +struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr) +{ + struct hlist_node *n; + struct lc_element *e; + + BUG_ON(!lc); + BUG_ON(!lc->nr_elements); + hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) { + if (e->lc_number == enr) + return e; + } + return NULL; +} + +/* returned element will be "recycled" immediately */ +static struct lc_element *lc_evict(struct lru_cache *lc) +{ + struct list_head *n; + struct lc_element *e; + + if (list_empty(&lc->lru)) + return NULL; + + n = lc->lru.prev; + e = list_entry(n, struct lc_element, list); + + PARANOIA_LC_ELEMENT(lc, e); + + list_del(&e->list); + hlist_del(&e->colision); + return e; +} + +/** + * lc_del - removes an element from the cache + * @lc: The lru_cache object + * @e: The element to remove + * + * @e must be unused (refcnt == 0). Moves @e from "lru" to "free" list, + * sets @e->enr to %LC_FREE. + */ +void lc_del(struct lru_cache *lc, struct lc_element *e) +{ + PARANOIA_ENTRY(); + PARANOIA_LC_ELEMENT(lc, e); + BUG_ON(e->refcnt); + + e->lc_number = LC_FREE; + hlist_del_init(&e->colision); + list_move(&e->list, &lc->free); + RETURN(); +} + +static struct lc_element *lc_get_unused_element(struct lru_cache *lc) +{ + struct list_head *n; + + if (list_empty(&lc->free)) + return lc_evict(lc); + + n = lc->free.next; + list_del(n); + return list_entry(n, struct lc_element, list); +} + +static int lc_unused_element_available(struct lru_cache *lc) +{ + if (!list_empty(&lc->free)) + return 1; /* something on the free list */ + if (!list_empty(&lc->lru)) + return 1; /* something to evict */ + + return 0; +} + + +/** + * lc_get - get element by label, maybe change the active set + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Finds an element in the cache, increases its usage count, + * "touches" and returns it. + * + * In case the requested number is not present, it needs to be added to the + * cache. Therefore it is possible that an other element becomes evicted from + * the cache. In either case, the user is notified so he is able to e.g. keep + * a persistent log of the cache changes, and therefore the objects in use. + * + * Return values: + * NULL + * The cache was marked %LC_STARVING, + * or the requested label was not in the active set + * and a changing transaction is still pending (@lc was marked %LC_DIRTY). + * Or no unused or free element could be recycled (@lc will be marked as + * %LC_STARVING, blocking further lc_get() operations). + * + * pointer to the element with the REQUESTED element number. + * In this case, it can be used right away + * + * pointer to an UNUSED element with some different element number, + * where that different number may also be %LC_FREE. + * + * In this case, the cache is marked %LC_DIRTY (blocking further changes), + * and the returned element pointer is removed from the lru list and + * hash collision chains. The user now should do whatever housekeeping + * is necessary. + * Then he must call lc_changed(lc,element_pointer), to finish + * the change. + * + * NOTE: The user needs to check the lc_number on EACH use, so he recognizes + * any cache set change. + */ +struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e; + + PARANOIA_ENTRY(); + if (lc->flags & LC_STARVING) { + ++lc->starving; + RETURN(NULL); + } + + e = lc_find(lc, enr); + if (e) { + ++lc->hits; + if (e->refcnt++ == 0) + lc->used++; + list_move(&e->list, &lc->in_use); /* Not evictable... */ + RETURN(e); + } + + ++lc->misses; + + /* In case there is nothing available and we can not kick out + * the LRU element, we have to wait ... + */ + if (!lc_unused_element_available(lc)) { + __set_bit(__LC_STARVING, &lc->flags); + RETURN(NULL); + } + + /* it was not present in the active set. + * we are going to recycle an unused (or even "free") element. + * user may need to commit a transaction to record that change. + * we serialize on flags & TF_DIRTY */ + if (test_and_set_bit(__LC_DIRTY, &lc->flags)) { + ++lc->dirty; + RETURN(NULL); + } + + e = lc_get_unused_element(lc); + BUG_ON(!e); + + clear_bit(__LC_STARVING, &lc->flags); + BUG_ON(++e->refcnt != 1); + lc->used++; + + lc->changing_element = e; + lc->new_number = enr; + + RETURN(e); +} + +/* similar to lc_get, + * but only gets a new reference on an existing element. + * you either get the requested element, or NULL. + * will be consolidated into one function. + */ +struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e; + + PARANOIA_ENTRY(); + if (lc->flags & LC_STARVING) { + ++lc->starving; + RETURN(NULL); + } + + e = lc_find(lc, enr); + if (e) { + ++lc->hits; + if (e->refcnt++ == 0) + lc->used++; + list_move(&e->list, &lc->in_use); /* Not evictable... */ + } + RETURN(e); +} + +/** + * lc_changed - tell @lc that the change has been recorded + * @lc: the lru cache to operate on + * @e: the element pending label change + */ +void lc_changed(struct lru_cache *lc, struct lc_element *e) +{ + PARANOIA_ENTRY(); + BUG_ON(e != lc->changing_element); + PARANOIA_LC_ELEMENT(lc, e); + ++lc->changed; + e->lc_number = lc->new_number; + list_add(&e->list, &lc->in_use); + hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number)); + lc->changing_element = NULL; + lc->new_number = LC_FREE; + clear_bit(__LC_DIRTY, &lc->flags); + smp_mb__after_clear_bit(); + RETURN(); +} + + +/** + * lc_put - give up refcnt of @e + * @lc: the lru cache to operate on + * @e: the element to put + * + * If refcnt reaches zero, the element is moved to the lru list, + * and a %LC_STARVING (if set) is cleared. + * Returns the new (post-decrement) refcnt. + */ +unsigned int lc_put(struct lru_cache *lc, struct lc_element *e) +{ + PARANOIA_ENTRY(); + PARANOIA_LC_ELEMENT(lc, e); + BUG_ON(e->refcnt == 0); + BUG_ON(e == lc->changing_element); + if (--e->refcnt == 0) { + /* move it to the front of LRU. */ + list_move(&e->list, &lc->lru); + lc->used--; + clear_bit(__LC_STARVING, &lc->flags); + smp_mb__after_clear_bit(); + } + RETURN(e->refcnt); +} + +/** + * lc_element_by_index + * @lc: the lru cache to operate on + * @i: the index of the element to return + */ +struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i) +{ + BUG_ON(i >= lc->nr_elements); + BUG_ON(lc->lc_element[i] == NULL); + BUG_ON(lc->lc_element[i]->lc_index != i); + return lc->lc_element[i]; +} + +/** + * lc_index_of + * @lc: the lru cache to operate on + * @e: the element to query for its index position in lc->element + */ +unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e) +{ + PARANOIA_LC_ELEMENT(lc, e); + return e->lc_index; +} + +/** + * lc_set - associate index with label + * @lc: the lru cache to operate on + * @enr: the label to set + * @index: the element index to associate label with. + * + * Used to initialize the active set to some previously recorded state. + */ +void lc_set(struct lru_cache *lc, unsigned int enr, int index) +{ + struct lc_element *e; + + if (index < 0 || index >= lc->nr_elements) + return; + + e = lc_element_by_index(lc, index); + e->lc_number = enr; + + hlist_del_init(&e->colision); + hlist_add_head(&e->colision, lc_hash_slot(lc, enr)); + list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru); +} + +/** + * lc_dump - Dump a complete LRU cache to seq in textual form. + * @lc: the lru cache to operate on + * @seq: the &struct seq_file pointer to seq_printf into + * @utext: user supplied "heading" or other info + * @detail: function pointer the user may provide to dump further details + * of the object the lc_element is embedded in. + */ +void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, + void (*detail) (struct seq_file *, struct lc_element *)) +{ + unsigned int nr_elements = lc->nr_elements; + struct lc_element *e; + int i; + + seq_printf(seq, "\tnn: lc_number refcnt %s\n ", utext); + for (i = 0; i < nr_elements; i++) { + e = lc_element_by_index(lc, i); + if (e->lc_number == LC_FREE) { + seq_printf(seq, "\t%2d: FREE\n", i); + } else { + seq_printf(seq, "\t%2d: %4u %4u ", i, + e->lc_number, e->refcnt); + detail(seq, e); + } + } +} + +EXPORT_SYMBOL(lc_create); +EXPORT_SYMBOL(lc_reset); +EXPORT_SYMBOL(lc_destroy); +EXPORT_SYMBOL(lc_set); +EXPORT_SYMBOL(lc_del); +EXPORT_SYMBOL(lc_try_get); +EXPORT_SYMBOL(lc_find); +EXPORT_SYMBOL(lc_get); +EXPORT_SYMBOL(lc_put); +EXPORT_SYMBOL(lc_changed); +EXPORT_SYMBOL(lc_element_by_index); +EXPORT_SYMBOL(lc_index_of); +EXPORT_SYMBOL(lc_seq_printf_stats); +EXPORT_SYMBOL(lc_seq_dump_details); From ab8fafc2e1ecc0090f2c78902d3b992eec8b11f8 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 28 Sep 2009 10:28:01 +0200 Subject: [PATCH 005/113] dropping unneeded include autoconf.h It is force-included on the gcc command line since at least 2.6.15. Explicit include lines seem to break compilation now in certain configurations. Signed-off-by: Lars Ellenberg Signed-off-by: Kamalesh Babulal Acked-by: Sam Ravnborg --- drivers/block/drbd/drbd_main.c | 1 - drivers/block/drbd/drbd_nl.c | 1 - drivers/block/drbd/drbd_proc.c | 1 - drivers/block/drbd/drbd_receiver.c | 1 - drivers/block/drbd/drbd_req.c | 1 - drivers/block/drbd/drbd_req.h | 1 - drivers/block/drbd/drbd_worker.c | 1 - 7 files changed, 7 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index edf0b8031e6..80273f21a4a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -26,7 +26,6 @@ */ -#include #include #include #include diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1927acefe23..cfde31002df 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -23,7 +23,6 @@ */ -#include #include #include #include diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 98fcb7450c7..bdd0b4943b1 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -23,7 +23,6 @@ */ -#include #include #include diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 63686c4d85c..2f81821c2e0 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -23,7 +23,6 @@ */ -#include #include #include diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 0656cf1edd5..1aaa397669a 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -23,7 +23,6 @@ */ -#include #include #include diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index d37ab57f120..f22c1bc8ec7 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -25,7 +25,6 @@ #ifndef _DRBD_REQ_H #define _DRBD_REQ_H -#include #include #include diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 212e9545e63..34a4b3ef6c0 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -23,7 +23,6 @@ */ -#include #include #include #include From 6a0afdf58d40200abd0c717261d1bc4c49195c2f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 1 Oct 2009 09:04:14 +0200 Subject: [PATCH 006/113] drbd: remove tracing bits They should be reimplemented in the current scheme. Signed-off-by: Jens Axboe --- drivers/block/drbd/Kconfig | 11 - drivers/block/drbd/Makefile | 3 - drivers/block/drbd/drbd_actlog.c | 62 +-- drivers/block/drbd/drbd_int.h | 7 - drivers/block/drbd/drbd_main.c | 36 +- drivers/block/drbd/drbd_nl.c | 9 - drivers/block/drbd/drbd_receiver.c | 30 +- drivers/block/drbd/drbd_req.c | 11 - drivers/block/drbd/drbd_tracing.c | 752 ----------------------------- drivers/block/drbd/drbd_tracing.h | 87 ---- drivers/block/drbd/drbd_worker.c | 16 - 11 files changed, 3 insertions(+), 1021 deletions(-) delete mode 100644 drivers/block/drbd/drbd_tracing.c delete mode 100644 drivers/block/drbd/drbd_tracing.h diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig index 4e6f90f487c..f4acd04ebee 100644 --- a/drivers/block/drbd/Kconfig +++ b/drivers/block/drbd/Kconfig @@ -38,17 +38,6 @@ config BLK_DEV_DRBD If unsure, say N. -config DRBD_TRACE - tristate "DRBD tracing" - depends on BLK_DEV_DRBD - select TRACEPOINTS - default n - help - - Say Y here if you want to be able to trace various events in DRBD. - - If unsure, say N. - config DRBD_FAULT_INJECTION bool "DRBD fault injection" depends on BLK_DEV_DRBD diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile index 7d86ef8a8b4..0d3f337ff5f 100644 --- a/drivers/block/drbd/Makefile +++ b/drivers/block/drbd/Makefile @@ -2,7 +2,4 @@ drbd-y := drbd_bitmap.o drbd_proc.o drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o drbd-y += drbd_main.o drbd_strings.o drbd_nl.o -drbd_trace-y := drbd_tracing.o - obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o -obj-$(CONFIG_DRBD_TRACE) += drbd_trace.o diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 74b4835d310..17956ff6a08 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -26,7 +26,6 @@ #include #include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_wrappers.h" /* We maintain a trivial check sum in our on disk activity log. @@ -66,17 +65,6 @@ struct drbd_atodb_wait { int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); -/* The actual tracepoint needs to have constant number of known arguments... - */ -void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - trace__drbd_resync(mdev, level, fmt, ap); - va_end(ap); -} - static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, @@ -105,8 +93,6 @@ static int _drbd_md_sync_page_io(struct drbd_conf *mdev, bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; - trace_drbd_bio(mdev, "Md", bio, 0, NULL); - if (FAULT_ACTIVE(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else @@ -236,8 +222,6 @@ void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector) D_ASSERT(atomic_read(&mdev->local_cnt) > 0); - trace_drbd_actlog(mdev, sector, "al_begin_io"); - wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr))); if (al_ext->lc_number != enr) { @@ -270,8 +254,6 @@ void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector) struct lc_element *extent; unsigned long flags; - trace_drbd_actlog(mdev, sector, "al_complete_io"); - spin_lock_irqsave(&mdev->al_lock, flags); extent = lc_find(mdev->act_log, enr); @@ -967,10 +949,6 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size, ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1)); sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1); - trace_drbd_resync(mdev, TRACE_LVL_METRICS, - "drbd_set_in_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", - (unsigned long long)sector, size, sbnr, ebnr); - if (sbnr > ebnr) return; @@ -1045,10 +1023,6 @@ void __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, sbnr = BM_SECT_TO_BIT(sector); ebnr = BM_SECT_TO_BIT(esector); - trace_drbd_resync(mdev, TRACE_LVL_METRICS, - "drbd_set_out_of_sync: sector=%llus size=%u sbnr=%lu ebnr=%lu\n", - (unsigned long long)sector, size, sbnr, ebnr); - /* ok, (capacity & 7) != 0 sometimes, but who cares... * we count rs_{total,left} in bits, not sectors. */ spin_lock_irqsave(&mdev->al_lock, flags); @@ -1143,10 +1117,6 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector) struct bm_extent *bm_ext; int i, sig; - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "drbd_rs_begin_io: sector=%llus (rs_end=%d)\n", - (unsigned long long)sector, enr); - sig = wait_event_interruptible(mdev->al_wait, (bm_ext = _bme_get(mdev, enr))); if (sig) @@ -1192,9 +1162,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) struct bm_extent *bm_ext; int i; - trace_drbd_resync(mdev, TRACE_LVL_ALL, "drbd_try_rs_begin_io: sector=%llus\n", - (unsigned long long)sector); - spin_lock_irq(&mdev->al_lock); if (mdev->resync_wenr != LC_FREE && mdev->resync_wenr != enr) { /* in case you have very heavy scattered io, it may @@ -1210,11 +1177,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) * the lc_put here... * we also have to wake_up */ - - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "dropping %u, apparently got 'synced' by application io\n", - mdev->resync_wenr); - e = lc_find(mdev->resync, mdev->resync_wenr); bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; if (bm_ext) { @@ -1242,21 +1204,14 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) * but then could not set BME_LOCKED, * so we tried again. * drop the extra reference. */ - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "dropping extra reference on %u\n", enr); - bm_ext->lce.refcnt--; D_ASSERT(bm_ext->lce.refcnt > 0); } goto check_al; } else { /* do we rather want to try later? */ - if (mdev->resync_locked > mdev->resync->nr_elements-3) { - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "resync_locked = %u!\n", mdev->resync_locked); - + if (mdev->resync_locked > mdev->resync->nr_elements-3) goto try_again; - } /* Do or do not. There is no try. -- Yoda */ e = lc_get(mdev->resync, enr); bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; @@ -1281,8 +1236,6 @@ int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector) goto check_al; } check_al: - trace_drbd_resync(mdev, TRACE_LVL_ALL, "checking al for %u\n", enr); - for (i = 0; i < AL_EXT_PER_BM_SECT; i++) { if (unlikely(al_enr+i == mdev->act_log->new_number)) goto try_again; @@ -1296,7 +1249,6 @@ proceed: return 0; try_again: - trace_drbd_resync(mdev, TRACE_LVL_ALL, "need to try again for %u\n", enr); if (bm_ext) mdev->resync_wenr = enr; spin_unlock_irq(&mdev->al_lock); @@ -1310,10 +1262,6 @@ void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) struct bm_extent *bm_ext; unsigned long flags; - trace_drbd_resync(mdev, TRACE_LVL_ALL, - "drbd_rs_complete_io: sector=%llus (rs_enr=%d)\n", - (long long)sector, enr); - spin_lock_irqsave(&mdev->al_lock, flags); e = lc_find(mdev->resync, enr); bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL; @@ -1348,8 +1296,6 @@ void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector) */ void drbd_rs_cancel_all(struct drbd_conf *mdev) { - trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_cancel_all\n"); - spin_lock_irq(&mdev->al_lock); if (get_ldev_if_state(mdev, D_FAILED)) { /* Makes sure ->resync is there. */ @@ -1375,8 +1321,6 @@ int drbd_rs_del_all(struct drbd_conf *mdev) struct bm_extent *bm_ext; int i; - trace_drbd_resync(mdev, TRACE_LVL_METRICS, "drbd_rs_del_all\n"); - spin_lock_irq(&mdev->al_lock); if (get_ldev_if_state(mdev, D_FAILED)) { @@ -1429,10 +1373,6 @@ void drbd_rs_failed_io(struct drbd_conf *mdev, sector_t sector, int size) sector_t esector, nr_sectors; int wake_up = 0; - trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, - "drbd_rs_failed_io: sector=%llus, size=%u\n", - (unsigned long long)sector, size); - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_SEGMENT_SIZE) { dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n", (unsigned long long)sector, size); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8da602e010b..4e6255991e5 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -135,8 +135,6 @@ enum { DRBD_FAULT_MAX, }; -extern void trace_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, ...); - #ifdef CONFIG_DRBD_FAULT_INJECTION extern unsigned int _drbd_insert_fault(struct drbd_conf *mdev, unsigned int type); @@ -712,11 +710,6 @@ enum epoch_event { EV_GOT_BARRIER_NR, EV_BARRIER_DONE, EV_BECAME_LAST, - EV_TRACE_FLUSH, /* TRACE_ are not real events, only used for tracing */ - EV_TRACE_ADD_BARRIER, /* Doing the first write as a barrier write */ - EV_TRACE_SETTING_BI, /* Barrier is expressed with the first write of the next epoch */ - EV_TRACE_ALLOC, - EV_TRACE_FREE, EV_CLEANUP = 32, /* used as flag */ }; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 80273f21a4a..11d8ff6016a 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -53,7 +53,6 @@ #include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */ #include "drbd_vli.h" @@ -80,18 +79,6 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); -DEFINE_TRACE(drbd_unplug); -DEFINE_TRACE(drbd_uuid); -DEFINE_TRACE(drbd_ee); -DEFINE_TRACE(drbd_packet); -DEFINE_TRACE(drbd_md_io); -DEFINE_TRACE(drbd_epoch); -DEFINE_TRACE(drbd_netlink); -DEFINE_TRACE(drbd_actlog); -DEFINE_TRACE(drbd_bio); -DEFINE_TRACE(_drbd_resync); -DEFINE_TRACE(drbd_req); - MODULE_AUTHOR("Philipp Reisner , " "Lars Ellenberg "); MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION); @@ -1576,7 +1563,6 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, h->command = cpu_to_be16(cmd); h->length = cpu_to_be16(size-sizeof(struct p_header)); - trace_drbd_packet(mdev, sock, 0, (void *)h, __FILE__, __LINE__); sent = drbd_send(mdev, sock, h, size, msg_flags); ok = (sent == size); @@ -1628,8 +1614,6 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data, if (!drbd_get_data_sock(mdev)) return 0; - trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&h, __FILE__, __LINE__); - ok = (sizeof(h) == drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0)); ok = ok && (size == @@ -2359,7 +2343,6 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) dp_flags |= DP_MAY_SET_IN_SYNC; p.dp_flags = cpu_to_be32(dp_flags); - trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); set_bit(UNPLUG_REMOTE, &mdev->flags); ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE)); @@ -2410,7 +2393,6 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd, if (!drbd_get_data_sock(mdev)) return 0; - trace_drbd_packet(mdev, mdev->data.socket, 0, (void *)&p, __FILE__, __LINE__); ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), MSG_MORE); if (ok && dgs) { @@ -2546,8 +2528,6 @@ static void drbd_unplug_fn(struct request_queue *q) { struct drbd_conf *mdev = q->queuedata; - trace_drbd_unplug(mdev, "got unplugged"); - /* unplug FIRST */ spin_lock_irq(q->queue_lock); blk_remove_plug(q); @@ -3252,8 +3232,6 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!get_ldev_if_state(mdev, D_FAILED)) return; - trace_drbd_md_io(mdev, WRITE, mdev->ldev); - mutex_lock(&mdev->md_io_mutex); buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); memset(buffer, 0, 512); @@ -3308,8 +3286,6 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - trace_drbd_md_io(mdev, READ, bdev); - mutex_lock(&mdev->md_io_mutex); buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); @@ -3388,11 +3364,8 @@ static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local) { int i; - for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) { + for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i]; - - trace_drbd_uuid(mdev, i+1); - } } void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) @@ -3407,7 +3380,6 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) } mdev->ldev->md.uuid[idx] = val; - trace_drbd_uuid(mdev, idx); drbd_md_mark_dirty(mdev); } @@ -3417,7 +3389,6 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local) if (mdev->ldev->md.uuid[idx]) { drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx]; - trace_drbd_uuid(mdev, UI_HISTORY_START); } _drbd_uuid_set(mdev, idx, val); } @@ -3436,7 +3407,6 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local) dev_info(DEV, "Creating new current UUID\n"); D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0); mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT]; - trace_drbd_uuid(mdev, UI_BITMAP); get_random_bytes(&val, sizeof(u64)); _drbd_uuid_set(mdev, UI_CURRENT, val); @@ -3451,8 +3421,6 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) drbd_uuid_move_history(mdev); mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP]; mdev->ldev->md.uuid[UI_BITMAP] = 0; - trace_drbd_uuid(mdev, UI_HISTORY_START); - trace_drbd_uuid(mdev, UI_BITMAP); } else { if (mdev->ldev->md.uuid[UI_BITMAP]) dev_warn(DEV, "bm UUID already set"); @@ -3460,7 +3428,6 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local) mdev->ldev->md.uuid[UI_BITMAP] = val; mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1); - trace_drbd_uuid(mdev, UI_BITMAP); } drbd_md_mark_dirty(mdev); } @@ -3727,7 +3694,6 @@ const char *drbd_buildtag(void) module_init(drbd_init) module_exit(drbd_cleanup) -/* For drbd_tracing: */ EXPORT_SYMBOL(drbd_conn_str); EXPORT_SYMBOL(drbd_role_str); EXPORT_SYMBOL(drbd_disk_str); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index cfde31002df..73c55ccb629 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -33,7 +33,6 @@ #include #include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_wrappers.h" #include #include @@ -2024,8 +2023,6 @@ static void drbd_connector_callback(struct cn_msg *req) goto fail; } - trace_drbd_netlink(req, 1); - if (nlp->packet_type >= P_nl_after_last_packet) { retcode = ERR_PACKET_NR; goto fail; @@ -2063,7 +2060,6 @@ static void drbd_connector_callback(struct cn_msg *req) cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr; cn_reply->flags = 0; - trace_drbd_netlink(cn_reply, 0); rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL); if (rr && rr != -ESRCH) printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); @@ -2157,7 +2153,6 @@ void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state) reply->minor = mdev_to_minor(mdev); reply->ret_code = NO_ERROR; - trace_drbd_netlink(cn_reply, 0); cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); } @@ -2190,7 +2185,6 @@ void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name) reply->minor = mdev_to_minor(mdev); reply->ret_code = NO_ERROR; - trace_drbd_netlink(cn_reply, 0); cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); } @@ -2262,7 +2256,6 @@ void drbd_bcast_ee(struct drbd_conf *mdev, reply->minor = mdev_to_minor(mdev); reply->ret_code = NO_ERROR; - trace_drbd_netlink(cn_reply, 0); cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); kfree(cn_reply); } @@ -2302,7 +2295,6 @@ void drbd_bcast_sync_progress(struct drbd_conf *mdev) reply->minor = mdev_to_minor(mdev); reply->ret_code = NO_ERROR; - trace_drbd_netlink(cn_reply, 0); cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); } @@ -2356,7 +2348,6 @@ void drbd_nl_send_reply(struct cn_msg *req, int ret_code) reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor; reply->ret_code = ret_code; - trace_drbd_netlink(cn_reply, 0); rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO); if (rr && rr != -ESRCH) printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 2f81821c2e0..360baf60f57 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -47,7 +47,6 @@ #include #include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_req.h" #include "drbd_vli.h" @@ -350,8 +349,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, e->epoch = NULL; e->flags = 0; - trace_drbd_ee(mdev, e, "allocated"); - return e; fail2: @@ -366,7 +363,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e) { struct bio *bio = e->private_bio; - trace_drbd_ee(mdev, e, "freed"); drbd_pp_free_bio_pages(mdev, bio); bio_put(bio); D_ASSERT(hlist_unhashed(&e->colision)); @@ -420,7 +416,6 @@ static int drbd_process_done_ee(struct drbd_conf *mdev) * all ignore the last argument. */ list_for_each_entry_safe(e, t, &work_list, w.list) { - trace_drbd_ee(mdev, e, "process_done_ee"); /* list_del not necessary, next/prev members not touched */ ok = e->w.cb(mdev, &e->w, !ok) && ok; drbd_free_ee(mdev, e); @@ -1021,8 +1016,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, break; } - trace_drbd_epoch(mdev, epoch, ev); - if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) && @@ -1054,7 +1047,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, list_del(&epoch->list); ev = EV_BECAME_LAST | (ev & EV_CLEANUP); mdev->epochs--; - trace_drbd_epoch(mdev, epoch, EV_TRACE_FREE); kfree(epoch); if (rv == FE_STILL_LIVE) @@ -1080,7 +1072,6 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, struct flush_work *fw; fw = kmalloc(sizeof(*fw), GFP_ATOMIC); if (fw) { - trace_drbd_epoch(mdev, epoch, EV_TRACE_FLUSH); fw->w.cb = w_flush; fw->epoch = epoch; drbd_queue_work(&mdev->data.work, &fw->w); @@ -1251,7 +1242,6 @@ static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h) list_add(&epoch->list, &mdev->current_epoch->list); mdev->current_epoch = epoch; mdev->epochs++; - trace_drbd_epoch(mdev, epoch, EV_TRACE_ALLOC); } else { /* The current_epoch got recycled while we allocated this one... */ kfree(epoch); @@ -1458,8 +1448,6 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si list_add(&e->w.list, &mdev->sync_ee); spin_unlock_irq(&mdev->req_lock); - trace_drbd_ee(mdev, e, "submitting for (rs)write"); - trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); drbd_generic_make_request(mdev, DRBD_FAULT_RS_WR, e->private_bio); /* accounting done in endio */ @@ -1721,16 +1709,13 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) epoch = list_entry(e->epoch->list.prev, struct drbd_epoch, list); if (epoch == e->epoch) { set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); rw |= (1<flags |= EE_IS_BARRIER; } else { if (atomic_read(&epoch->epoch_size) > 1 || !test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags)) { set_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags); - trace_drbd_epoch(mdev, epoch, EV_TRACE_SETTING_BI); set_bit(DE_CONTAINS_A_BARRIER, &e->epoch->flags); - trace_drbd_epoch(mdev, e->epoch, EV_TRACE_ADD_BARRIER); rw |= (1<flags |= EE_IS_BARRIER; } @@ -1905,8 +1890,6 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h) } e->private_bio->bi_rw = rw; - trace_drbd_ee(mdev, e, "submitting for (data)write"); - trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); drbd_generic_make_request(mdev, DRBD_FAULT_DT_WR, e->private_bio); /* accounting done in endio */ @@ -2065,8 +2048,6 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h) inc_unacked(mdev); - trace_drbd_ee(mdev, e, "submitting for read"); - trace_drbd_bio(mdev, "Sec", e->private_bio, 0, NULL); drbd_generic_make_request(mdev, fault_type, e->private_bio); maybe_kick_lo(mdev); @@ -3543,9 +3524,6 @@ static void drbdd(struct drbd_conf *mdev) drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); break; } - - trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, - __FILE__, __LINE__); } } @@ -3825,9 +3803,6 @@ static int drbd_do_handshake(struct drbd_conf *mdev) return 0; } - trace_drbd_packet(mdev, mdev->data.socket, 2, &mdev->data.rbuf, - __FILE__, __LINE__); - p->protocol_min = be32_to_cpu(p->protocol_min); p->protocol_max = be32_to_cpu(p->protocol_max); if (p->protocol_max == 0) @@ -4420,14 +4395,11 @@ int drbd_asender(struct drbd_thread *thi) goto disconnect; } expect = cmd->pkt_size; - ERR_IF(len != expect-sizeof(struct p_header)) { - trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); + ERR_IF(len != expect-sizeof(struct p_header)) goto reconnect; - } } if (received == expect) { D_ASSERT(cmd != NULL); - trace_drbd_packet(mdev, mdev->meta.socket, 1, (void *)h, __FILE__, __LINE__); if (!cmd->process(mdev, h)) goto reconnect; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 1aaa397669a..3678d3d66c6 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -28,7 +28,6 @@ #include #include #include "drbd_int.h" -#include "drbd_tracing.h" #include "drbd_req.h" @@ -218,7 +217,6 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m) { - trace_drbd_bio(mdev, "Rq", m->bio, 1, NULL); bio_endio(m->bio, m->error); dec_ap_bio(mdev); } @@ -236,8 +234,6 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) /* only WRITES may end up here without a master bio (on barrier ack) */ int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; - trace_drbd_req(req, nothing, "_req_may_be_done"); - /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) * not yet acknowledged by the peer @@ -415,8 +411,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, struct drbd_conf *mdev = req->mdev; m->bio = NULL; - trace_drbd_req(req, what, NULL); - switch (what) { default: dev_err(DEV, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__); @@ -666,7 +660,6 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, * this is bad, because if the connection is lost now, * we won't be able to clean them up... */ dev_err(DEV, "FIXME (barrier_acked but pending)\n"); - trace_drbd_req(req, nothing, "FIXME (barrier_acked but pending)"); list_move(&req->tl_requests, &mdev->out_of_sequence_requests); } D_ASSERT(req->rq_state & RQ_NET_SENT); @@ -736,8 +729,6 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio) return 0; } - trace_drbd_bio(mdev, "Rq", bio, 0, req); - local = get_ldev(mdev); if (!local) { bio_put(req->private_bio); /* or we get a bio leak */ @@ -928,8 +919,6 @@ allocate_barrier: if (local) { req->private_bio->bi_bdev = mdev->ldev->backing_bdev; - trace_drbd_bio(mdev, "Pri", req->private_bio, 0, NULL); - if (FAULT_ACTIVE(mdev, rw == WRITE ? DRBD_FAULT_DT_WR : rw == READ ? DRBD_FAULT_DT_RD : DRBD_FAULT_DT_RA)) diff --git a/drivers/block/drbd/drbd_tracing.c b/drivers/block/drbd/drbd_tracing.c deleted file mode 100644 index d18d4f7b4be..00000000000 --- a/drivers/block/drbd/drbd_tracing.c +++ /dev/null @@ -1,752 +0,0 @@ -/* - drbd_tracing.c - - This file is part of DRBD by Philipp Reisner and Lars Ellenberg. - - Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. - Copyright (C) 2003-2008, Philipp Reisner . - Copyright (C) 2003-2008, Lars Ellenberg . - - drbd is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - drbd is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with drbd; see the file COPYING. If not, write to - the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - - */ - -#include -#include -#include -#include "drbd_int.h" -#include "drbd_tracing.h" -#include - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Philipp Reisner, Lars Ellenberg"); -MODULE_DESCRIPTION("DRBD tracepoint probes"); -MODULE_PARM_DESC(trace_mask, "Bitmap of events to trace see drbd_tracing.c"); -MODULE_PARM_DESC(trace_level, "Current tracing level (changeable in /sys)"); -MODULE_PARM_DESC(trace_devs, "Bitmap of devices to trace (changeable in /sys)"); - -unsigned int trace_mask = 0; /* Bitmap of events to trace */ -int trace_level; /* Current trace level */ -int trace_devs; /* Bitmap of devices to trace */ - -module_param(trace_mask, uint, 0444); -module_param(trace_level, int, 0644); -module_param(trace_devs, int, 0644); - -enum { - TRACE_PACKET = 0x0001, - TRACE_RQ = 0x0002, - TRACE_UUID = 0x0004, - TRACE_RESYNC = 0x0008, - TRACE_EE = 0x0010, - TRACE_UNPLUG = 0x0020, - TRACE_NL = 0x0040, - TRACE_AL_EXT = 0x0080, - TRACE_INT_RQ = 0x0100, - TRACE_MD_IO = 0x0200, - TRACE_EPOCH = 0x0400, -}; - -/* Buffer printing support - * dbg_print_flags: used for Flags arg to drbd_print_buffer - * - DBGPRINT_BUFFADDR; if set, each line starts with the - * virtual address of the line being output. If clear, - * each line starts with the offset from the beginning - * of the buffer. */ -enum dbg_print_flags { - DBGPRINT_BUFFADDR = 0x0001, -}; - -/* Macro stuff */ -static char *nl_packet_name(int packet_type) -{ -/* Generate packet type strings */ -#define NL_PACKET(name, number, fields) \ - [P_ ## name] = # name, -#define NL_INTEGER Argh! -#define NL_BIT Argh! -#define NL_INT64 Argh! -#define NL_STRING Argh! - - static char *nl_tag_name[P_nl_after_last_packet] = { -#include "linux/drbd_nl.h" - }; - - return (packet_type < sizeof(nl_tag_name)/sizeof(nl_tag_name[0])) ? - nl_tag_name[packet_type] : "*Unknown*"; -} -/* /Macro stuff */ - -static inline int is_mdev_trace(struct drbd_conf *mdev, unsigned int level) -{ - return trace_level >= level && ((1 << mdev_to_minor(mdev)) & trace_devs); -} - -static void probe_drbd_unplug(struct drbd_conf *mdev, char *msg) -{ - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, "%s, ap_bio_count=%d\n", msg, atomic_read(&mdev->ap_bio_cnt)); -} - -static void probe_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index) -{ - static char *uuid_str[UI_EXTENDED_SIZE] = { - [UI_CURRENT] = "CURRENT", - [UI_BITMAP] = "BITMAP", - [UI_HISTORY_START] = "HISTORY_START", - [UI_HISTORY_END] = "HISTORY_END", - [UI_SIZE] = "SIZE", - [UI_FLAGS] = "FLAGS", - }; - - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - if (index >= UI_EXTENDED_SIZE) { - dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n"); - return; - } - - dev_info(DEV, " uuid[%s] now %016llX\n", - uuid_str[index], - (unsigned long long)mdev->ldev->md.uuid[index]); -} - -static void probe_drbd_md_io(struct drbd_conf *mdev, int rw, - struct drbd_backing_dev *bdev) -{ - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, " %s metadata superblock now\n", - rw == READ ? "Reading" : "Writing"); -} - -static void probe_drbd_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg) -{ - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, "EE %s sec=%llus size=%u e=%p\n", - msg, (unsigned long long)e->sector, e->size, e); -} - -static void probe_drbd_epoch(struct drbd_conf *mdev, struct drbd_epoch *epoch, - enum epoch_event ev) -{ - static char *epoch_event_str[] = { - [EV_PUT] = "put", - [EV_GOT_BARRIER_NR] = "got_barrier_nr", - [EV_BARRIER_DONE] = "barrier_done", - [EV_BECAME_LAST] = "became_last", - [EV_TRACE_FLUSH] = "issuing_flush", - [EV_TRACE_ADD_BARRIER] = "added_barrier", - [EV_TRACE_SETTING_BI] = "just set barrier_in_next_epoch", - }; - - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - ev &= ~EV_CLEANUP; - - switch (ev) { - case EV_TRACE_ALLOC: - dev_info(DEV, "Allocate epoch %p/xxxx { } nr_epochs=%d\n", epoch, mdev->epochs); - break; - case EV_TRACE_FREE: - dev_info(DEV, "Freeing epoch %p/%d { size=%d } nr_epochs=%d\n", - epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), - mdev->epochs); - break; - default: - dev_info(DEV, "Update epoch %p/%d { size=%d active=%d %c%c n%c%c } ev=%s\n", - epoch, epoch->barrier_nr, atomic_read(&epoch->epoch_size), - atomic_read(&epoch->active), - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) ? 'n' : '-', - test_bit(DE_CONTAINS_A_BARRIER, &epoch->flags) ? 'b' : '-', - test_bit(DE_BARRIER_IN_NEXT_EPOCH_ISSUED, &epoch->flags) ? 'i' : '-', - test_bit(DE_BARRIER_IN_NEXT_EPOCH_DONE, &epoch->flags) ? 'd' : '-', - epoch_event_str[ev]); - } -} - -static void probe_drbd_netlink(void *data, int is_req) -{ - struct cn_msg *msg = data; - - if (is_req) { - struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)msg->data; - - printk(KERN_INFO "drbd%d: " - "Netlink: << %s (%d) - seq: %x, ack: %x, len: %x\n", - nlp->drbd_minor, - nl_packet_name(nlp->packet_type), - nlp->packet_type, - msg->seq, msg->ack, msg->len); - } else { - struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply *)msg->data; - - printk(KERN_INFO "drbd%d: " - "Netlink: >> %s (%d) - seq: %x, ack: %x, len: %x\n", - nlp->minor, - nlp->packet_type == P_nl_after_last_packet ? - "Empty-Reply" : nl_packet_name(nlp->packet_type), - nlp->packet_type, - msg->seq, msg->ack, msg->len); - } -} - -static void probe_drbd_actlog(struct drbd_conf *mdev, sector_t sector, char* msg) -{ - unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9)); - - if (!is_mdev_trace(mdev, TRACE_LVL_ALWAYS)) - return; - - dev_info(DEV, "%s (sec=%llus, al_enr=%u, rs_enr=%d)\n", - msg, (unsigned long long) sector, enr, - (int)BM_SECT_TO_EXT(sector)); -} - -/** - * drbd_print_buffer() - Hexdump arbitrary binary data into a buffer - * @prefix: String is output at the beginning of each line output. - * @flags: Currently only defined flag: DBGPRINT_BUFFADDR; if set, each - * line starts with the virtual address of the line being - * output. If clear, each line starts with the offset from the - * beginning of the buffer. - * @size: Indicates the size of each entry in the buffer. Supported - * values are sizeof(char), sizeof(short) and sizeof(int) - * @buffer: Start address of buffer - * @buffer_va: Virtual address of start of buffer (normally the same - * as Buffer, but having it separate allows it to hold - * file address for example) - * @length: length of buffer - */ -static void drbd_print_buffer(const char *prefix, unsigned int flags, int size, - const void *buffer, const void *buffer_va, - unsigned int length) - -#define LINE_SIZE 16 -#define LINE_ENTRIES (int)(LINE_SIZE/size) -{ - const unsigned char *pstart; - const unsigned char *pstart_va; - const unsigned char *pend; - char bytes_str[LINE_SIZE*3+8], ascii_str[LINE_SIZE+8]; - char *pbytes = bytes_str, *pascii = ascii_str; - int offset = 0; - long sizemask; - int field_width; - int index; - const unsigned char *pend_str; - const unsigned char *p; - int count; - - /* verify size parameter */ - if (size != sizeof(char) && - size != sizeof(short) && - size != sizeof(int)) { - printk(KERN_DEBUG "drbd_print_buffer: " - "ERROR invalid size %d\n", size); - return; - } - - sizemask = size-1; - field_width = size*2; - - /* Adjust start/end to be on appropriate boundary for size */ - buffer = (const char *)((long)buffer & ~sizemask); - pend = (const unsigned char *) - (((long)buffer + length + sizemask) & ~sizemask); - - if (flags & DBGPRINT_BUFFADDR) { - /* Move start back to nearest multiple of line size, - * if printing address. This results in nicely formatted output - * with addresses being on line size (16) byte boundaries */ - pstart = (const unsigned char *)((long)buffer & ~(LINE_SIZE-1)); - } else { - pstart = (const unsigned char *)buffer; - } - - /* Set value of start VA to print if addresses asked for */ - pstart_va = (const unsigned char *)buffer_va - - ((const unsigned char *)buffer-pstart); - - /* Calculate end position to nicely align right hand side */ - pend_str = pstart + (((pend-pstart) + LINE_SIZE-1) & ~(LINE_SIZE-1)); - - /* Init strings */ - *pbytes = *pascii = '\0'; - - /* Start at beginning of first line */ - p = pstart; - count = 0; - - while (p < pend_str) { - if (p < (const unsigned char *)buffer || p >= pend) { - /* Before start of buffer or after end- print spaces */ - pbytes += sprintf(pbytes, "%*c ", field_width, ' '); - pascii += sprintf(pascii, "%*c", size, ' '); - p += size; - } else { - /* Add hex and ascii to strings */ - int val; - switch (size) { - default: - case 1: - val = *(unsigned char *)p; - break; - case 2: - val = *(unsigned short *)p; - break; - case 4: - val = *(unsigned int *)p; - break; - } - - pbytes += sprintf(pbytes, "%0*x ", field_width, val); - - for (index = size; index; index--) { - *pascii++ = isprint(*p) ? *p : '.'; - p++; - } - } - - count++; - - if (count == LINE_ENTRIES || p >= pend_str) { - /* Null terminate and print record */ - *pascii = '\0'; - printk(KERN_DEBUG "%s%8.8lx: %*s|%*s|\n", - prefix, - (flags & DBGPRINT_BUFFADDR) - ? (long)pstart_va:(long)offset, - LINE_ENTRIES*(field_width+1), bytes_str, - LINE_SIZE, ascii_str); - - /* Move onto next line */ - pstart_va += (p-pstart); - pstart = p; - count = 0; - offset += LINE_SIZE; - - /* Re-init strings */ - pbytes = bytes_str; - pascii = ascii_str; - *pbytes = *pascii = '\0'; - } - } -} - -static void probe_drbd_resync(struct drbd_conf *mdev, int level, const char *fmt, va_list args) -{ - char str[256]; - - if (!is_mdev_trace(mdev, level)) - return; - - if (vsnprintf(str, 256, fmt, args) >= 256) - str[255] = 0; - - printk(KERN_INFO "%s %s: %s", dev_driver_string(disk_to_dev(mdev->vdisk)), - dev_name(disk_to_dev(mdev->vdisk)), str); -} - -static void probe_drbd_bio(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, - struct drbd_request *r) -{ -#if defined(CONFIG_LBDAF) || defined(CONFIG_LBD) -#define SECTOR_FORMAT "%Lx" -#else -#define SECTOR_FORMAT "%lx" -#endif -#define SECTOR_SHIFT 9 - - unsigned long lowaddr = (unsigned long)(bio->bi_sector << SECTOR_SHIFT); - char *faddr = (char *)(lowaddr); - char rb[sizeof(void *)*2+6] = { 0, }; - struct bio_vec *bvec; - int segno; - - const int rw = bio->bi_rw; - const int biorw = (rw & (RW_MASK|RWA_MASK)); - const int biobarrier = (rw & (1<>>", - pfx, - biorw == WRITE ? "Write" : "Read", - biobarrier ? " : B" : "", - biosync ? " : S" : "", - bio, - rb, - complete ? (bio_flagged(bio, BIO_UPTODATE) ? "Success, " : "Failed, ") : "", - bio->bi_sector << SECTOR_SHIFT, - bio->bi_size); - - if (trace_level >= TRACE_LVL_METRICS && - ((biorw == WRITE) ^ complete)) { - printk(KERN_DEBUG " ind page offset length\n"); - __bio_for_each_segment(bvec, bio, segno, 0) { - printk(KERN_DEBUG " [%d] %p %8.8x %8.8x\n", segno, - bvec->bv_page, bvec->bv_offset, bvec->bv_len); - - if (trace_level >= TRACE_LVL_ALL) { - char *bvec_buf; - unsigned long flags; - - bvec_buf = bvec_kmap_irq(bvec, &flags); - - drbd_print_buffer(" ", DBGPRINT_BUFFADDR, 1, - bvec_buf, - faddr, - (bvec->bv_len <= 0x80) - ? bvec->bv_len : 0x80); - - bvec_kunmap_irq(bvec_buf, &flags); - - if (bvec->bv_len > 0x40) - printk(KERN_DEBUG " ....\n"); - - faddr += bvec->bv_len; - } - } - } -} - -static void probe_drbd_req(struct drbd_request *req, enum drbd_req_event what, char *msg) -{ - static const char *rq_event_names[] = { - [created] = "created", - [to_be_send] = "to_be_send", - [to_be_submitted] = "to_be_submitted", - [queue_for_net_write] = "queue_for_net_write", - [queue_for_net_read] = "queue_for_net_read", - [send_canceled] = "send_canceled", - [send_failed] = "send_failed", - [handed_over_to_network] = "handed_over_to_network", - [connection_lost_while_pending] = - "connection_lost_while_pending", - [recv_acked_by_peer] = "recv_acked_by_peer", - [write_acked_by_peer] = "write_acked_by_peer", - [neg_acked] = "neg_acked", - [conflict_discarded_by_peer] = "conflict_discarded_by_peer", - [barrier_acked] = "barrier_acked", - [data_received] = "data_received", - [read_completed_with_error] = "read_completed_with_error", - [read_ahead_completed_with_error] = "reada_completed_with_error", - [write_completed_with_error] = "write_completed_with_error", - [completed_ok] = "completed_ok", - }; - - struct drbd_conf *mdev = req->mdev; - - const int rw = (req->master_bio == NULL || - bio_data_dir(req->master_bio) == WRITE) ? - 'W' : 'R'; - const unsigned long s = req->rq_state; - - if (what != nothing) { - dev_info(DEV, "__req_mod(%p %c ,%s)\n", req, rw, rq_event_names[what]); - } else { - dev_info(DEV, "%s %p %c L%c%c%cN%c%c%c%c%c %u (%llus +%u) %s\n", - msg, req, rw, - s & RQ_LOCAL_PENDING ? 'p' : '-', - s & RQ_LOCAL_COMPLETED ? 'c' : '-', - s & RQ_LOCAL_OK ? 'o' : '-', - s & RQ_NET_PENDING ? 'p' : '-', - s & RQ_NET_QUEUED ? 'q' : '-', - s & RQ_NET_SENT ? 's' : '-', - s & RQ_NET_DONE ? 'd' : '-', - s & RQ_NET_OK ? 'o' : '-', - req->epoch, - (unsigned long long)req->sector, - req->size, - drbd_conn_str(mdev->state.conn)); - } -} - - -#define drbd_peer_str drbd_role_str -#define drbd_pdsk_str drbd_disk_str - -#define PSM(A) \ -do { \ - if (mask.A) { \ - int i = snprintf(p, len, " " #A "( %s )", \ - drbd_##A##_str(val.A)); \ - if (i >= len) \ - return op; \ - p += i; \ - len -= i; \ - } \ -} while (0) - -static char *dump_st(char *p, int len, union drbd_state mask, union drbd_state val) -{ - char *op = p; - *p = '\0'; - PSM(role); - PSM(peer); - PSM(conn); - PSM(disk); - PSM(pdsk); - - return op; -} - -#define INFOP(fmt, args...) \ -do { \ - if (trace_level >= TRACE_LVL_ALL) { \ - dev_info(DEV, "%s:%d: %s [%d] %s %s " fmt , \ - file, line, current->comm, current->pid, \ - sockname, recv ? "<<<" : ">>>" , \ - ## args); \ - } else { \ - dev_info(DEV, "%s %s " fmt, sockname, \ - recv ? "<<<" : ">>>" , \ - ## args); \ - } \ -} while (0) - -static char *_dump_block_id(u64 block_id, char *buff) -{ - if (is_syncer_block_id(block_id)) - strcpy(buff, "SyncerId"); - else - sprintf(buff, "%llx", (unsigned long long)block_id); - - return buff; -} - -static void probe_drbd_packet(struct drbd_conf *mdev, struct socket *sock, - int recv, union p_polymorph *p, char *file, int line) -{ - char *sockname = sock == mdev->meta.socket ? "meta" : "data"; - int cmd = (recv == 2) ? p->header.command : be16_to_cpu(p->header.command); - char tmp[300]; - union drbd_state m, v; - - switch (cmd) { - case P_HAND_SHAKE: - INFOP("%s (protocol %u-%u)\n", cmdname(cmd), - be32_to_cpu(p->handshake.protocol_min), - be32_to_cpu(p->handshake.protocol_max)); - break; - - case P_BITMAP: /* don't report this */ - case P_COMPRESSED_BITMAP: /* don't report this */ - break; - - case P_DATA: - INFOP("%s (sector %llus, id %s, seq %u, f %x)\n", cmdname(cmd), - (unsigned long long)be64_to_cpu(p->data.sector), - _dump_block_id(p->data.block_id, tmp), - be32_to_cpu(p->data.seq_num), - be32_to_cpu(p->data.dp_flags) - ); - break; - - case P_DATA_REPLY: - case P_RS_DATA_REPLY: - INFOP("%s (sector %llus, id %s)\n", cmdname(cmd), - (unsigned long long)be64_to_cpu(p->data.sector), - _dump_block_id(p->data.block_id, tmp) - ); - break; - - case P_RECV_ACK: - case P_WRITE_ACK: - case P_RS_WRITE_ACK: - case P_DISCARD_ACK: - case P_NEG_ACK: - case P_NEG_RS_DREPLY: - INFOP("%s (sector %llus, size %u, id %s, seq %u)\n", - cmdname(cmd), - (long long)be64_to_cpu(p->block_ack.sector), - be32_to_cpu(p->block_ack.blksize), - _dump_block_id(p->block_ack.block_id, tmp), - be32_to_cpu(p->block_ack.seq_num) - ); - break; - - case P_DATA_REQUEST: - case P_RS_DATA_REQUEST: - INFOP("%s (sector %llus, size %u, id %s)\n", cmdname(cmd), - (long long)be64_to_cpu(p->block_req.sector), - be32_to_cpu(p->block_req.blksize), - _dump_block_id(p->block_req.block_id, tmp) - ); - break; - - case P_BARRIER: - case P_BARRIER_ACK: - INFOP("%s (barrier %u)\n", cmdname(cmd), p->barrier.barrier); - break; - - case P_SYNC_PARAM: - case P_SYNC_PARAM89: - INFOP("%s (rate %u, verify-alg \"%.64s\", csums-alg \"%.64s\")\n", - cmdname(cmd), be32_to_cpu(p->rs_param_89.rate), - p->rs_param_89.verify_alg, p->rs_param_89.csums_alg); - break; - - case P_UUIDS: - INFOP("%s Curr:%016llX, Bitmap:%016llX, " - "HisSt:%016llX, HisEnd:%016llX\n", - cmdname(cmd), - (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_CURRENT]), - (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_BITMAP]), - (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_START]), - (unsigned long long)be64_to_cpu(p->uuids.uuid[UI_HISTORY_END])); - break; - - case P_SIZES: - INFOP("%s (d %lluMiB, u %lluMiB, c %lldMiB, " - "max bio %x, q order %x)\n", - cmdname(cmd), - (long long)(be64_to_cpu(p->sizes.d_size)>>(20-9)), - (long long)(be64_to_cpu(p->sizes.u_size)>>(20-9)), - (long long)(be64_to_cpu(p->sizes.c_size)>>(20-9)), - be32_to_cpu(p->sizes.max_segment_size), - be32_to_cpu(p->sizes.queue_order_type)); - break; - - case P_STATE: - v.i = be32_to_cpu(p->state.state); - m.i = 0xffffffff; - dump_st(tmp, sizeof(tmp), m, v); - INFOP("%s (s %x {%s})\n", cmdname(cmd), v.i, tmp); - break; - - case P_STATE_CHG_REQ: - m.i = be32_to_cpu(p->req_state.mask); - v.i = be32_to_cpu(p->req_state.val); - dump_st(tmp, sizeof(tmp), m, v); - INFOP("%s (m %x v %x {%s})\n", cmdname(cmd), m.i, v.i, tmp); - break; - - case P_STATE_CHG_REPLY: - INFOP("%s (ret %x)\n", cmdname(cmd), - be32_to_cpu(p->req_state_reply.retcode)); - break; - - case P_PING: - case P_PING_ACK: - /* - * Dont trace pings at summary level - */ - if (trace_level < TRACE_LVL_ALL) - break; - /* fall through... */ - default: - INFOP("%s (%u)\n", cmdname(cmd), cmd); - break; - } -} - - -static int __init drbd_trace_init(void) -{ - int ret; - - if (trace_mask & TRACE_UNPLUG) { - ret = register_trace_drbd_unplug(probe_drbd_unplug); - WARN_ON(ret); - } - if (trace_mask & TRACE_UUID) { - ret = register_trace_drbd_uuid(probe_drbd_uuid); - WARN_ON(ret); - } - if (trace_mask & TRACE_EE) { - ret = register_trace_drbd_ee(probe_drbd_ee); - WARN_ON(ret); - } - if (trace_mask & TRACE_PACKET) { - ret = register_trace_drbd_packet(probe_drbd_packet); - WARN_ON(ret); - } - if (trace_mask & TRACE_MD_IO) { - ret = register_trace_drbd_md_io(probe_drbd_md_io); - WARN_ON(ret); - } - if (trace_mask & TRACE_EPOCH) { - ret = register_trace_drbd_epoch(probe_drbd_epoch); - WARN_ON(ret); - } - if (trace_mask & TRACE_NL) { - ret = register_trace_drbd_netlink(probe_drbd_netlink); - WARN_ON(ret); - } - if (trace_mask & TRACE_AL_EXT) { - ret = register_trace_drbd_actlog(probe_drbd_actlog); - WARN_ON(ret); - } - if (trace_mask & TRACE_RQ) { - ret = register_trace_drbd_bio(probe_drbd_bio); - WARN_ON(ret); - } - if (trace_mask & TRACE_INT_RQ) { - ret = register_trace_drbd_req(probe_drbd_req); - WARN_ON(ret); - } - if (trace_mask & TRACE_RESYNC) { - ret = register_trace__drbd_resync(probe_drbd_resync); - WARN_ON(ret); - } - return 0; -} - -module_init(drbd_trace_init); - -static void __exit drbd_trace_exit(void) -{ - if (trace_mask & TRACE_UNPLUG) - unregister_trace_drbd_unplug(probe_drbd_unplug); - if (trace_mask & TRACE_UUID) - unregister_trace_drbd_uuid(probe_drbd_uuid); - if (trace_mask & TRACE_EE) - unregister_trace_drbd_ee(probe_drbd_ee); - if (trace_mask & TRACE_PACKET) - unregister_trace_drbd_packet(probe_drbd_packet); - if (trace_mask & TRACE_MD_IO) - unregister_trace_drbd_md_io(probe_drbd_md_io); - if (trace_mask & TRACE_EPOCH) - unregister_trace_drbd_epoch(probe_drbd_epoch); - if (trace_mask & TRACE_NL) - unregister_trace_drbd_netlink(probe_drbd_netlink); - if (trace_mask & TRACE_AL_EXT) - unregister_trace_drbd_actlog(probe_drbd_actlog); - if (trace_mask & TRACE_RQ) - unregister_trace_drbd_bio(probe_drbd_bio); - if (trace_mask & TRACE_INT_RQ) - unregister_trace_drbd_req(probe_drbd_req); - if (trace_mask & TRACE_RESYNC) - unregister_trace__drbd_resync(probe_drbd_resync); - - tracepoint_synchronize_unregister(); -} - -module_exit(drbd_trace_exit); diff --git a/drivers/block/drbd/drbd_tracing.h b/drivers/block/drbd/drbd_tracing.h deleted file mode 100644 index c4531a137f6..00000000000 --- a/drivers/block/drbd/drbd_tracing.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - drbd_tracing.h - - This file is part of DRBD by Philipp Reisner and Lars Ellenberg. - - Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. - Copyright (C) 2003-2008, Philipp Reisner . - Copyright (C) 2003-2008, Lars Ellenberg . - - drbd is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - drbd is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with drbd; see the file COPYING. If not, write to - the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. - - */ - -#ifndef DRBD_TRACING_H -#define DRBD_TRACING_H - -#include -#include "drbd_int.h" -#include "drbd_req.h" - -enum { - TRACE_LVL_ALWAYS = 0, - TRACE_LVL_SUMMARY, - TRACE_LVL_METRICS, - TRACE_LVL_ALL, - TRACE_LVL_MAX -}; - -DECLARE_TRACE(drbd_unplug, - TP_PROTO(struct drbd_conf *mdev, char* msg), - TP_ARGS(mdev, msg)); - -DECLARE_TRACE(drbd_uuid, - TP_PROTO(struct drbd_conf *mdev, enum drbd_uuid_index index), - TP_ARGS(mdev, index)); - -DECLARE_TRACE(drbd_ee, - TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch_entry *e, char* msg), - TP_ARGS(mdev, e, msg)); - -DECLARE_TRACE(drbd_md_io, - TP_PROTO(struct drbd_conf *mdev, int rw, struct drbd_backing_dev *bdev), - TP_ARGS(mdev, rw, bdev)); - -DECLARE_TRACE(drbd_epoch, - TP_PROTO(struct drbd_conf *mdev, struct drbd_epoch *epoch, enum epoch_event ev), - TP_ARGS(mdev, epoch, ev)); - -DECLARE_TRACE(drbd_netlink, - TP_PROTO(void *data, int is_req), - TP_ARGS(data, is_req)); - -DECLARE_TRACE(drbd_actlog, - TP_PROTO(struct drbd_conf *mdev, sector_t sector, char* msg), - TP_ARGS(mdev, sector, msg)); - -DECLARE_TRACE(drbd_bio, - TP_PROTO(struct drbd_conf *mdev, const char *pfx, struct bio *bio, int complete, - struct drbd_request *r), - TP_ARGS(mdev, pfx, bio, complete, r)); - -DECLARE_TRACE(drbd_req, - TP_PROTO(struct drbd_request *req, enum drbd_req_event what, char *msg), - TP_ARGS(req, what, msg)); - -DECLARE_TRACE(drbd_packet, - TP_PROTO(struct drbd_conf *mdev, struct socket *sock, - int recv, union p_polymorph *p, char *file, int line), - TP_ARGS(mdev, sock, recv, p, file, line)); - -DECLARE_TRACE(_drbd_resync, - TP_PROTO(struct drbd_conf *mdev, int level, const char *fmt, va_list args), - TP_ARGS(mdev, level, fmt, args)); - -#endif diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 34a4b3ef6c0..ed8796f1112 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -40,7 +40,6 @@ #include "drbd_int.h" #include "drbd_req.h" -#include "drbd_tracing.h" #define SLEEP_TIME (HZ/10) @@ -82,8 +81,6 @@ void drbd_md_io_complete(struct bio *bio, int error) md_io = (struct drbd_md_io *)bio->bi_private; md_io->error = error; - trace_drbd_bio(md_io->mdev, "Md", bio, 1, NULL); - complete(&md_io->event); } @@ -114,8 +111,6 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) D_ASSERT(e->block_id != ID_VACANT); - trace_drbd_bio(mdev, "Sec", bio, 1, NULL); - spin_lock_irqsave(&mdev->req_lock, flags); mdev->read_cnt += e->size >> 9; list_del(&e->w.list); @@ -126,8 +121,6 @@ void drbd_endio_read_sec(struct bio *bio, int error) __releases(local) drbd_chk_io_error(mdev, error, FALSE); drbd_queue_work(&mdev->data.work, &e->w); put_ldev(mdev); - - trace_drbd_ee(mdev, e, "read completed"); } /* writes on behalf of the partner, or resync writes, @@ -176,8 +169,6 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) D_ASSERT(e->block_id != ID_VACANT); - trace_drbd_bio(mdev, "Sec", bio, 1, NULL); - spin_lock_irqsave(&mdev->req_lock, flags); mdev->writ_cnt += e->size >> 9; is_syncer_req = is_syncer_block_id(e->block_id); @@ -192,8 +183,6 @@ void drbd_endio_write_sec(struct bio *bio, int error) __releases(local) list_del(&e->w.list); /* has been on active_ee or sync_ee */ list_add_tail(&e->w.list, &mdev->done_ee); - trace_drbd_ee(mdev, e, "write completed"); - /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, * neither did we wake possibly waiting conflicting requests. * done from "drbd_process_done_ee" within the appropriate w.cb @@ -244,8 +233,6 @@ void drbd_endio_pri(struct bio *bio, int error) error = -EIO; } - trace_drbd_bio(mdev, "Pri", bio, 1, NULL); - /* to avoid recursion in __req_mod */ if (unlikely(error)) { what = (bio_data_dir(bio) == WRITE) @@ -1321,9 +1308,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) return; } - trace_drbd_resync(mdev, TRACE_LVL_SUMMARY, "Resync starting: side=%s\n", - side == C_SYNC_TARGET ? "SyncTarget" : "SyncSource"); - /* In case a previous resync run was aborted by an IO error/detach on the peer. */ drbd_rs_cancel_all(mdev); From 492af6350a5ccf087e4964104a276ed358811458 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2009 09:37:51 +0200 Subject: [PATCH 007/113] block: remove the anticipatory IO scheduler AS is mostly a subset of CFQ, so there's little point in still providing this separate IO scheduler. Hopefully at some point we can get down to one single IO scheduler again, at least this brings us closer by having only one intelligent IO scheduler. Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 22 +- block/Makefile | 1 - block/as-iosched.c | 1520 ----------------------------------------- block/elevator.c | 10 +- 4 files changed, 6 insertions(+), 1547 deletions(-) delete mode 100644 block/as-iosched.c diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 7e803fc8877..baad3dae365 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -12,24 +12,14 @@ config IOSCHED_NOOP that do their own scheduling and require only minimal assistance from the kernel. -config IOSCHED_AS - tristate "Anticipatory I/O scheduler" - default y - ---help--- - The anticipatory I/O scheduler is generally a good choice for most - environments, but is quite large and complex when compared to the - deadline I/O scheduler, it can also be slower in some cases - especially some database loads. - config IOSCHED_DEADLINE tristate "Deadline I/O scheduler" default y ---help--- - The deadline I/O scheduler is simple and compact, and is often as - good as the anticipatory I/O scheduler, and in some database - workloads, better. In the case of a single process performing I/O to - a disk at any one time, its behaviour is almost identical to the - anticipatory I/O scheduler and so is a good choice. + The deadline I/O scheduler is simple and compact. It will provide + CSCAN service with FIFO expiration of requests, switching to + a new point in the service tree and doing a batch of IO from there + in case of expiry. config IOSCHED_CFQ tristate "CFQ I/O scheduler" @@ -47,9 +37,6 @@ choice Select the I/O scheduler which will be used by default for all block devices. - config DEFAULT_AS - bool "Anticipatory" if IOSCHED_AS=y - config DEFAULT_DEADLINE bool "Deadline" if IOSCHED_DEADLINE=y @@ -63,7 +50,6 @@ endchoice config DEFAULT_IOSCHED string - default "anticipatory" if DEFAULT_AS default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ default "noop" if DEFAULT_NOOP diff --git a/block/Makefile b/block/Makefile index ba74ca6bfa1..7914108952f 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,6 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ obj-$(CONFIG_BLK_DEV_BSG) += bsg.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o -obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/as-iosched.c b/block/as-iosched.c deleted file mode 100644 index ce8ba57c655..00000000000 --- a/block/as-iosched.c +++ /dev/null @@ -1,1520 +0,0 @@ -/* - * Anticipatory & deadline i/o scheduler. - * - * Copyright (C) 2002 Jens Axboe - * Nick Piggin - * - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * See Documentation/block/as-iosched.txt - */ - -/* - * max time before a read is submitted. - */ -#define default_read_expire (HZ / 8) - -/* - * ditto for writes, these limits are not hard, even - * if the disk is capable of satisfying them. - */ -#define default_write_expire (HZ / 4) - -/* - * read_batch_expire describes how long we will allow a stream of reads to - * persist before looking to see whether it is time to switch over to writes. - */ -#define default_read_batch_expire (HZ / 2) - -/* - * write_batch_expire describes how long we want a stream of writes to run for. - * This is not a hard limit, but a target we set for the auto-tuning thingy. - * See, the problem is: we can send a lot of writes to disk cache / TCQ in - * a short amount of time... - */ -#define default_write_batch_expire (HZ / 8) - -/* - * max time we may wait to anticipate a read (default around 6ms) - */ -#define default_antic_expire ((HZ / 150) ? HZ / 150 : 1) - -/* - * Keep track of up to 20ms thinktimes. We can go as big as we like here, - * however huge values tend to interfere and not decay fast enough. A program - * might be in a non-io phase of operation. Waiting on user input for example, - * or doing a lengthy computation. A small penalty can be justified there, and - * will still catch out those processes that constantly have large thinktimes. - */ -#define MAX_THINKTIME (HZ/50UL) - -/* Bits in as_io_context.state */ -enum as_io_states { - AS_TASK_RUNNING=0, /* Process has not exited */ - AS_TASK_IOSTARTED, /* Process has started some IO */ - AS_TASK_IORUNNING, /* Process has completed some IO */ -}; - -enum anticipation_status { - ANTIC_OFF=0, /* Not anticipating (normal operation) */ - ANTIC_WAIT_REQ, /* The last read has not yet completed */ - ANTIC_WAIT_NEXT, /* Currently anticipating a request vs - last read (which has completed) */ - ANTIC_FINISHED, /* Anticipating but have found a candidate - * or timed out */ -}; - -struct as_data { - /* - * run time data - */ - - struct request_queue *q; /* the "owner" queue */ - - /* - * requests (as_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; - - struct request *next_rq[2]; /* next in sort order */ - sector_t last_sector[2]; /* last SYNC & ASYNC sectors */ - - unsigned long exit_prob; /* probability a task will exit while - being waited on */ - unsigned long exit_no_coop; /* probablility an exited task will - not be part of a later cooperating - request */ - unsigned long new_ttime_total; /* mean thinktime on new proc */ - unsigned long new_ttime_mean; - u64 new_seek_total; /* mean seek on new proc */ - sector_t new_seek_mean; - - unsigned long current_batch_expires; - unsigned long last_check_fifo[2]; - int changed_batch; /* 1: waiting for old batch to end */ - int new_batch; /* 1: waiting on first read complete */ - int batch_data_dir; /* current batch SYNC / ASYNC */ - int write_batch_count; /* max # of reqs in a write batch */ - int current_write_count; /* how many requests left this batch */ - int write_batch_idled; /* has the write batch gone idle? */ - - enum anticipation_status antic_status; - unsigned long antic_start; /* jiffies: when it started */ - struct timer_list antic_timer; /* anticipatory scheduling timer */ - struct work_struct antic_work; /* Deferred unplugging */ - struct io_context *io_context; /* Identify the expected process */ - int ioc_finished; /* IO associated with io_context is finished */ - int nr_dispatched; - - /* - * settings that change how the i/o scheduler behaves - */ - unsigned long fifo_expire[2]; - unsigned long batch_expire[2]; - unsigned long antic_expire; -}; - -/* - * per-request data. - */ -enum arq_state { - AS_RQ_NEW=0, /* New - not referenced and not on any lists */ - AS_RQ_QUEUED, /* In the request queue. It belongs to the - scheduler */ - AS_RQ_DISPATCHED, /* On the dispatch list. It belongs to the - driver now */ - AS_RQ_PRESCHED, /* Debug poisoning for requests being used */ - AS_RQ_REMOVED, - AS_RQ_MERGED, - AS_RQ_POSTSCHED, /* when they shouldn't be */ -}; - -#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private) -#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) -#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) - -static DEFINE_PER_CPU(unsigned long, as_ioc_count); -static struct completion *ioc_gone; -static DEFINE_SPINLOCK(ioc_gone_lock); - -static void as_move_to_dispatch(struct as_data *ad, struct request *rq); -static void as_antic_stop(struct as_data *ad); - -/* - * IO Context helper functions - */ - -/* Called to deallocate the as_io_context */ -static void free_as_io_context(struct as_io_context *aic) -{ - kfree(aic); - elv_ioc_count_dec(as_ioc_count); - if (ioc_gone) { - /* - * AS scheduler is exiting, grab exit lock and check - * the pending io context count. If it hits zero, - * complete ioc_gone and set it back to NULL. - */ - spin_lock(&ioc_gone_lock); - if (ioc_gone && !elv_ioc_count_read(as_ioc_count)) { - complete(ioc_gone); - ioc_gone = NULL; - } - spin_unlock(&ioc_gone_lock); - } -} - -static void as_trim(struct io_context *ioc) -{ - spin_lock_irq(&ioc->lock); - if (ioc->aic) - free_as_io_context(ioc->aic); - ioc->aic = NULL; - spin_unlock_irq(&ioc->lock); -} - -/* Called when the task exits */ -static void exit_as_io_context(struct as_io_context *aic) -{ - WARN_ON(!test_bit(AS_TASK_RUNNING, &aic->state)); - clear_bit(AS_TASK_RUNNING, &aic->state); -} - -static struct as_io_context *alloc_as_io_context(void) -{ - struct as_io_context *ret; - - ret = kmalloc(sizeof(*ret), GFP_ATOMIC); - if (ret) { - ret->dtor = free_as_io_context; - ret->exit = exit_as_io_context; - ret->state = 1 << AS_TASK_RUNNING; - atomic_set(&ret->nr_queued, 0); - atomic_set(&ret->nr_dispatched, 0); - spin_lock_init(&ret->lock); - ret->ttime_total = 0; - ret->ttime_samples = 0; - ret->ttime_mean = 0; - ret->seek_total = 0; - ret->seek_samples = 0; - ret->seek_mean = 0; - elv_ioc_count_inc(as_ioc_count); - } - - return ret; -} - -/* - * If the current task has no AS IO context then create one and initialise it. - * Then take a ref on the task's io context and return it. - */ -static struct io_context *as_get_io_context(int node) -{ - struct io_context *ioc = get_io_context(GFP_ATOMIC, node); - if (ioc && !ioc->aic) { - ioc->aic = alloc_as_io_context(); - if (!ioc->aic) { - put_io_context(ioc); - ioc = NULL; - } - } - return ioc; -} - -static void as_put_io_context(struct request *rq) -{ - struct as_io_context *aic; - - if (unlikely(!RQ_IOC(rq))) - return; - - aic = RQ_IOC(rq)->aic; - - if (rq_is_sync(rq) && aic) { - unsigned long flags; - - spin_lock_irqsave(&aic->lock, flags); - set_bit(AS_TASK_IORUNNING, &aic->state); - aic->last_end_request = jiffies; - spin_unlock_irqrestore(&aic->lock, flags); - } - - put_io_context(RQ_IOC(rq)); -} - -/* - * rb tree support functions - */ -#define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) - -static void as_add_rq_rb(struct as_data *ad, struct request *rq) -{ - struct request *alias; - - while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { - as_move_to_dispatch(ad, alias); - as_antic_stop(ad); - } -} - -static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) -{ - elv_rb_del(RQ_RB_ROOT(ad, rq), rq); -} - -/* - * IO Scheduler proper - */ - -#define MAXBACK (1024 * 1024) /* - * Maximum distance the disk will go backward - * for a request. - */ - -#define BACK_PENALTY 2 - -/* - * as_choose_req selects the preferred one of two requests of the same data_dir - * ignoring time - eg. timeouts, which is the job of as_dispatch_request - */ -static struct request * -as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) -{ - int data_dir; - sector_t last, s1, s2, d1, d2; - int r1_wrap=0, r2_wrap=0; /* requests are behind the disk head */ - const sector_t maxback = MAXBACK; - - if (rq1 == NULL || rq1 == rq2) - return rq2; - if (rq2 == NULL) - return rq1; - - data_dir = rq_is_sync(rq1); - - last = ad->last_sector[data_dir]; - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - BUG_ON(data_dir != rq_is_sync(rq2)); - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. - */ - if (s1 >= last) - d1 = s1 - last; - else if (s1+maxback >= last) - d1 = (last - s1)*BACK_PENALTY; - else { - r1_wrap = 1; - d1 = 0; /* shut up, gcc */ - } - - if (s2 >= last) - d2 = s2 - last; - else if (s2+maxback >= last) - d2 = (last - s2)*BACK_PENALTY; - else { - r2_wrap = 1; - d2 = 0; - } - - /* Found required data */ - if (!r1_wrap && r2_wrap) - return rq1; - else if (!r2_wrap && r1_wrap) - return rq2; - else if (r1_wrap && r2_wrap) { - /* both behind the head */ - if (s1 <= s2) - return rq1; - else - return rq2; - } - - /* Both requests in front of the head */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - else { - if (s1 >= s2) - return rq1; - else - return rq2; - } -} - -/* - * as_find_next_rq finds the next request after @prev in elevator order. - * this with as_choose_req form the basis for how the scheduler chooses - * what request to process next. Anticipation works on top of this. - */ -static struct request * -as_find_next_rq(struct as_data *ad, struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev) - prev = rb_entry_rq(rbprev); - - if (rbnext) - next = rb_entry_rq(rbnext); - else { - const int data_dir = rq_is_sync(last); - - rbnext = rb_first(&ad->sort_list[data_dir]); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return as_choose_req(ad, next, prev); -} - -/* - * anticipatory scheduling functions follow - */ - -/* - * as_antic_expired tells us when we have anticipated too long. - * The funny "absolute difference" math on the elapsed time is to handle - * jiffy wraps, and disks which have been idle for 0x80000000 jiffies. - */ -static int as_antic_expired(struct as_data *ad) -{ - long delta_jif; - - delta_jif = jiffies - ad->antic_start; - if (unlikely(delta_jif < 0)) - delta_jif = -delta_jif; - if (delta_jif < ad->antic_expire) - return 0; - - return 1; -} - -/* - * as_antic_waitnext starts anticipating that a nice request will soon be - * submitted. See also as_antic_waitreq - */ -static void as_antic_waitnext(struct as_data *ad) -{ - unsigned long timeout; - - BUG_ON(ad->antic_status != ANTIC_OFF - && ad->antic_status != ANTIC_WAIT_REQ); - - timeout = ad->antic_start + ad->antic_expire; - - mod_timer(&ad->antic_timer, timeout); - - ad->antic_status = ANTIC_WAIT_NEXT; -} - -/* - * as_antic_waitreq starts anticipating. We don't start timing the anticipation - * until the request that we're anticipating on has finished. This means we - * are timing from when the candidate process wakes up hopefully. - */ -static void as_antic_waitreq(struct as_data *ad) -{ - BUG_ON(ad->antic_status == ANTIC_FINISHED); - if (ad->antic_status == ANTIC_OFF) { - if (!ad->io_context || ad->ioc_finished) - as_antic_waitnext(ad); - else - ad->antic_status = ANTIC_WAIT_REQ; - } -} - -/* - * This is called directly by the functions in this file to stop anticipation. - * We kill the timer and schedule a call to the request_fn asap. - */ -static void as_antic_stop(struct as_data *ad) -{ - int status = ad->antic_status; - - if (status == ANTIC_WAIT_REQ || status == ANTIC_WAIT_NEXT) { - if (status == ANTIC_WAIT_NEXT) - del_timer(&ad->antic_timer); - ad->antic_status = ANTIC_FINISHED; - /* see as_work_handler */ - kblockd_schedule_work(ad->q, &ad->antic_work); - } -} - -/* - * as_antic_timeout is the timer function set by as_antic_waitnext. - */ -static void as_antic_timeout(unsigned long data) -{ - struct request_queue *q = (struct request_queue *)data; - struct as_data *ad = q->elevator->elevator_data; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - if (ad->antic_status == ANTIC_WAIT_REQ - || ad->antic_status == ANTIC_WAIT_NEXT) { - struct as_io_context *aic; - spin_lock(&ad->io_context->lock); - aic = ad->io_context->aic; - - ad->antic_status = ANTIC_FINISHED; - kblockd_schedule_work(q, &ad->antic_work); - - if (aic->ttime_samples == 0) { - /* process anticipated on has exited or timed out*/ - ad->exit_prob = (7*ad->exit_prob + 256)/8; - } - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { - /* process not "saved" by a cooperating request */ - ad->exit_no_coop = (7*ad->exit_no_coop + 256)/8; - } - spin_unlock(&ad->io_context->lock); - } - spin_unlock_irqrestore(q->queue_lock, flags); -} - -static void as_update_thinktime(struct as_data *ad, struct as_io_context *aic, - unsigned long ttime) -{ - /* fixed point: 1.0 == 1<<8 */ - if (aic->ttime_samples == 0) { - ad->new_ttime_total = (7*ad->new_ttime_total + 256*ttime) / 8; - ad->new_ttime_mean = ad->new_ttime_total / 256; - - ad->exit_prob = (7*ad->exit_prob)/8; - } - aic->ttime_samples = (7*aic->ttime_samples + 256) / 8; - aic->ttime_total = (7*aic->ttime_total + 256*ttime) / 8; - aic->ttime_mean = (aic->ttime_total + 128) / aic->ttime_samples; -} - -static void as_update_seekdist(struct as_data *ad, struct as_io_context *aic, - sector_t sdist) -{ - u64 total; - - if (aic->seek_samples == 0) { - ad->new_seek_total = (7*ad->new_seek_total + 256*(u64)sdist)/8; - ad->new_seek_mean = ad->new_seek_total / 256; - } - - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (aic->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*1024); - else - sdist = min(sdist, (aic->seek_mean * 4) + 2*1024*64); - - aic->seek_samples = (7*aic->seek_samples + 256) / 8; - aic->seek_total = (7*aic->seek_total + (u64)256*sdist) / 8; - total = aic->seek_total + (aic->seek_samples/2); - do_div(total, aic->seek_samples); - aic->seek_mean = (sector_t)total; -} - -/* - * as_update_iohist keeps a decaying histogram of IO thinktimes, and - * updates @aic->ttime_mean based on that. It is called when a new - * request is queued. - */ -static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, - struct request *rq) -{ - int data_dir = rq_is_sync(rq); - unsigned long thinktime = 0; - sector_t seek_dist; - - if (aic == NULL) - return; - - if (data_dir == BLK_RW_SYNC) { - unsigned long in_flight = atomic_read(&aic->nr_queued) - + atomic_read(&aic->nr_dispatched); - spin_lock(&aic->lock); - if (test_bit(AS_TASK_IORUNNING, &aic->state) || - test_bit(AS_TASK_IOSTARTED, &aic->state)) { - /* Calculate read -> read thinktime */ - if (test_bit(AS_TASK_IORUNNING, &aic->state) - && in_flight == 0) { - thinktime = jiffies - aic->last_end_request; - thinktime = min(thinktime, MAX_THINKTIME-1); - } - as_update_thinktime(ad, aic, thinktime); - - /* Calculate read -> read seek distance */ - if (aic->last_request_pos < blk_rq_pos(rq)) - seek_dist = blk_rq_pos(rq) - - aic->last_request_pos; - else - seek_dist = aic->last_request_pos - - blk_rq_pos(rq); - as_update_seekdist(ad, aic, seek_dist); - } - aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - set_bit(AS_TASK_IOSTARTED, &aic->state); - spin_unlock(&aic->lock); - } -} - -/* - * as_close_req decides if one request is considered "close" to the - * previous one issued. - */ -static int as_close_req(struct as_data *ad, struct as_io_context *aic, - struct request *rq) -{ - unsigned long delay; /* jiffies */ - sector_t last = ad->last_sector[ad->batch_data_dir]; - sector_t next = blk_rq_pos(rq); - sector_t delta; /* acceptable close offset (in sectors) */ - sector_t s; - - if (ad->antic_status == ANTIC_OFF || !ad->ioc_finished) - delay = 0; - else - delay = jiffies - ad->antic_start; - - if (delay == 0) - delta = 8192; - else if (delay <= (20 * HZ / 1000) && delay <= ad->antic_expire) - delta = 8192 << delay; - else - return 1; - - if ((last <= next + (delta>>1)) && (next <= last + delta)) - return 1; - - if (last < next) - s = next - last; - else - s = last - next; - - if (aic->seek_samples == 0) { - /* - * Process has just started IO. Use past statistics to - * gauge success possibility - */ - if (ad->new_seek_mean > s) { - /* this request is better than what we're expecting */ - return 1; - } - - } else { - if (aic->seek_mean > s) { - /* this request is better than what we're expecting */ - return 1; - } - } - - return 0; -} - -/* - * as_can_break_anticipation returns true if we have been anticipating this - * request. - * - * It also returns true if the process against which we are anticipating - * submits a write - that's presumably an fsync, O_SYNC write, etc. We want to - * dispatch it ASAP, because we know that application will not be submitting - * any new reads. - * - * If the task which has submitted the request has exited, break anticipation. - * - * If this task has queued some other IO, do not enter enticipation. - */ -static int as_can_break_anticipation(struct as_data *ad, struct request *rq) -{ - struct io_context *ioc; - struct as_io_context *aic; - - ioc = ad->io_context; - BUG_ON(!ioc); - spin_lock(&ioc->lock); - - if (rq && ioc == RQ_IOC(rq)) { - /* request from same process */ - spin_unlock(&ioc->lock); - return 1; - } - - if (ad->ioc_finished && as_antic_expired(ad)) { - /* - * In this situation status should really be FINISHED, - * however the timer hasn't had the chance to run yet. - */ - spin_unlock(&ioc->lock); - return 1; - } - - aic = ioc->aic; - if (!aic) { - spin_unlock(&ioc->lock); - return 0; - } - - if (atomic_read(&aic->nr_queued) > 0) { - /* process has more requests queued */ - spin_unlock(&ioc->lock); - return 1; - } - - if (atomic_read(&aic->nr_dispatched) > 0) { - /* process has more requests dispatched */ - spin_unlock(&ioc->lock); - return 1; - } - - if (rq && rq_is_sync(rq) && as_close_req(ad, aic, rq)) { - /* - * Found a close request that is not one of ours. - * - * This makes close requests from another process update - * our IO history. Is generally useful when there are - * two or more cooperating processes working in the same - * area. - */ - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { - if (aic->ttime_samples == 0) - ad->exit_prob = (7*ad->exit_prob + 256)/8; - - ad->exit_no_coop = (7*ad->exit_no_coop)/8; - } - - as_update_iohist(ad, aic, rq); - spin_unlock(&ioc->lock); - return 1; - } - - if (!test_bit(AS_TASK_RUNNING, &aic->state)) { - /* process anticipated on has exited */ - if (aic->ttime_samples == 0) - ad->exit_prob = (7*ad->exit_prob + 256)/8; - - if (ad->exit_no_coop > 128) { - spin_unlock(&ioc->lock); - return 1; - } - } - - if (aic->ttime_samples == 0) { - if (ad->new_ttime_mean > ad->antic_expire) { - spin_unlock(&ioc->lock); - return 1; - } - if (ad->exit_prob * ad->exit_no_coop > 128*256) { - spin_unlock(&ioc->lock); - return 1; - } - } else if (aic->ttime_mean > ad->antic_expire) { - /* the process thinks too much between requests */ - spin_unlock(&ioc->lock); - return 1; - } - spin_unlock(&ioc->lock); - return 0; -} - -/* - * as_can_anticipate indicates whether we should either run rq - * or keep anticipating a better request. - */ -static int as_can_anticipate(struct as_data *ad, struct request *rq) -{ -#if 0 /* disable for now, we need to check tag level as well */ - /* - * SSD device without seek penalty, disable idling - */ - if (blk_queue_nonrot(ad->q)) axman - return 0; -#endif - - if (!ad->io_context) - /* - * Last request submitted was a write - */ - return 0; - - if (ad->antic_status == ANTIC_FINISHED) - /* - * Don't restart if we have just finished. Run the next request - */ - return 0; - - if (as_can_break_anticipation(ad, rq)) - /* - * This request is a good candidate. Don't keep anticipating, - * run it. - */ - return 0; - - /* - * OK from here, we haven't finished, and don't have a decent request! - * Status is either ANTIC_OFF so start waiting, - * ANTIC_WAIT_REQ so continue waiting for request to finish - * or ANTIC_WAIT_NEXT so continue waiting for an acceptable request. - */ - - return 1; -} - -/* - * as_update_rq must be called whenever a request (rq) is added to - * the sort_list. This function keeps caches up to date, and checks if the - * request might be one we are "anticipating" - */ -static void as_update_rq(struct as_data *ad, struct request *rq) -{ - const int data_dir = rq_is_sync(rq); - - /* keep the next_rq cache up to date */ - ad->next_rq[data_dir] = as_choose_req(ad, rq, ad->next_rq[data_dir]); - - /* - * have we been anticipating this request? - * or does it come from the same process as the one we are anticipating - * for? - */ - if (ad->antic_status == ANTIC_WAIT_REQ - || ad->antic_status == ANTIC_WAIT_NEXT) { - if (as_can_break_anticipation(ad, rq)) - as_antic_stop(ad); - } -} - -/* - * Gathers timings and resizes the write batch automatically - */ -static void update_write_batch(struct as_data *ad) -{ - unsigned long batch = ad->batch_expire[BLK_RW_ASYNC]; - long write_time; - - write_time = (jiffies - ad->current_batch_expires) + batch; - if (write_time < 0) - write_time = 0; - - if (write_time > batch && !ad->write_batch_idled) { - if (write_time > batch * 3) - ad->write_batch_count /= 2; - else - ad->write_batch_count--; - } else if (write_time < batch && ad->current_write_count == 0) { - if (batch > write_time * 3) - ad->write_batch_count *= 2; - else - ad->write_batch_count++; - } - - if (ad->write_batch_count < 1) - ad->write_batch_count = 1; -} - -/* - * as_completed_request is to be called when a request has completed and - * returned something to the requesting process, be it an error or data. - */ -static void as_completed_request(struct request_queue *q, struct request *rq) -{ - struct as_data *ad = q->elevator->elevator_data; - - WARN_ON(!list_empty(&rq->queuelist)); - - if (RQ_STATE(rq) != AS_RQ_REMOVED) { - WARN(1, "rq->state %d\n", RQ_STATE(rq)); - goto out; - } - - if (ad->changed_batch && ad->nr_dispatched == 1) { - ad->current_batch_expires = jiffies + - ad->batch_expire[ad->batch_data_dir]; - kblockd_schedule_work(q, &ad->antic_work); - ad->changed_batch = 0; - - if (ad->batch_data_dir == BLK_RW_SYNC) - ad->new_batch = 1; - } - WARN_ON(ad->nr_dispatched == 0); - ad->nr_dispatched--; - - /* - * Start counting the batch from when a request of that direction is - * actually serviced. This should help devices with big TCQ windows - * and writeback caches - */ - if (ad->new_batch && ad->batch_data_dir == rq_is_sync(rq)) { - update_write_batch(ad); - ad->current_batch_expires = jiffies + - ad->batch_expire[BLK_RW_SYNC]; - ad->new_batch = 0; - } - - if (ad->io_context == RQ_IOC(rq) && ad->io_context) { - ad->antic_start = jiffies; - ad->ioc_finished = 1; - if (ad->antic_status == ANTIC_WAIT_REQ) { - /* - * We were waiting on this request, now anticipate - * the next one - */ - as_antic_waitnext(ad); - } - } - - as_put_io_context(rq); -out: - RQ_SET_STATE(rq, AS_RQ_POSTSCHED); -} - -/* - * as_remove_queued_request removes a request from the pre dispatch queue - * without updating refcounts. It is expected the caller will drop the - * reference unless it replaces the request at somepart of the elevator - * (ie. the dispatch queue) - */ -static void as_remove_queued_request(struct request_queue *q, - struct request *rq) -{ - const int data_dir = rq_is_sync(rq); - struct as_data *ad = q->elevator->elevator_data; - struct io_context *ioc; - - WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); - - ioc = RQ_IOC(rq); - if (ioc && ioc->aic) { - BUG_ON(!atomic_read(&ioc->aic->nr_queued)); - atomic_dec(&ioc->aic->nr_queued); - } - - /* - * Update the "next_rq" cache if we are about to remove its - * entry - */ - if (ad->next_rq[data_dir] == rq) - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); - - rq_fifo_clear(rq); - as_del_rq_rb(ad, rq); -} - -/* - * as_fifo_expired returns 0 if there are no expired requests on the fifo, - * 1 otherwise. It is ratelimited so that we only perform the check once per - * `fifo_expire' interval. Otherwise a large number of expired requests - * would create a hopeless seekstorm. - * - * See as_antic_expired comment. - */ -static int as_fifo_expired(struct as_data *ad, int adir) -{ - struct request *rq; - long delta_jif; - - delta_jif = jiffies - ad->last_check_fifo[adir]; - if (unlikely(delta_jif < 0)) - delta_jif = -delta_jif; - if (delta_jif < ad->fifo_expire[adir]) - return 0; - - ad->last_check_fifo[adir] = jiffies; - - if (list_empty(&ad->fifo_list[adir])) - return 0; - - rq = rq_entry_fifo(ad->fifo_list[adir].next); - - return time_after(jiffies, rq_fifo_time(rq)); -} - -/* - * as_batch_expired returns true if the current batch has expired. A batch - * is a set of reads or a set of writes. - */ -static inline int as_batch_expired(struct as_data *ad) -{ - if (ad->changed_batch || ad->new_batch) - return 0; - - if (ad->batch_data_dir == BLK_RW_SYNC) - /* TODO! add a check so a complete fifo gets written? */ - return time_after(jiffies, ad->current_batch_expires); - - return time_after(jiffies, ad->current_batch_expires) - || ad->current_write_count == 0; -} - -/* - * move an entry to dispatch queue - */ -static void as_move_to_dispatch(struct as_data *ad, struct request *rq) -{ - const int data_dir = rq_is_sync(rq); - - BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); - - as_antic_stop(ad); - ad->antic_status = ANTIC_OFF; - - /* - * This has to be set in order to be correctly updated by - * as_find_next_rq - */ - ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (data_dir == BLK_RW_SYNC) { - struct io_context *ioc = RQ_IOC(rq); - /* In case we have to anticipate after this */ - copy_io_context(&ad->io_context, &ioc); - } else { - if (ad->io_context) { - put_io_context(ad->io_context); - ad->io_context = NULL; - } - - if (ad->current_write_count != 0) - ad->current_write_count--; - } - ad->ioc_finished = 0; - - ad->next_rq[data_dir] = as_find_next_rq(ad, rq); - - /* - * take it off the sort and fifo list, add to dispatch queue - */ - as_remove_queued_request(ad->q, rq); - WARN_ON(RQ_STATE(rq) != AS_RQ_QUEUED); - - elv_dispatch_sort(ad->q, rq); - - RQ_SET_STATE(rq, AS_RQ_DISPATCHED); - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) - atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); - ad->nr_dispatched++; -} - -/* - * as_dispatch_request selects the best request according to - * read/write expire, batch expire, etc, and moves it to the dispatch - * queue. Returns 1 if a request was found, 0 otherwise. - */ -static int as_dispatch_request(struct request_queue *q, int force) -{ - struct as_data *ad = q->elevator->elevator_data; - const int reads = !list_empty(&ad->fifo_list[BLK_RW_SYNC]); - const int writes = !list_empty(&ad->fifo_list[BLK_RW_ASYNC]); - struct request *rq; - - if (unlikely(force)) { - /* - * Forced dispatch, accounting is useless. Reset - * accounting states and dump fifo_lists. Note that - * batch_data_dir is reset to BLK_RW_SYNC to avoid - * screwing write batch accounting as write batch - * accounting occurs on W->R transition. - */ - int dispatched = 0; - - ad->batch_data_dir = BLK_RW_SYNC; - ad->changed_batch = 0; - ad->new_batch = 0; - - while (ad->next_rq[BLK_RW_SYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_SYNC]); - dispatched++; - } - ad->last_check_fifo[BLK_RW_SYNC] = jiffies; - - while (ad->next_rq[BLK_RW_ASYNC]) { - as_move_to_dispatch(ad, ad->next_rq[BLK_RW_ASYNC]); - dispatched++; - } - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; - - return dispatched; - } - - /* Signal that the write batch was uncontended, so we can't time it */ - if (ad->batch_data_dir == BLK_RW_ASYNC && !reads) { - if (ad->current_write_count == 0 || !writes) - ad->write_batch_idled = 1; - } - - if (!(reads || writes) - || ad->antic_status == ANTIC_WAIT_REQ - || ad->antic_status == ANTIC_WAIT_NEXT - || ad->changed_batch) - return 0; - - if (!(reads && writes && as_batch_expired(ad))) { - /* - * batch is still running or no reads or no writes - */ - rq = ad->next_rq[ad->batch_data_dir]; - - if (ad->batch_data_dir == BLK_RW_SYNC && ad->antic_expire) { - if (as_fifo_expired(ad, BLK_RW_SYNC)) - goto fifo_expired; - - if (as_can_anticipate(ad, rq)) { - as_antic_waitreq(ad); - return 0; - } - } - - if (rq) { - /* we have a "next request" */ - if (reads && !writes) - ad->current_batch_expires = - jiffies + ad->batch_expire[BLK_RW_SYNC]; - goto dispatch_request; - } - } - - /* - * at this point we are not running a batch. select the appropriate - * data direction (read / write) - */ - - if (reads) { - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_SYNC])); - - if (writes && ad->batch_data_dir == BLK_RW_SYNC) - /* - * Last batch was a read, switch to writes - */ - goto dispatch_writes; - - if (ad->batch_data_dir == BLK_RW_ASYNC) { - WARN_ON(ad->new_batch); - ad->changed_batch = 1; - } - ad->batch_data_dir = BLK_RW_SYNC; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_SYNC].next); - ad->last_check_fifo[ad->batch_data_dir] = jiffies; - goto dispatch_request; - } - - /* - * the last batch was a read - */ - - if (writes) { -dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&ad->sort_list[BLK_RW_ASYNC])); - - if (ad->batch_data_dir == BLK_RW_SYNC) { - ad->changed_batch = 1; - - /* - * new_batch might be 1 when the queue runs out of - * reads. A subsequent submission of a write might - * cause a change of batch before the read is finished. - */ - ad->new_batch = 0; - } - ad->batch_data_dir = BLK_RW_ASYNC; - ad->current_write_count = ad->write_batch_count; - ad->write_batch_idled = 0; - rq = rq_entry_fifo(ad->fifo_list[BLK_RW_ASYNC].next); - ad->last_check_fifo[BLK_RW_ASYNC] = jiffies; - goto dispatch_request; - } - - BUG(); - return 0; - -dispatch_request: - /* - * If a request has expired, service it. - */ - - if (as_fifo_expired(ad, ad->batch_data_dir)) { -fifo_expired: - rq = rq_entry_fifo(ad->fifo_list[ad->batch_data_dir].next); - } - - if (ad->changed_batch) { - WARN_ON(ad->new_batch); - - if (ad->nr_dispatched) - return 0; - - if (ad->batch_data_dir == BLK_RW_ASYNC) - ad->current_batch_expires = jiffies + - ad->batch_expire[BLK_RW_ASYNC]; - else - ad->new_batch = 1; - - ad->changed_batch = 0; - } - - /* - * rq is the selected appropriate request. - */ - as_move_to_dispatch(ad, rq); - - return 1; -} - -/* - * add rq to rbtree and fifo - */ -static void as_add_request(struct request_queue *q, struct request *rq) -{ - struct as_data *ad = q->elevator->elevator_data; - int data_dir; - - RQ_SET_STATE(rq, AS_RQ_NEW); - - data_dir = rq_is_sync(rq); - - rq->elevator_private = as_get_io_context(q->node); - - if (RQ_IOC(rq)) { - as_update_iohist(ad, RQ_IOC(rq)->aic, rq); - atomic_inc(&RQ_IOC(rq)->aic->nr_queued); - } - - as_add_rq_rb(ad, rq); - - /* - * set expire time and add to fifo list - */ - rq_set_fifo_time(rq, jiffies + ad->fifo_expire[data_dir]); - list_add_tail(&rq->queuelist, &ad->fifo_list[data_dir]); - - as_update_rq(ad, rq); /* keep state machine up to date */ - RQ_SET_STATE(rq, AS_RQ_QUEUED); -} - -static void as_activate_request(struct request_queue *q, struct request *rq) -{ - WARN_ON(RQ_STATE(rq) != AS_RQ_DISPATCHED); - RQ_SET_STATE(rq, AS_RQ_REMOVED); - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) - atomic_dec(&RQ_IOC(rq)->aic->nr_dispatched); -} - -static void as_deactivate_request(struct request_queue *q, struct request *rq) -{ - WARN_ON(RQ_STATE(rq) != AS_RQ_REMOVED); - RQ_SET_STATE(rq, AS_RQ_DISPATCHED); - if (RQ_IOC(rq) && RQ_IOC(rq)->aic) - atomic_inc(&RQ_IOC(rq)->aic->nr_dispatched); -} - -/* - * as_queue_empty tells us if there are requests left in the device. It may - * not be the case that a driver can get the next request even if the queue - * is not empty - it is used in the block layer to check for plugging and - * merging opportunities - */ -static int as_queue_empty(struct request_queue *q) -{ - struct as_data *ad = q->elevator->elevator_data; - - return list_empty(&ad->fifo_list[BLK_RW_ASYNC]) - && list_empty(&ad->fifo_list[BLK_RW_SYNC]); -} - -static int -as_merge(struct request_queue *q, struct request **req, struct bio *bio) -{ - struct as_data *ad = q->elevator->elevator_data; - sector_t rb_key = bio->bi_sector + bio_sectors(bio); - struct request *__rq; - - /* - * check for front merge - */ - __rq = elv_rb_find(&ad->sort_list[bio_data_dir(bio)], rb_key); - if (__rq && elv_rq_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void as_merged_request(struct request_queue *q, struct request *req, - int type) -{ - struct as_data *ad = q->elevator->elevator_data; - - /* - * if the merge was a front merge, we need to reposition request - */ - if (type == ELEVATOR_FRONT_MERGE) { - as_del_rq_rb(ad, req); - as_add_rq_rb(ad, req); - /* - * Note! At this stage of this and the next function, our next - * request may not be optimal - eg the request may have "grown" - * behind the disk head. We currently don't bother adjusting. - */ - } -} - -static void as_merged_requests(struct request_queue *q, struct request *req, - struct request *next) -{ - /* - * if next expires before rq, assign its expire time to arq - * and move into next position (next will be deleted) in fifo - */ - if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before(rq_fifo_time(next), rq_fifo_time(req))) { - list_move(&req->queuelist, &next->queuelist); - rq_set_fifo_time(req, rq_fifo_time(next)); - } - } - - /* - * kill knowledge of next, this one is a goner - */ - as_remove_queued_request(q, next); - as_put_io_context(next); - - RQ_SET_STATE(next, AS_RQ_MERGED); -} - -/* - * This is executed in a "deferred" process context, by kblockd. It calls the - * driver's request_fn so the driver can submit that request. - * - * IMPORTANT! This guy will reenter the elevator, so set up all queue global - * state before calling, and don't rely on any state over calls. - * - * FIXME! dispatch queue is not a queue at all! - */ -static void as_work_handler(struct work_struct *work) -{ - struct as_data *ad = container_of(work, struct as_data, antic_work); - - blk_run_queue(ad->q); -} - -static int as_may_queue(struct request_queue *q, int rw) -{ - int ret = ELV_MQUEUE_MAY; - struct as_data *ad = q->elevator->elevator_data; - struct io_context *ioc; - if (ad->antic_status == ANTIC_WAIT_REQ || - ad->antic_status == ANTIC_WAIT_NEXT) { - ioc = as_get_io_context(q->node); - if (ad->io_context == ioc) - ret = ELV_MQUEUE_MUST; - put_io_context(ioc); - } - - return ret; -} - -static void as_exit_queue(struct elevator_queue *e) -{ - struct as_data *ad = e->elevator_data; - - del_timer_sync(&ad->antic_timer); - cancel_work_sync(&ad->antic_work); - - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_SYNC])); - BUG_ON(!list_empty(&ad->fifo_list[BLK_RW_ASYNC])); - - put_io_context(ad->io_context); - kfree(ad); -} - -/* - * initialize elevator private data (as_data). - */ -static void *as_init_queue(struct request_queue *q) -{ - struct as_data *ad; - - ad = kmalloc_node(sizeof(*ad), GFP_KERNEL | __GFP_ZERO, q->node); - if (!ad) - return NULL; - - ad->q = q; /* Identify what queue the data belongs to */ - - /* anticipatory scheduling helpers */ - ad->antic_timer.function = as_antic_timeout; - ad->antic_timer.data = (unsigned long)q; - init_timer(&ad->antic_timer); - INIT_WORK(&ad->antic_work, as_work_handler); - - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_SYNC]); - INIT_LIST_HEAD(&ad->fifo_list[BLK_RW_ASYNC]); - ad->sort_list[BLK_RW_SYNC] = RB_ROOT; - ad->sort_list[BLK_RW_ASYNC] = RB_ROOT; - ad->fifo_expire[BLK_RW_SYNC] = default_read_expire; - ad->fifo_expire[BLK_RW_ASYNC] = default_write_expire; - ad->antic_expire = default_antic_expire; - ad->batch_expire[BLK_RW_SYNC] = default_read_batch_expire; - ad->batch_expire[BLK_RW_ASYNC] = default_write_batch_expire; - - ad->current_batch_expires = jiffies + ad->batch_expire[BLK_RW_SYNC]; - ad->write_batch_count = ad->batch_expire[BLK_RW_ASYNC] / 10; - if (ad->write_batch_count < 2) - ad->write_batch_count = 2; - - return ad; -} - -/* - * sysfs parts below - */ - -static ssize_t -as_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static ssize_t -as_var_store(unsigned long *var, const char *page, size_t count) -{ - char *p = (char *) page; - - *var = simple_strtoul(p, &p, 10); - return count; -} - -static ssize_t est_time_show(struct elevator_queue *e, char *page) -{ - struct as_data *ad = e->elevator_data; - int pos = 0; - - pos += sprintf(page+pos, "%lu %% exit probability\n", - 100*ad->exit_prob/256); - pos += sprintf(page+pos, "%lu %% probability of exiting without a " - "cooperating process submitting IO\n", - 100*ad->exit_no_coop/256); - pos += sprintf(page+pos, "%lu ms new thinktime\n", ad->new_ttime_mean); - pos += sprintf(page+pos, "%llu sectors new seek distance\n", - (unsigned long long)ad->new_seek_mean); - - return pos; -} - -#define SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct as_data *ad = e->elevator_data; \ - return as_var_show(jiffies_to_msecs((__VAR)), (page)); \ -} -SHOW_FUNCTION(as_read_expire_show, ad->fifo_expire[BLK_RW_SYNC]); -SHOW_FUNCTION(as_write_expire_show, ad->fifo_expire[BLK_RW_ASYNC]); -SHOW_FUNCTION(as_antic_expire_show, ad->antic_expire); -SHOW_FUNCTION(as_read_batch_expire_show, ad->batch_expire[BLK_RW_SYNC]); -SHOW_FUNCTION(as_write_batch_expire_show, ad->batch_expire[BLK_RW_ASYNC]); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct as_data *ad = e->elevator_data; \ - int ret = as_var_store(__PTR, (page), count); \ - if (*(__PTR) < (MIN)) \ - *(__PTR) = (MIN); \ - else if (*(__PTR) > (MAX)) \ - *(__PTR) = (MAX); \ - *(__PTR) = msecs_to_jiffies(*(__PTR)); \ - return ret; \ -} -STORE_FUNCTION(as_read_expire_store, &ad->fifo_expire[BLK_RW_SYNC], 0, INT_MAX); -STORE_FUNCTION(as_write_expire_store, - &ad->fifo_expire[BLK_RW_ASYNC], 0, INT_MAX); -STORE_FUNCTION(as_antic_expire_store, &ad->antic_expire, 0, INT_MAX); -STORE_FUNCTION(as_read_batch_expire_store, - &ad->batch_expire[BLK_RW_SYNC], 0, INT_MAX); -STORE_FUNCTION(as_write_batch_expire_store, - &ad->batch_expire[BLK_RW_ASYNC], 0, INT_MAX); -#undef STORE_FUNCTION - -#define AS_ATTR(name) \ - __ATTR(name, S_IRUGO|S_IWUSR, as_##name##_show, as_##name##_store) - -static struct elv_fs_entry as_attrs[] = { - __ATTR_RO(est_time), - AS_ATTR(read_expire), - AS_ATTR(write_expire), - AS_ATTR(antic_expire), - AS_ATTR(read_batch_expire), - AS_ATTR(write_batch_expire), - __ATTR_NULL -}; - -static struct elevator_type iosched_as = { - .ops = { - .elevator_merge_fn = as_merge, - .elevator_merged_fn = as_merged_request, - .elevator_merge_req_fn = as_merged_requests, - .elevator_dispatch_fn = as_dispatch_request, - .elevator_add_req_fn = as_add_request, - .elevator_activate_req_fn = as_activate_request, - .elevator_deactivate_req_fn = as_deactivate_request, - .elevator_queue_empty_fn = as_queue_empty, - .elevator_completed_req_fn = as_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_may_queue_fn = as_may_queue, - .elevator_init_fn = as_init_queue, - .elevator_exit_fn = as_exit_queue, - .trim = as_trim, - }, - - .elevator_attrs = as_attrs, - .elevator_name = "anticipatory", - .elevator_owner = THIS_MODULE, -}; - -static int __init as_init(void) -{ - elv_register(&iosched_as); - - return 0; -} - -static void __exit as_exit(void) -{ - DECLARE_COMPLETION_ONSTACK(all_gone); - elv_unregister(&iosched_as); - ioc_gone = &all_gone; - /* ioc_gone's update must be visible before reading ioc_count */ - smp_wmb(); - if (elv_ioc_count_read(as_ioc_count)) - wait_for_completion(&all_gone); - synchronize_rcu(); -} - -module_init(as_init); -module_exit(as_exit); - -MODULE_AUTHOR("Nick Piggin"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("anticipatory IO scheduler"); diff --git a/block/elevator.c b/block/elevator.c index 1975b619c86..bb30f0e92d4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -154,10 +154,7 @@ static struct elevator_type *elevator_get(const char *name) spin_unlock(&elv_list_lock); - if (!strcmp(name, "anticipatory")) - sprintf(elv, "as-iosched"); - else - sprintf(elv, "%s-iosched", name); + sprintf(elv, "%s-iosched", name); request_module("%s", elv); spin_lock(&elv_list_lock); @@ -193,10 +190,7 @@ static int __init elevator_setup(char *str) * Be backwards-compatible with previous kernels, so users * won't get the wrong elevator. */ - if (!strcmp(str, "as")) - strcpy(chosen_elevator, "anticipatory"); - else - strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); + strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); return 1; } From 08dc8726d4be85bca793141c827574fd32a681bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 3 Oct 2009 09:40:47 +0200 Subject: [PATCH 008/113] block: CFQ is more than a desktop scheduler Update Kconfig.iosched entry. Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index baad3dae365..8bd105115a6 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -27,7 +27,9 @@ config IOSCHED_CFQ ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally among all processes in the system. It should provide a fair - working environment, suitable for desktop systems. + and low latency working environment, suitable for both desktop + and server systems. + This is the default I/O scheduler. choice From 25d2d4edfa509b69fe4832094b8a07e634363ba3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 5 Oct 2009 09:31:59 +0200 Subject: [PATCH 009/113] drbd: fixup for reverted dual in_flight patch Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3678d3d66c6..d3426ff405b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -40,7 +40,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_stat_unlock(); - mdev->vdisk->part0.in_flight[rw]++; + mdev->vdisk->part0.in_flight++; } /* Update disk stats when completing request upwards */ @@ -53,7 +53,7 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); part_round_stats(cpu, &mdev->vdisk->part0); part_stat_unlock(); - mdev->vdisk->part0.in_flight[rw]--; + mdev->vdisk->part0.in_flight--; } static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) From 9f5180e5c331d7b3ccc35e1a78072235d38f9f34 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 6 Oct 2009 09:30:14 +0200 Subject: [PATCH 010/113] drbd: Work on permission enforcement Now we have the capabilities of the sending process available, use them to enforce CAP_SYS_ADMIN. Signed-off-by: Philipp Reisner Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 7 ++++++- include/linux/drbd.h | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 73c55ccb629..22538d9628f 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2000,7 +2000,7 @@ static struct cn_handler_struct cnd_table[] = { [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 }, }; -static void drbd_connector_callback(struct cn_msg *req) +static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp) { struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data; struct cn_handler_struct *cm; @@ -2017,6 +2017,11 @@ static void drbd_connector_callback(struct cn_msg *req) return; } + if (!cap_raised(nsp->eff_cap, CAP_SYS_ADMIN)) { + retcode = ERR_PERM; + goto fail; + } + mdev = ensure_mdev(nlp); if (!mdev) { retcode = ERR_MINOR_INVALID; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 69dc711f37b..233db5c18b8 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -138,6 +138,7 @@ enum drbd_ret_codes { ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */ ERR_DATA_NOT_CURRENT = 150, ERR_CONNECTED = 151, /* DRBD 8.3 only */ + ERR_PERM = 152, /* insert new ones above this line */ AFTER_LAST_ERR_CODE From 132cc538cd90f60a0b5df6a512dfd4bc5fe2039a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 7 Oct 2009 19:26:00 +0200 Subject: [PATCH 011/113] drbd: needs __ratelimit() drbd_int.h uses __ratelimit(), so it needs to #include ratelimit.h: drivers/block/drbd/drbd_int.h:1765: error: implicit declaration of function '__ratelimit' Signed-off-by: Randy Dunlap Cc: drbd-dev@lists.linbit.com Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_int.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 4e6255991e5..2312d782fe9 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include From b2c18e1e08a5a9663094d57bb4be2f02226ee61c Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:49 -0400 Subject: [PATCH 012/113] cfq: calculate the seek_mean per cfq_queue not per cfq_io_context async cfq_queue's are already shared between processes within the same priority, and forthcoming patches will change the mapping of cic to sync cfq_queue from 1:1 to 1:N. So, calculate the seekiness of a process based on the cfq_queue instead of the cfq_io_context. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 68 +++++++++++++++++++-------------------- include/linux/iocontext.h | 5 --- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 069a61017c0..78cc8ee5da4 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -112,6 +112,11 @@ struct cfq_queue { unsigned short ioprio, org_ioprio; unsigned short ioprio_class, org_ioprio_class; + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + pid_t pid; }; @@ -962,16 +967,16 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - blk_rq_pos(rq); } -#define CIC_SEEK_THR 8 * 1024 -#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) +#define CFQQ_SEEK_THR 8 * 1024 +#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) -static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) +static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct request *rq) { - struct cfq_io_context *cic = cfqd->active_cic; - sector_t sdist = cic->seek_mean; + sector_t sdist = cfqq->seek_mean; - if (!sample_valid(cic->seek_samples)) - sdist = CIC_SEEK_THR; + if (!sample_valid(cfqq->seek_samples)) + sdist = CFQQ_SEEK_THR; return cfq_dist_from_last(cfqd, rq) <= sdist; } @@ -1000,7 +1005,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * will contain the closest sector. */ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1011,7 +1016,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; return NULL; @@ -1033,13 +1038,6 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; - /* - * A valid cfq_io_context is necessary to compare requests against - * the seek_mean of the current cfqq. - */ - if (!cfqd->active_cic) - return NULL; - /* * We should notice if some of the queues are cooperating, eg * working closely on the same area of the disk. In that case, @@ -1110,7 +1108,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) * seeks. so allow a little bit of time for him to submit a new rq */ sl = cfqd->cfq_slice_idle; - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) + if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); mod_timer(&cfqd->idle_slice_timer, jiffies + sl); @@ -1947,33 +1945,33 @@ cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) } static void -cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, +cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { sector_t sdist; u64 total; - if (!cic->last_request_pos) + if (!cfqq->last_request_pos) sdist = 0; - else if (cic->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cic->last_request_pos; + else if (cfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - cfqq->last_request_pos; else - sdist = cic->last_request_pos - blk_rq_pos(rq); + sdist = cfqq->last_request_pos - blk_rq_pos(rq); /* * Don't allow the seek distance to get too large from the * odd fragment, pagein, etc */ - if (cic->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); + if (cfqq->seek_samples <= 60) /* second&third seek */ + sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*1024); else - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); + sdist = min(sdist, (cfqq->seek_mean * 4) + 2*1024*64); - cic->seek_samples = (7*cic->seek_samples + 256) / 8; - cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; - total = cic->seek_total + (cic->seek_samples/2); - do_div(total, cic->seek_samples); - cic->seek_mean = (sector_t)total; + cfqq->seek_samples = (7*cfqq->seek_samples + 256) / 8; + cfqq->seek_total = (7*cfqq->seek_total + (u64)256*sdist) / 8; + total = cfqq->seek_total + (cfqq->seek_samples/2); + do_div(total, cfqq->seek_samples); + cfqq->seek_mean = (sector_t)total; } /* @@ -1995,11 +1993,11 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) + (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { unsigned int slice_idle = cfqd->cfq_slice_idle; - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) + if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) slice_idle = msecs_to_jiffies(CFQ_MIN_TT); if (cic->ttime_mean > slice_idle) enable_idle = 0; @@ -2066,7 +2064,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, rq)) + if (cfq_rq_close(cfqd, cfqq, rq)) return true; return false; @@ -2108,10 +2106,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->meta_pending++; cfq_update_io_thinktime(cfqd, cic); - cfq_update_io_seektime(cfqd, cic, rq); + cfq_update_io_seektime(cfqd, cfqq, rq); cfq_update_idle_window(cfqd, cfqq, cic); - cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { /* diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 4da4a75c3f1..eb73632440f 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -40,16 +40,11 @@ struct cfq_io_context { struct io_context *ioc; unsigned long last_end_request; - sector_t last_request_pos; unsigned long ttime_total; unsigned long ttime_samples; unsigned long ttime_mean; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - struct list_head queue_list; struct hlist_node cic_list; From df5fe3e8e13883f58dc97489076bbcc150789a21 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:50 -0400 Subject: [PATCH 013/113] cfq: merge cooperating cfq_queues When cooperating cfq_queues are detected currently, they are allowed to skip ahead in the scheduling order. It is much more efficient to automatically share the cfq_queue data structure between cooperating processes. Performance of the read-test2 benchmark (which is written to emulate the dump(8) utility) went from 12MB/s to 90MB/s on my SATA disk. NFS servers with multiple nfsd threads also saw performance increases. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 89 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 87 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 78cc8ee5da4..f0994aedb39 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -118,6 +118,8 @@ struct cfq_queue { sector_t last_request_pos; pid_t pid; + + struct cfq_queue *new_cfqq; }; /* @@ -1047,6 +1049,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfqq) return NULL; + /* + * It only makes sense to merge sync queues. + */ + if (!cfq_cfqq_sync(cfqq)) + return NULL; + if (cfq_cfqq_coop(cfqq)) return NULL; @@ -1167,6 +1175,43 @@ cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); } +/* + * Must be called with the queue_lock held. + */ +static int cfqq_process_refs(struct cfq_queue *cfqq) +{ + int process_refs, io_refs; + + io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; + process_refs = atomic_read(&cfqq->ref) - io_refs; + BUG_ON(process_refs < 0); + return process_refs; +} + +static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) +{ + int process_refs; + struct cfq_queue *__cfqq; + + /* Avoid a circular list and skip interim queue merges */ + while ((__cfqq = new_cfqq->new_cfqq)) { + if (__cfqq == cfqq) + return; + new_cfqq = __cfqq; + } + + process_refs = cfqq_process_refs(cfqq); + /* + * If the process for the cfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0) + return; + + cfqq->new_cfqq = new_cfqq; + atomic_add(process_refs, &new_cfqq->ref); +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. @@ -1196,11 +1241,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * If another queue has a request waiting within our mean seek * distance, let it run. The expire code will check for close * cooperators and put the close queue at the front of the service - * tree. + * tree. If possible, merge the expiring queue with the new cfqq. */ new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); - if (new_cfqq) + if (new_cfqq) { + if (!cfqq->new_cfqq) + cfq_setup_merge(cfqq, new_cfqq); goto expire; + } /* * No requests pending. If the active queue still has requests in @@ -1511,11 +1559,29 @@ static void cfq_free_io_context(struct io_context *ioc) static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *__cfqq, *next; + if (unlikely(cfqq == cfqd->active_queue)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. + */ + __cfqq = cfqq->new_cfqq; + while (__cfqq) { + if (__cfqq == cfqq) { + WARN(1, "cfqq->new_cfqq loop detected\n"); + break; + } + next = __cfqq->new_cfqq; + cfq_put_queue(__cfqq); + __cfqq = next; + } + cfq_put_queue(cfqq); } @@ -2323,6 +2389,16 @@ static void cfq_put_request(struct request *rq) } } +static struct cfq_queue * +cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, + struct cfq_queue *cfqq) +{ + cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); + cic_set_cfqq(cic, cfqq->new_cfqq, 1); + cfq_put_queue(cfqq); + return cic_to_cfqq(cic, 1); +} + /* * Allocate cfq data structures associated with this request. */ @@ -2349,6 +2425,15 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (!cfqq || cfqq == &cfqd->oom_cfqq) { cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); + } else { + /* + * Check to see if this queue is scheduled to merge with + * another, closely cooperating queue. The merging of + * queues happens here as it must be done in process context. + * The reference on new_cfqq was taken in merge_cfqqs. + */ + if (cfqq->new_cfqq) + cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); } cfqq->allocated[rw]++; From b3b6d0408c953524f979468562e7e210d8634150 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:51 -0400 Subject: [PATCH 014/113] cfq: change the meaning of the cfqq_coop flag The flag used to indicate that a cfqq was allowed to jump ahead in the scheduling order due to submitting a request close to the queue that just executed. Since closely cooperating queues are now merged, the flag holds little meaning. Change it to indicate that multiple queues were merged. This will later be used to allow the breaking up of merged queues when they are no longer cooperating. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f0994aedb39..5e01a0a92c0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -202,7 +202,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ - CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ + CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ }; #define CFQ_CFQQ_FNS(name) \ @@ -950,11 +950,8 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (!cfqq) { + if (!cfqq) cfqq = cfq_get_next_queue(cfqd); - if (cfqq) - cfq_clear_cfqq_coop(cfqq); - } __cfq_set_active_queue(cfqd, cfqq); return cfqq; @@ -1035,8 +1032,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, * assumption. */ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq, - bool probe) + struct cfq_queue *cur_cfqq) { struct cfq_queue *cfqq; @@ -1055,11 +1051,6 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfq_cfqq_sync(cfqq)) return NULL; - if (cfq_cfqq_coop(cfqq)) - return NULL; - - if (!probe) - cfq_mark_cfqq_coop(cfqq); return cfqq; } @@ -1243,7 +1234,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * cooperators and put the close queue at the front of the service * tree. If possible, merge the expiring queue with the new cfqq. */ - new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); + new_cfqq = cfq_close_cooperator(cfqd, cfqq); if (new_cfqq) { if (!cfqq->new_cfqq) cfq_setup_merge(cfqq, new_cfqq); @@ -2294,7 +2285,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && + else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) && sync && !rq_noidle(rq)) cfq_arm_slice_timer(cfqd); } @@ -2395,6 +2386,7 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, { cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); cic_set_cfqq(cic, cfqq->new_cfqq, 1); + cfq_mark_cfqq_coop(cfqq->new_cfqq); cfq_put_queue(cfqq); return cic_to_cfqq(cic, 1); } From e6c5bc737ab71e4af6025ef7d150f5a26ae5f146 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:52 -0400 Subject: [PATCH 015/113] cfq: break apart merged cfqqs if they stop cooperating cfq_queues are merged if they are issuing requests within the mean seek distance of one another. This patch detects when the coopearting stops and breaks the queues back up. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 79 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5e01a0a92c0..47d6aaca0c5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -38,6 +38,12 @@ static int cfq_slice_idle = HZ / 125; */ #define CFQ_MIN_TT (2) +/* + * Allow merged cfqqs to perform this amount of seeky I/O before + * deciding to break the queues up again. + */ +#define CFQQ_COOP_TOUT (HZ) + #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) @@ -116,6 +122,7 @@ struct cfq_queue { u64 seek_total; sector_t seek_mean; sector_t last_request_pos; + unsigned long seeky_start; pid_t pid; @@ -1036,6 +1043,11 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, { struct cfq_queue *cfqq; + if (!cfq_cfqq_sync(cur_cfqq)) + return NULL; + if (CFQQ_SEEKY(cur_cfqq)) + return NULL; + /* * We should notice if some of the queues are cooperating, eg * working closely on the same area of the disk. In that case, @@ -1050,6 +1062,8 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, */ if (!cfq_cfqq_sync(cfqq)) return NULL; + if (CFQQ_SEEKY(cfqq)) + return NULL; return cfqq; } @@ -1181,7 +1195,7 @@ static int cfqq_process_refs(struct cfq_queue *cfqq) static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) { - int process_refs; + int process_refs, new_process_refs; struct cfq_queue *__cfqq; /* Avoid a circular list and skip interim queue merges */ @@ -1199,8 +1213,17 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) if (process_refs == 0) return; - cfqq->new_cfqq = new_cfqq; - atomic_add(process_refs, &new_cfqq->ref); + /* + * Merge in the direction of the lesser amount of work. + */ + new_process_refs = cfqq_process_refs(new_cfqq); + if (new_process_refs >= process_refs) { + cfqq->new_cfqq = new_cfqq; + atomic_add(process_refs, &new_cfqq->ref); + } else { + new_cfqq->new_cfqq = cfqq; + atomic_add(new_process_refs, &cfqq->ref); + } } /* @@ -2029,6 +2052,19 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, total = cfqq->seek_total + (cfqq->seek_samples/2); do_div(total, cfqq->seek_samples); cfqq->seek_mean = (sector_t)total; + + /* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. + */ + if (cfq_cfqq_coop(cfqq)) { + if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) + cfqq->seeky_start = jiffies; + else if (!CFQQ_SEEKY(cfqq)) + cfqq->seeky_start = 0; + } } /* @@ -2391,6 +2427,32 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, return cic_to_cfqq(cic, 1); } +static int should_split_cfqq(struct cfq_queue *cfqq) +{ + if (cfqq->seeky_start && + time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) + return 1; + return 0; +} + +/* + * Returns NULL if a new cfqq should be allocated, or the old cfqq if this + * was the last process referring to said cfqq. + */ +static struct cfq_queue * +split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) +{ + if (cfqq_process_refs(cfqq) == 1) { + cfqq->seeky_start = 0; + cfqq->pid = current->pid; + cfq_clear_cfqq_coop(cfqq); + return cfqq; + } + + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(cfqq); + return NULL; +} /* * Allocate cfq data structures associated with this request. */ @@ -2413,11 +2475,22 @@ cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) if (!cic) goto queue_fail; +new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); } else { + /* + * If the queue was seeky for too long, break it apart. + */ + if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { + cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); + cfqq = split_cfqq(cic, cfqq); + if (!cfqq) + goto new_queue; + } + /* * Check to see if this queue is scheduled to merge with * another, closely cooperating queue. The merging of From 1a1238a7dd48e48b3bba8f426a1d61c22c80d6d1 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 27 Oct 2009 08:46:23 +0100 Subject: [PATCH 016/113] cfq-iosched: improve hw_tag detection If active queue hasn't enough requests and idle window opens, cfq will not dispatch sufficient requests to hardware. In such situation, current code will zero hw_tag. But this is because cfq doesn't dispatch enough requests instead of hardware queue doesn't work. Don't zero hw_tag in such case. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 47d6aaca0c5..418da9a49bb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2257,6 +2257,8 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { + struct cfq_queue *cfqq = cfqd->active_queue; + if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) cfqd->rq_in_driver_peak = rq_in_driver(cfqd); @@ -2264,6 +2266,16 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) return; + /* + * If active queue hasn't enough requests and can idle, cfq might not + * dispatch sufficient requests to hardware. Don't zero hw_tag in this + * case + */ + if (cfqq && cfq_cfqq_idle_window(cfqq) && + cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < + CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) < CFQ_HW_QUEUE_MIN) + return; + if (cfqd->hw_tag_samples++ < 50) return; From 5db5d64277bf390056b1a87d0bb288c8b8553f96 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:44:04 +0100 Subject: [PATCH 017/113] cfq-iosched: adapt slice to number of processes doing I/O When the number of processes performing I/O concurrently increases, a fixed time slice per process will cause large latencies. This patch, if low_latency mode is enabled, will scale the time slice assigned to each process according to a 300ms target latency. In order to keep fairness among processes: * The number of active processes is computed using a special form of running average, that quickly follows sudden increases (to keep latency low), and decrease slowly (to have fairness in spite of rapid decreases of this value). To safeguard sequential bandwidth, we impose a minimum time slice (computed using 2*cfq_slice_idle as base, adjusted according to priority and async-ness). Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 53 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 418da9a49bb..97d946585bc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -27,6 +27,8 @@ static const int cfq_slice_sync = HZ / 10; static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ +static const int cfq_hist_divisor = 4; /* * offset from end of service tree @@ -148,6 +150,8 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; + unsigned int busy_rt_queues; + unsigned int busy_queues_avg[2]; int rq_in_driver[2]; int sync_flight; @@ -315,10 +319,52 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +/* + * get averaged number of queues of RT/BE priority. + * average is updated, with a formula that gives more weight to higher numbers, + * to quickly follows sudden increases and decrease slowly + */ + +static inline unsigned +cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) { + unsigned min_q, max_q; + unsigned mult = cfq_hist_divisor - 1; + unsigned round = cfq_hist_divisor / 2; + unsigned busy = cfqd->busy_rt_queues; + + if (!rt) + busy = cfqd->busy_queues - cfqd->busy_rt_queues; + + min_q = min(cfqd->busy_queues_avg[rt], busy); + max_q = max(cfqd->busy_queues_avg[rt], busy); + cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + cfq_hist_divisor; + return cfqd->busy_queues_avg[rt]; +} + static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; + unsigned slice = cfq_prio_to_slice(cfqd, cfqq); + if (cfqd->cfq_latency) { + /* interested queues (we consider only the ones with the same + * priority class) */ + unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq)); + unsigned sync_slice = cfqd->cfq_slice[1]; + unsigned expect_latency = sync_slice * iq; + if (expect_latency > cfq_target_latency) { + unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; + /* scale low_slice according to IO priority + * and sync vs async */ + unsigned low_slice = + min(slice, base_low_slice * slice / sync_slice); + /* the adapted slice value is scaled to fit all iqs + * into the target latency */ + slice = max(slice * cfq_target_latency / expect_latency, + low_slice); + } + } + cfqq->slice_end = jiffies + slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -669,7 +715,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -692,6 +739,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues--; } /* From aa6f6a3de18131348f70951efb2c56d806033e09 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:44:33 +0100 Subject: [PATCH 018/113] cfq-iosched: preparation to handle multiple service trees We embed a pointer to the service tree in each queue, to handle multiple service trees easily. Service trees are enriched with a counter. cfq_add_rq_rb is invoked after putting the rq in the fifo, to ensure that all fields in rq are properly initialized. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 97d946585bc..c95c69e199f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -75,8 +75,9 @@ static DEFINE_SPINLOCK(ioc_gone_lock); struct cfq_rb_root { struct rb_root rb; struct rb_node *left; + unsigned count; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, } /* * Per process-grouping structure @@ -128,6 +129,7 @@ struct cfq_queue { pid_t pid; + struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; }; @@ -503,6 +505,7 @@ static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) if (root->left == n) root->left = NULL; rb_erase_init(n, &root->rb); + --root->count; } /* @@ -553,11 +556,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; + struct cfq_rb_root *service_tree = &cfqd->service_tree; int left; if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); + parent = rb_last(&service_tree->rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -575,7 +579,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->slice_resid = 0; } else { rb_key = -HZ; - __cfqq = cfq_rb_first(&cfqd->service_tree); + __cfqq = cfq_rb_first(service_tree); rb_key += __cfqq ? __cfqq->rb_key : jiffies; } @@ -586,12 +590,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (rb_key == cfqq->rb_key) return; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; } left = 1; parent = NULL; - p = &cfqd->service_tree.rb.rb_node; + cfqq->service_tree = service_tree; + p = &service_tree->rb.rb_node; while (*p) { struct rb_node **n; @@ -623,11 +629,12 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, } if (left) - cfqd->service_tree.left = &cfqq->rb_node; + service_tree->left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + rb_insert_color(&cfqq->rb_node, &service_tree->rb); + service_tree->count++; } static struct cfq_queue * @@ -730,8 +737,10 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; + } if (cfqq->p_root) { rb_erase(&cfqq->p_node, cfqq->p_root); cfqq->p_root = NULL; @@ -2292,10 +2301,9 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq) cfq_log_cfqq(cfqd, cfqq, "insert_request"); cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); - cfq_add_rq_rb(rq); - rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); + cfq_add_rq_rb(rq); cfq_rq_enqueued(cfqd, cfqq, rq); } From c0324a020e5b351f100569b128715985f1023af8 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Tue, 27 Oct 2009 19:16:03 +0100 Subject: [PATCH 019/113] cfq-iosched: reimplement priorities using different service trees We use different service trees for different priority classes. This allows a simplification in the service tree insertion code, that no longer has to consider priority while walking the tree. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 116 +++++++++++++++++++++++++++++++------------- 1 file changed, 82 insertions(+), 34 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c95c69e199f..6e5c3d715eb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -133,6 +133,16 @@ struct cfq_queue { struct cfq_queue *new_cfqq; }; +/* + * Index in the service_trees. + * IDLE is handled separately, so it has negative index + */ +enum wl_prio_t { + IDLE_WORKLOAD = -1, + BE_WORKLOAD = 0, + RT_WORKLOAD = 1 +}; + /* * Per block device queue structure */ @@ -140,9 +150,15 @@ struct cfq_data { struct request_queue *queue; /* - * rr list of queues with requests and the count of them + * rr lists of queues with requests, onle rr for each priority class. + * Counts are embedded in the cfq_rb_root */ - struct cfq_rb_root service_tree; + struct cfq_rb_root service_trees[2]; + struct cfq_rb_root service_tree_idle; + /* + * The priority currently being served + */ + enum wl_prio_t serving_prio; /* * Each priority tree is sorted by next_request position. These @@ -152,7 +168,6 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - unsigned int busy_rt_queues; unsigned int busy_queues_avg[2]; int rq_in_driver[2]; @@ -205,6 +220,15 @@ struct cfq_data { unsigned long last_end_sync_rq; }; +static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio, + struct cfq_data *cfqd) +{ + if (prio == IDLE_WORKLOAD) + return &cfqd->service_tree_idle; + + return &cfqd->service_trees[prio]; +} + enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ @@ -249,6 +273,23 @@ CFQ_CFQQ_FNS(coop); #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) +static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +{ + if (cfq_class_idle(cfqq)) + return IDLE_WORKLOAD; + if (cfq_class_rt(cfqq)) + return RT_WORKLOAD; + return BE_WORKLOAD; +} + +static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) +{ + if (wl == IDLE_WORKLOAD) + return cfqd->service_tree_idle.count; + + return cfqd->service_trees[wl].count; +} + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); @@ -332,10 +373,7 @@ cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) { unsigned min_q, max_q; unsigned mult = cfq_hist_divisor - 1; unsigned round = cfq_hist_divisor / 2; - unsigned busy = cfqd->busy_rt_queues; - - if (!rt) - busy = cfqd->busy_queues - cfqd->busy_rt_queues; + unsigned busy = cfq_busy_queues_wl(rt, cfqd); min_q = min(cfqd->busy_queues_avg[rt], busy); max_q = max(cfqd->busy_queues_avg[rt], busy); @@ -546,7 +584,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, } /* - * The cfqd->service_tree holds all pending cfq_queue's that have + * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. It is sorted in the order that * we will service the queues. */ @@ -556,9 +594,10 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; - struct cfq_rb_root *service_tree = &cfqd->service_tree; + struct cfq_rb_root *service_tree; int left; + service_tree = service_tree_for(cfqq_prio(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -587,7 +626,8 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * same position, nothing more to do */ - if (rb_key == cfqq->rb_key) + if (rb_key == cfqq->rb_key && + cfqq->service_tree == service_tree) return; cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); @@ -605,25 +645,14 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, __cfqq = rb_entry(parent, struct cfq_queue, rb_node); /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. + * sort by key, that represents service time. */ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) + if (time_before(rb_key, __cfqq->rb_key)) n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) + else { n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (time_before(rb_key, __cfqq->rb_key)) - n = &(*p)->rb_left; - else - n = &(*p)->rb_right; - - if (n == &(*p)->rb_right) left = 0; + } p = n; } @@ -722,8 +751,7 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues++; + cfq_resort_rr_list(cfqd, cfqq); } @@ -748,8 +776,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; - if (cfq_class_rt(cfqq)) - cfqd->busy_rt_queues--; } /* @@ -1003,10 +1029,12 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) - return NULL; + struct cfq_rb_root *service_tree = + service_tree_for(cfqd->serving_prio, cfqd); - return cfq_rb_first(&cfqd->service_tree); + if (RB_EMPTY_ROOT(&service_tree->rb)) + return NULL; + return cfq_rb_first(service_tree); } /* @@ -1123,6 +1151,12 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (CFQQ_SEEKY(cfqq)) return NULL; + /* + * Do not merge queues of different priority classes + */ + if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) + return NULL; + return cfqq; } @@ -1336,6 +1370,14 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) expire: cfq_slice_expired(cfqd, 0); new_queue: + if (!new_cfqq) { + if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) + cfqd->serving_prio = RT_WORKLOAD; + else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) + cfqd->serving_prio = BE_WORKLOAD; + else + cfqd->serving_prio = IDLE_WORKLOAD; + } cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: return cfqq; @@ -1362,8 +1404,12 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) { struct cfq_queue *cfqq; int dispatched = 0; + int i; + for (i = 0; i < 2; ++i) + while ((cfqq = cfq_rb_first(&cfqd->service_trees[i])) != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) + while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); @@ -2710,7 +2756,9 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; - cfqd->service_tree = CFQ_RB_ROOT; + for (i = 0; i < 2; ++i) + cfqd->service_trees[i] = CFQ_RB_ROOT; + cfqd->service_tree_idle = CFQ_RB_ROOT; /* * Not strictly needed (since RB_ROOT just clears the node and we From a6d44e982d3734583b3b4e1d36921af8cfd61fc0 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:45:11 +0100 Subject: [PATCH 020/113] cfq-iosched: enable idling for last queue on priority class cfq can disable idling for queues in various circumstances. When workloads of different priorities are competing, if the higher priority queue has idling disabled, lower priority queues may steal its disk share. For example, in a scenario with an RT process performing seeky reads vs a BE process performing sequential reads, on an NCQ enabled hardware, with low_latency unset, the RT process will dispatch only the few pending requests every full slice of service for the BE process. The patch solves this issue by always performing idle on the last queue at a given priority class > idle. If the same process, or one that can pre-empt it (so at the same priority or higher), submits a new request within the idle window, the lower priority queue won't dispatch, saving the disk bandwidth for higher priority ones. Note: this doesn't touch the non_rotational + NCQ case (no hardware to test if this is a benefit in that case). Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 6e5c3d715eb..76afa369689 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1160,6 +1160,34 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, return cfqq; } +/* + * Determine whether we should enforce idle window for this queue. + */ + +static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + enum wl_prio_t prio = cfqq_prio(cfqq); + struct cfq_rb_root *service_tree; + + /* We never do for idle class queues. */ + if (prio == IDLE_WORKLOAD) + return false; + + /* We do for queues that were marked with idle window flag. */ + if (cfq_cfqq_idle_window(cfqq)) + return true; + + /* + * Otherwise, we do only if they are the last ones + * in their service tree. + */ + service_tree = service_tree_for(prio, cfqd); + if (service_tree->count == 0) + return true; + + return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq); +} + static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; @@ -1180,7 +1208,7 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) + if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) return; /* @@ -1362,7 +1390,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) * conditions to happen (or time out) before selecting a new queue. */ if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { + (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { cfqq = NULL; goto keep_queue; } @@ -1427,7 +1455,7 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) /* * Drain async requests before we start sync IO */ - if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) return false; /* From 718eee0579b802aabe3bafacf09d0a9b0830f1dd Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 26 Oct 2009 22:45:29 +0100 Subject: [PATCH 021/113] cfq-iosched: fairness for sync no-idle queues Currently no-idle queues in cfq are not serviced fairly: even if they can only dispatch a small number of requests at a time, they have to compete with idling queues to be serviced, experiencing large latencies. We should notice, instead, that no-idle queues are the ones that would benefit most from having low latency, in fact they are any of: * processes with large think times (e.g. interactive ones like file managers) * seeky (e.g. programs faulting in their code at startup) * or marked as no-idle from upper levels, to improve latencies of those requests. This patch improves the fairness and latency for those queues, by: * separating sync idle, sync no-idle and async queues in separate service_trees, for each priority * service all no-idle queues together * and idling when the last no-idle queue has been serviced, to anticipate for more no-idle work * the timeslices allotted for idle and no-idle service_trees are computed proportionally to the number of processes in each set. Servicing all no-idle queues together should have a performance boost for NCQ-capable drives, without compromising fairness. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 200 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 168 insertions(+), 32 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 76afa369689..859f534ae9e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -134,7 +134,7 @@ struct cfq_queue { }; /* - * Index in the service_trees. + * First index in the service_trees. * IDLE is handled separately, so it has negative index */ enum wl_prio_t { @@ -143,6 +143,16 @@ enum wl_prio_t { RT_WORKLOAD = 1 }; +/* + * Second index in the service_trees. + */ +enum wl_type_t { + ASYNC_WORKLOAD = 0, + SYNC_NOIDLE_WORKLOAD = 1, + SYNC_WORKLOAD = 2 +}; + + /* * Per block device queue structure */ @@ -153,12 +163,14 @@ struct cfq_data { * rr lists of queues with requests, onle rr for each priority class. * Counts are embedded in the cfq_rb_root */ - struct cfq_rb_root service_trees[2]; + struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; /* * The priority currently being served */ enum wl_prio_t serving_prio; + enum wl_type_t serving_type; + unsigned long workload_expires; /* * Each priority tree is sorted by next_request position. These @@ -221,12 +233,13 @@ struct cfq_data { }; static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio, + enum wl_type_t type, struct cfq_data *cfqd) { if (prio == IDLE_WORKLOAD) return &cfqd->service_tree_idle; - return &cfqd->service_trees[prio]; + return &cfqd->service_trees[prio][type]; } enum cfqq_state_flags { @@ -282,12 +295,24 @@ static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) return BE_WORKLOAD; } + +static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) +{ + if (!cfq_cfqq_sync(cfqq)) + return ASYNC_WORKLOAD; + if (!cfq_cfqq_idle_window(cfqq)) + return SYNC_NOIDLE_WORKLOAD; + return SYNC_WORKLOAD; +} + static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) { if (wl == IDLE_WORKLOAD) return cfqd->service_tree_idle.count; - return cfqd->service_trees[wl].count; + return cfqd->service_trees[wl][ASYNC_WORKLOAD].count + + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqd->service_trees[wl][SYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); @@ -597,7 +622,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; - service_tree = service_tree_for(cfqq_prio(cfqq), cfqd); + service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -1030,7 +1055,7 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_rb_root *service_tree = - service_tree_for(cfqd->serving_prio, cfqd); + service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd); if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; @@ -1167,7 +1192,7 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) { enum wl_prio_t prio = cfqq_prio(cfqq); - struct cfq_rb_root *service_tree; + struct cfq_rb_root *service_tree = cfqq->service_tree; /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) @@ -1181,7 +1206,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - service_tree = service_tree_for(prio, cfqd); + if (!service_tree) + service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd); + if (service_tree->count == 0) return true; @@ -1235,14 +1262,20 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) cfq_mark_cfqq_wait_request(cfqq); - /* - * we don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. so allow a little bit of time for him to submit a new rq - */ sl = cfqd->cfq_slice_idle; - if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) + /* are we servicing noidle tree, and there are more queues? + * non-rotational or NCQ: no idle + * non-NCQ rotational : very small idle, to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. + */ + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WORKLOAD, cfqd) + ->count > 0) { + if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag) + return; sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); + } mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); @@ -1346,6 +1379,106 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } } +static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, + bool prio_changed) +{ + struct cfq_queue *queue; + int i; + bool key_valid = false; + unsigned long lowest_key = 0; + enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; + + if (prio_changed) { + /* + * When priorities switched, we prefer starting + * from SYNC_NOIDLE (first choice), or just SYNC + * over ASYNC + */ + if (service_tree_for(prio, cur_best, cfqd)->count) + return cur_best; + cur_best = SYNC_WORKLOAD; + if (service_tree_for(prio, cur_best, cfqd)->count) + return cur_best; + + return ASYNC_WORKLOAD; + } + + for (i = 0; i < 3; ++i) { + /* otherwise, select the one with lowest rb_key */ + queue = cfq_rb_first(service_tree_for(prio, i, cfqd)); + if (queue && + (!key_valid || time_before(queue->rb_key, lowest_key))) { + lowest_key = queue->rb_key; + cur_best = i; + key_valid = true; + } + } + + return cur_best; +} + +static void choose_service_tree(struct cfq_data *cfqd) +{ + enum wl_prio_t previous_prio = cfqd->serving_prio; + bool prio_changed; + unsigned slice; + unsigned count; + + /* Choose next priority. RT > BE > IDLE */ + if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) + cfqd->serving_prio = RT_WORKLOAD; + else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) + cfqd->serving_prio = BE_WORKLOAD; + else { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + + /* + * For RT and BE, we have to choose also the type + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload + * expiration time + */ + prio_changed = (cfqd->serving_prio != previous_prio); + count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) + ->count; + + /* + * If priority didn't change, check workload expiration, + * and that we still have other queues ready + */ + if (!prio_changed && count && + !time_after(jiffies, cfqd->workload_expires)) + return; + + /* otherwise select new workload type */ + cfqd->serving_type = + cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed); + count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) + ->count; + + /* + * the workload slice is computed as a fraction of target latency + * proportional to the number of queues in that workload, over + * all the queues in the same priority class + */ + slice = cfq_target_latency * count / + max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio], + cfq_busy_queues_wl(cfqd->serving_prio, cfqd)); + + if (cfqd->serving_type == ASYNC_WORKLOAD) + /* async workload slice is scaled down according to + * the sync/async slice ratio. */ + slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; + else + /* sync workload slice is at least 2 * cfq_slice_idle */ + slice = max(slice, 2 * cfqd->cfq_slice_idle); + + slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfqd->workload_expires = jiffies + slice; +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. @@ -1398,14 +1531,13 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) expire: cfq_slice_expired(cfqd, 0); new_queue: - if (!new_cfqq) { - if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) - cfqd->serving_prio = RT_WORKLOAD; - else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) - cfqd->serving_prio = BE_WORKLOAD; - else - cfqd->serving_prio = IDLE_WORKLOAD; - } + /* + * Current queue expired. Check if we have to switch to a new + * service tree + */ + if (!new_cfqq) + choose_service_tree(cfqd); + cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: return cfqq; @@ -1432,10 +1564,12 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) { struct cfq_queue *cfqq; int dispatched = 0; - int i; + int i, j; for (i = 0; i < 2; ++i) - while ((cfqq = cfq_rb_first(&cfqd->service_trees[i])) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); + for (j = 0; j < 3; ++j) + while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j])) + != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); @@ -2218,13 +2352,10 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfqd->cfq_latency && cfqd->hw_tag && CFQQ_SEEKY(cfqq))) + (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { - unsigned int slice_idle = cfqd->cfq_slice_idle; - if (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq)) - slice_idle = msecs_to_jiffies(CFQ_MIN_TT); - if (cic->ttime_mean > slice_idle) + if (cic->ttime_mean > cfqd->cfq_slice_idle) enable_idle = 0; else enable_idle = 1; @@ -2262,6 +2393,10 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD + && new_cfqq->service_tree == cfqq->service_tree) + return true; + /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. @@ -2778,14 +2913,15 @@ static void cfq_exit_queue(struct elevator_queue *e) static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - int i; + int i, j; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; for (i = 0; i < 2; ++i) - cfqd->service_trees[i] = CFQ_RB_ROOT; + for (j = 0; j < 3; ++j) + cfqd->service_trees[i][j] = CFQ_RB_ROOT; cfqd->service_tree_idle = CFQ_RB_ROOT; /* From 5869619cb5b26754574375472fe54a390edf34c7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Oct 2009 09:27:07 +0100 Subject: [PATCH 022/113] cfq-iosched: fix style issue in cfq_get_avg_queues() Line breaks and bad brace placement. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 859f534ae9e..aa00d8f2d0b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -393,8 +393,8 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) * to quickly follows sudden increases and decrease slowly */ -static inline unsigned -cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) { +static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) +{ unsigned min_q, max_q; unsigned mult = cfq_hist_divisor - 1; unsigned round = cfq_hist_divisor / 2; From 1af60fbd759d31f565552fea315c2033947cfbe6 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 2 Oct 2009 18:56:53 -0400 Subject: [PATCH 023/113] block: get rid of the WRITE_ODIRECT flag Hi, The WRITE_ODIRECT flag is only used in one place, and that code path happens to also call blk_run_address_space. The introduction of this flag, then, could result in the device being unplugged twice for every I/O. Further, with the batching changes in the next patch, we don't want an O_DIRECT write to imply a queue unplug. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/direct-io.c | 2 +- include/linux/fs.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 8b10b87dc01..c86d35f142d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_ODIRECT; + rw = WRITE_SYNC_PLUG; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2620a8c6357..2f5fca4147c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -129,7 +129,6 @@ struct inodes_stat_t { * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device * immediately after submission. The write equivalent * of READ_SYNC. - * WRITE_ODIRECT Special case write for O_DIRECT only. * SWRITE_SYNC * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. * See SWRITE. @@ -151,7 +150,6 @@ struct inodes_stat_t { #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) From cfb1e33eed48165763edc7a4a067cf5f74898d0b Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 2 Oct 2009 18:57:36 -0400 Subject: [PATCH 024/113] aio: implement request batching Hi, Some workloads issue batches of small I/O, and the performance is poor due to the call to blk_run_address_space for every single iocb. Nathan Roberts pointed this out, and suggested that by deferring this call until all I/Os in the iocb array are submitted to the block layer, we can realize some impressive performance gains (up to 30% for sequential 4k reads in batches of 16). Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/aio.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++-- fs/direct-io.c | 8 +++---- 2 files changed, 63 insertions(+), 6 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 02a2c934057..cf0bef428f8 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include @@ -60,6 +63,14 @@ static DECLARE_WORK(fput_work, aio_fput_routine); static DEFINE_SPINLOCK(fput_lock); static LIST_HEAD(fput_head); +#define AIO_BATCH_HASH_BITS 3 /* allocated on-stack, so don't go crazy */ +#define AIO_BATCH_HASH_SIZE (1 << AIO_BATCH_HASH_BITS) +struct aio_batch_entry { + struct hlist_node list; + struct address_space *mapping; +}; +mempool_t *abe_pool; + static void aio_kick_handler(struct work_struct *); static void aio_queue_work(struct kioctx *); @@ -73,6 +84,8 @@ static int __init aio_setup(void) kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); aio_wq = create_workqueue("aio"); + abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry)); + BUG_ON(!abe_pool); pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page)); @@ -1531,8 +1544,44 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode, return 1; } +static void aio_batch_add(struct address_space *mapping, + struct hlist_head *batch_hash) +{ + struct aio_batch_entry *abe; + struct hlist_node *pos; + unsigned bucket; + + bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS); + hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) { + if (abe->mapping == mapping) + return; + } + + abe = mempool_alloc(abe_pool, GFP_KERNEL); + BUG_ON(!igrab(mapping->host)); + abe->mapping = mapping; + hlist_add_head(&abe->list, &batch_hash[bucket]); + return; +} + +static void aio_batch_free(struct hlist_head *batch_hash) +{ + struct aio_batch_entry *abe; + struct hlist_node *pos, *n; + int i; + + for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) { + hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) { + blk_run_address_space(abe->mapping); + iput(abe->mapping->host); + hlist_del(&abe->list); + mempool_free(abe, abe_pool); + } + } +} + static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, - struct iocb *iocb) + struct iocb *iocb, struct hlist_head *batch_hash) { struct kiocb *req; struct file *file; @@ -1608,6 +1657,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, ; } spin_unlock_irq(&ctx->ctx_lock); + if (req->ki_opcode == IOCB_CMD_PREAD || + req->ki_opcode == IOCB_CMD_PREADV || + req->ki_opcode == IOCB_CMD_PWRITE || + req->ki_opcode == IOCB_CMD_PWRITEV) + aio_batch_add(file->f_mapping, batch_hash); + aio_put_req(req); /* drop extra ref to req */ return 0; @@ -1635,6 +1690,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, struct kioctx *ctx; long ret = 0; int i; + struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, }; if (unlikely(nr < 0)) return -EINVAL; @@ -1666,10 +1722,11 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, break; } - ret = io_submit_one(ctx, user_iocb, &tmp); + ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash); if (ret) break; } + aio_batch_free(batch_hash); put_ioctx(ctx); return i ? i : ret; diff --git a/fs/direct-io.c b/fs/direct-io.c index c86d35f142d..3af761c8c5c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1028,9 +1028,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, if (dio->bio) dio_bio_submit(dio); - /* All IO is now issued, send it on its way */ - blk_run_address_space(inode->i_mapping); - /* * It is possible that, we return short IO due to end of file. * In that case, we need to release all the pages we got hold on. @@ -1057,8 +1054,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, ((rw & READ) || (dio->result == dio->size))) ret = -EIOCBQUEUED; - if (ret != -EIOCBQUEUED) + if (ret != -EIOCBQUEUED) { + /* All IO is now issued, send it on its way */ + blk_run_address_space(inode->i_mapping); dio_await_completion(dio); + } /* * Sync will always be dropping the final ref and completing the From a870a3a485ddf7c0dec549269ed71d169556d61c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 28 Oct 2009 09:30:27 +0100 Subject: [PATCH 025/113] drbd: fix in_flight rw indexing Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_req.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d3426ff405b..3678d3d66c6 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -40,7 +40,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_stat_unlock(); - mdev->vdisk->part0.in_flight++; + mdev->vdisk->part0.in_flight[rw]++; } /* Update disk stats when completing request upwards */ @@ -53,7 +53,7 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); part_round_stats(cpu, &mdev->vdisk->part0); part_stat_unlock(); - mdev->vdisk->part0.in_flight--; + mdev->vdisk->part0.in_flight[rw]--; } static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) From b9d128f1088ea5245109dfc9bbceb128b6371a77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Oct 2009 13:59:26 +0100 Subject: [PATCH 026/113] block: move bdi/address_space unplug functions to backing-dev.h There's nothing block related about them, the backing device is used by things like NFS etc as well. This gets rid of the need to protect such calls by CONFIG_BLOCK. Signed-off-by: Jens Axboe --- fs/aio.c | 1 + include/linux/backing-dev.h | 13 +++++++++++++ include/linux/blkdev.h | 13 ------------- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index cf0bef428f8..c30dfc00610 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #define DEBUG 0 diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b449e738533..fcbc26af00e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -331,4 +331,17 @@ static inline int bdi_sched_wait(void *word) return 0; } +static inline void blk_run_backing_dev(struct backing_dev_info *bdi, + struct page *page) +{ + if (bdi && bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); +} + +static inline void blk_run_address_space(struct address_space *mapping) +{ + if (mapping) + blk_run_backing_dev(mapping->backing_dev_info, NULL); +} + #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 221cecd86bd..39c601f783a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -823,19 +823,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_disk->queue; } -static inline void blk_run_backing_dev(struct backing_dev_info *bdi, - struct page *page) -{ - if (bdi && bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, page); -} - -static inline void blk_run_address_space(struct address_space *mapping) -{ - if (mapping) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -} - /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request From ab0a9735e06914ce4d2a94ffa41497dbc142fe7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 29 Oct 2009 14:14:04 +0100 Subject: [PATCH 027/113] blkdev: flush disk cache on ->fsync Currently there is no barrier support in the block device code. That means we cannot guarantee any sort of data integerity when using the block device node with dis kwrite caches enabled. Using the raw block device node is a typical use case for virtualization (and I assume databases, too). This patch changes block_fsync to issue a cache flush and thus make fsync on block device nodes actually useful. Note that in mainline we would also need to add such code to the ->aio_write method for O_SYNC handling, but assuming that Jan's patch series for the O_SYNC rewrite goes in it will also call into ->fsync for 2.6.32. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 9cf4b926f8e..dde91e7e1c3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -405,7 +405,17 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) { - return sync_blockdev(I_BDEV(filp->f_mapping->host)); + struct block_device *bdev = I_BDEV(filp->f_mapping->host); + int error; + + error = sync_blockdev(bdev); + if (error) + return error; + + error = blkdev_issue_flush(bdev, NULL); + if (error == -EOPNOTSUPP) + error = 0; + return error; } /* From dddb74519aec2081204d203a97578c9fc4e9fb64 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Mon, 2 Nov 2009 10:40:37 +0100 Subject: [PATCH 028/113] cfq-iosched: simplify prio-unboost code Eliminate redundant checks. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 418da9a49bb..757010d8fb7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2359,12 +2359,10 @@ static void cfq_prio_boost(struct cfq_queue *cfqq) cfqq->ioprio = IOPRIO_NORM; } else { /* - * check if we need to unboost the queue + * unboost the queue (if needed) */ - if (cfqq->ioprio_class != cfqq->org_ioprio_class) - cfqq->ioprio_class = cfqq->org_ioprio_class; - if (cfqq->ioprio != cfqq->org_ioprio) - cfqq->ioprio = cfqq->org_ioprio; + cfqq->ioprio_class = cfqq->org_ioprio_class; + cfqq->ioprio = cfqq->org_ioprio; } } From 4f570f995f68ef77aae7e5a441222f59232f2d0e Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Nov 2009 11:40:16 +0100 Subject: [PATCH 029/113] Do not __always_inline bvec_kmap_irq() and bvec_kunmap_irq() So remove both the comment and the inline requirement, going back to the inline hint. Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 5be93f18d84..474792b825d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -450,11 +450,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; /* * remember never ever reenable interrupts between a bvec_kmap_irq and * bvec_kunmap_irq! - * - * This function MUST be inlined - it plays with the CPU interrupt flags. */ -static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, - unsigned long *flags) +static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) { unsigned long addr; @@ -470,8 +467,7 @@ static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, return (char *) addr + bvec->bv_offset; } -static __always_inline void bvec_kunmap_irq(char *buffer, - unsigned long *flags) +static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) { unsigned long ptr = (unsigned long) buffer & PAGE_MASK; From 125c4f221a5352ae08aef2898055b879ad963f01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 3 Nov 2009 21:25:45 +0100 Subject: [PATCH 030/113] cfq-iosched: fix merge error We ended up with testing the same condition twice, pretty pointless. Remove that first if. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 13b612f9f27..b700f41cafb 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2433,7 +2433,6 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq)) if (cfq_rq_close(cfqd, cfqq, rq) && (!cfq_cfqq_coop(new_cfqq) || cfqd->busy_queues == 1)) { /* From e00ef7997195e4f8e10593727a6286e2e2802159 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 4 Nov 2009 08:54:55 +0100 Subject: [PATCH 031/113] cfq-iosched: get rid of the coop_preempt flag We need to rework this logic post the cooperating cfq_queue merging, for now just get rid of it and Jeff Moyer will fix the fall out. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b700f41cafb..4ab240c875d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -253,7 +253,6 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ - CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ }; #define CFQ_CFQQ_FNS(name) \ @@ -280,7 +279,6 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); -CFQ_CFQQ_FNS(coop_preempt); #undef CFQ_CFQQ_FNS #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ @@ -1070,16 +1068,9 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (!cfqq) { + if (!cfqq) cfqq = cfq_get_next_queue(cfqd); - if (cfqq && !cfq_cfqq_coop_preempt(cfqq)) - cfq_clear_cfqq_coop(cfqq); - } - - if (cfqq) - cfq_clear_cfqq_coop_preempt(cfqq); - __cfq_set_active_queue(cfqd, cfqq); return cfqq; } @@ -2433,16 +2424,8 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, cfqq, rq) && (!cfq_cfqq_coop(new_cfqq) || - cfqd->busy_queues == 1)) { - /* - * Mark new queue coop_preempt, so its coop flag will not be - * cleared when new queue gets scheduled at the very first time - */ - cfq_mark_cfqq_coop_preempt(new_cfqq); - cfq_mark_cfqq_coop(new_cfqq); + if (cfq_rq_close(cfqd, cfqq, rq)) return true; - } return false; } From f21121cde6e617b90cd03ce083652ca543004dc2 Mon Sep 17 00:00:00 2001 From: Hideyuki Sasaki Date: Wed, 4 Nov 2009 09:09:28 +0100 Subject: [PATCH 032/113] block/ps3: fix slow VRAM IO The current PS3 VRAM driver uses msleep() to wait for completion of RSX DMA transfers between system memory and VRAM. Depending on the system timing, the processing delay and overhead of this msleep() call can significantly impact VRAM driver IO. To avoid the condition, add a short duration (200 usec max) udelay() polling loop before entering the msleep() polling loop. Signed-off-by: Hideyuki Sasaki Signed-off-by: Geoff Levand Acked-by: Jim Paris Cc: Geert Uytterhoeven Signed-off-by: Jens Axboe --- drivers/block/ps3vram.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 3bb7c47c869..1fb6c3135fc 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -123,7 +123,15 @@ static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev, { struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); u32 *notify = ps3vram_get_notifier(priv->reports, NOTIFIER); - unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms); + unsigned long timeout; + + for (timeout = 20; timeout; timeout--) { + if (!notify[3]) + return 0; + udelay(10); + } + + timeout = jiffies + msecs_to_jiffies(timeout_ms); do { if (!notify[3]) From cc56f7de7f00d188c7c4da1e9861581853b9e92f Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 4 Nov 2009 09:09:52 +0100 Subject: [PATCH 033/113] sendfile(): check f_op.splice_write() rather than f_op.sendpage() sendfile(2) was reworked with the splice infrastructure, but it still checks f_op.sendpage() instead of f_op.splice_write() wrongly. Although if f_op.sendpage() exists, f_op.splice_write() always exists at the same time currently, the assumption will be broken in future silently. This patch also brings a side effect: sendfile(2) can work with any output file. Some security checks related to f_op are added too. Signed-off-by: Changli Gao Signed-off-by: Jens Axboe --- fs/read_write.c | 2 -- fs/splice.c | 24 +++++++++++++++--------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 3ac28987f22..b7f4a1f94d4 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -826,8 +826,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(out_file->f_mode & FMODE_WRITE)) goto fput_out; retval = -EINVAL; - if (!out_file->f_op || !out_file->f_op->sendpage) - goto fput_out; in_inode = in_file->f_path.dentry->d_inode; out_inode = out_file->f_path.dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); diff --git a/fs/splice.c b/fs/splice.c index 7394e9e1753..39208663aaf 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -648,9 +648,11 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, ret = buf->ops->confirm(pipe, buf); if (!ret) { more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - - ret = file->f_op->sendpage(file, buf->page, buf->offset, - sd->len, &pos, more); + if (file->f_op && file->f_op->sendpage) + ret = file->f_op->sendpage(file, buf->page, buf->offset, + sd->len, &pos, more); + else + ret = -EINVAL; } return ret; @@ -1068,8 +1070,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; - splice_write = out->f_op->splice_write; - if (!splice_write) + if (out->f_op && out->f_op->splice_write) + splice_write = out->f_op->splice_write; + else splice_write = default_file_splice_write; return splice_write(pipe, out, ppos, len, flags); @@ -1093,8 +1096,9 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - splice_read = in->f_op->splice_read; - if (!splice_read) + if (in->f_op && in->f_op->splice_read) + splice_read = in->f_op->splice_read; + else splice_read = default_file_splice_read; return splice_read(in, ppos, pipe, len, flags); @@ -1316,7 +1320,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) return -ESPIPE; if (off_out) { - if (out->f_op->llseek == no_llseek) + if (!out->f_op || !out->f_op->llseek || + out->f_op->llseek == no_llseek) return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; @@ -1336,7 +1341,8 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) return -ESPIPE; if (off_in) { - if (in->f_op->llseek == no_llseek) + if (!in->f_op || !in->f_op->llseek || + in->f_op->llseek == no_llseek) return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; From 476d42f138ba82389a92a894d8a630a70d36278f Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Wed, 4 Nov 2009 09:10:33 +0100 Subject: [PATCH 034/113] block/scsi_ioctl.c: quiet sparse noise Quiet sparse noise about symbol's not being declared. Symbol blk_default_cmd_filter is only used locally and should be static. The function blk_scsi_ioctl_init() is a fs_initcall and should also be static. Signed-off-by: H Hartley Sweeten Cc: James Bottomley Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index e5b10017a50..a8b5a10eb5b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -35,7 +35,9 @@ struct blk_cmd_filter { unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; -} blk_default_cmd_filter; +}; + +static struct blk_cmd_filter blk_default_cmd_filter; /* Command group 3 is reserved and should never be used. */ const unsigned char scsi_command_size_tbl[8] = @@ -675,7 +677,7 @@ int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mod } EXPORT_SYMBOL(scsi_cmd_ioctl); -int __init blk_scsi_ioctl_init(void) +static int __init blk_scsi_ioctl_init(void) { blk_set_cmd_filter_defaults(&blk_default_cmd_filter); return 0; From 89e1838f5f2c2af80268a096b9a687643b0d0846 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 21 Sep 2009 10:46:22 +0200 Subject: [PATCH 035/113] change default: by default, use socket buffer auto tuning Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd_limits.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 9d067ce4696..51f47a586ad 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -70,11 +70,11 @@ /* I don't think that a tcp send buffer of more than 10M is usefull */ #define DRBD_SNDBUF_SIZE_MIN 0 #define DRBD_SNDBUF_SIZE_MAX (10<<20) -#define DRBD_SNDBUF_SIZE_DEF (2*65535) +#define DRBD_SNDBUF_SIZE_DEF 0 #define DRBD_RCVBUF_SIZE_MIN 0 #define DRBD_RCVBUF_SIZE_MAX (10<<20) -#define DRBD_RCVBUF_SIZE_DEF (2*65535) +#define DRBD_RCVBUF_SIZE_DEF 0 /* @4k PageSize -> 128kB - 512MB */ #define DRBD_MAX_BUFFERS_MIN 32 From ad19bf6e544f4d1abc22d2be130c7d5e4163146f Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Wed, 14 Oct 2009 09:36:49 +0200 Subject: [PATCH 036/113] fix grammar in printk Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 360baf60f57..d9312b45393 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2099,7 +2099,7 @@ static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local) break; } /* Else fall through to one of the other strategies... */ - dev_warn(DEV, "Discard younger/older primary did not found a decision\n" + dev_warn(DEV, "Discard younger/older primary did not find a decision\n" "Using discard-least-changes instead\n"); case ASB_DISCARD_ZERO_CHG: if (ch_peer == 0 && ch_self == 0) { From 1352994b363195ce932749d3518d4dc9a5479fea Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 12 Oct 2009 19:07:49 +0200 Subject: [PATCH 037/113] drbd: fix check for too large lower level device To check wether we are truncating a very large device due to limited meta data space, we need to check the ll_dev size. Also improve the printk to suggest "flexible" or "internal". Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 22538d9628f..e2a5875a07b 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -894,11 +894,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); } - if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors) - dev_warn(DEV, "truncating very big lower level device " - "to currently maximum possible %llu sectors\n", - (unsigned long long) max_possible_sectors); - if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { retcode = ERR_MD_DISK_TO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " @@ -917,6 +912,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp nbc->known_size = drbd_get_capacity(nbc->backing_bdev); + if (nbc->known_size > max_possible_sectors) { + dev_warn(DEV, "==> truncating very big lower level device " + "to currently maximum possible %llu sectors <==\n", + (unsigned long long) max_possible_sectors); + if (nbc->dc.meta_dev_idx >= 0) + dev_warn(DEV, "==>> using internal or flexible " + "meta data may help <<==\n"); + } + drbd_suspend_io(mdev); /* also wait for the last barrier ack. */ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt)); From 0a4921662513ae60dc638c8e13fbe3439d84db64 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 21 Oct 2009 13:08:29 +0200 Subject: [PATCH 038/113] drbdsetup X resume-io should be usable to resume IO [Bugz 256] When IO gets frozen due to a broken fence-peer script, the user should be able to thaw IO by the resume-io command. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_main.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 11d8ff6016a..157d1e4343c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -867,10 +867,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state } if (fp == FP_STONITH && - (ns.role == R_PRIMARY && - ns.conn < C_CONNECTED && - ns.pdsk > D_OUTDATED)) - ns.susp = 1; + (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) && + !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)) + ns.susp = 1; if (ns.aftr_isp || ns.peer_isp || ns.user_isp) { if (ns.conn == C_SYNC_SOURCE) From e656ec8ae2c0319b6d52834695f9635217d62de5 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 23 Oct 2009 13:57:45 +0200 Subject: [PATCH 039/113] Do not deadlock in drbd_disconnect() [bugz 258] When there are many blocks on the fly (ua), and the AL gets into "starving" mode (random IO, scattered all over the device), and the connections gets interrupted, the receiver thread deadlocks in the drbd_disconnect() code path. Affected are only nodes in Primary role. The bug triggers most likely on system that mirror over "long distances" Regression introduced shortly before 8.3.3 with git commit 31e0f1250f174ac1ee317f360943a0159e19edc8 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index d9312b45393..9bbc509443e 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3619,10 +3619,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) set_bit(STOP_SYNC_TIMER, &mdev->flags); resync_timer_fn((unsigned long)mdev); - /* so we can be sure that all remote or resync reads - * made it at least to net_ee */ - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); - /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier, * w_make_resync_request etc. which may still be on the worker queue * to be "canceled" */ From 83c38830b04d4e369b9a41acbc562c0422f2f2f2 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 3 Nov 2009 02:22:06 +0100 Subject: [PATCH 040/113] drbd: performance - don't lose unplug events Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3678d3d66c6..d09aac4a84e 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -505,7 +505,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, * corresponding hlist_del is in _req_may_be_done() */ hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); - set_bit(UNPLUG_REMOTE, &mdev->flags); /* why? */ + set_bit(UNPLUG_REMOTE, &mdev->flags); D_ASSERT(req->rq_state & RQ_NET_PENDING); req->rq_state |= RQ_NET_QUEUED; @@ -536,6 +536,11 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what, * * Add req to the (now) current epoch (barrier). */ + /* otherwise we may lose an unplug, which may cause some remote + * io-scheduler timeout to expire, increasing maximum latency, + * hurting performance. */ + set_bit(UNPLUG_REMOTE, &mdev->flags); + /* see drbd_make_request_common, * just after it grabs the req_lock */ D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0); From ed814525f2e45188964c270fc3a5a0b644f7e4a9 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 27 Oct 2009 12:37:14 +0100 Subject: [PATCH 041/113] Now it is equal to DRBD release 8.3.5 without compat crap Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 233db5c18b8..18942ad115d 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.3rc2" +#define REL_VERSION "8.3.5" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 91 From cf7c25cf91f632a3528669fc0876e1fc8355ff9b Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Sun, 8 Nov 2009 17:16:46 +0100 Subject: [PATCH 042/113] cfq-iosched: fix next_rq computation Cfq has a bug in computation of next_rq, that affects transition between multiple sequential request streams in a single queue (e.g.: two sequential buffered writers of the same priority), causing the alternation between the two streams for a transient period. 8,0 1 18737 0.260400660 5312 D W 141653311 + 256 8,0 1 20839 0.273239461 5400 D W 141653567 + 256 8,0 1 20841 0.276343885 5394 D W 142803919 + 256 8,0 1 20843 0.279490878 5394 D W 141668927 + 256 8,0 1 20845 0.292459993 5400 D W 142804175 + 256 8,0 1 20847 0.295537247 5400 D W 141668671 + 256 8,0 1 20849 0.298656337 5400 D W 142804431 + 256 8,0 1 20851 0.311481148 5394 D W 141668415 + 256 8,0 1 20853 0.314421305 5394 D W 142804687 + 256 8,0 1 20855 0.318960112 5400 D W 142804943 + 256 The fix makes sure that the next_rq is computed from the last dispatched request, and not affected by merging. 8,0 1 37776 4.305161306 0 D W 141738087 + 256 8,0 1 37778 4.308298091 0 D W 141738343 + 256 8,0 1 37780 4.312885190 0 D W 141738599 + 256 8,0 1 37782 4.315933291 0 D W 141738855 + 256 8,0 1 37784 4.319064459 0 D W 141739111 + 256 8,0 1 37786 4.331918431 5672 D W 142803007 + 256 8,0 1 37788 4.334930332 5672 D W 142803263 + 256 8,0 1 37790 4.337902723 5672 D W 142803519 + 256 8,0 1 37792 4.342359774 5672 D W 142803775 + 256 8,0 1 37794 4.345318286 0 D W 142804031 + 256 Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4ab240c875d..829d87d3e00 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -454,9 +454,9 @@ static inline bool cfq_slice_used(struct cfq_queue *cfqq) * behind the head is penalized and only allowed to a certain extent. */ static struct request * -cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) +cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) { - sector_t last, s1, s2, d1 = 0, d2 = 0; + sector_t s1, s2, d1 = 0, d2 = 0; unsigned long back_max; #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ @@ -479,8 +479,6 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); - last = cfqd->last_position; - /* * by definition, 1KiB is 2 sectors */ @@ -595,7 +593,7 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, next = rb_entry_rq(rbnext); } - return cfq_choose_req(cfqd, next, prev); + return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); } static unsigned long cfq_slice_offset(struct cfq_data *cfqd, @@ -843,7 +841,7 @@ static void cfq_add_rq_rb(struct request *rq) * check if this request is a better next-serve candidate */ prev = cfqq->next_rq; - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); + cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); /* * adjust priority tree position, if ->next_rq changes @@ -950,6 +948,7 @@ static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { + struct cfq_queue *cfqq = RQ_CFQQ(rq); /* * reposition in fifo if next is older than rq */ @@ -959,6 +958,8 @@ cfq_merged_requests(struct request_queue *q, struct request *rq, rq_set_fifo_time(rq, rq_fifo_time(next)); } + if (cfqq->next_rq == next) + cfqq->next_rq = rq; cfq_remove_request(next); } From 86b37281411cf1e9bc0a6b5406c45edb7bd9ea5d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 10 Nov 2009 11:50:21 +0100 Subject: [PATCH 043/113] block: Expose discard granularity While SSDs track block usage on a per-sector basis, RAID arrays often have allocation blocks that are bigger. Allow the discard granularity and alignment to be set and teach the topology stacking logic how to handle them. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 46 +++++++++++++++++++++++++++++++++--------- block/blk-sysfs.c | 22 ++++++++++++++++++++ block/genhd.c | 12 +++++++++++ fs/partitions/check.c | 12 +++++++++++ include/linux/blkdev.h | 18 +++++++++++++++++ include/linux/genhd.h | 1 + 6 files changed, 101 insertions(+), 10 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 66d4aa8799b..7f986cafacd 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -96,7 +96,10 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_segment_size = MAX_SEGMENT_SIZE; lim->max_sectors = BLK_DEF_MAX_SECTORS; lim->max_hw_sectors = INT_MAX; - lim->max_discard_sectors = SAFE_MAX_SECTORS; + lim->max_discard_sectors = 0; + lim->discard_granularity = 0; + lim->discard_alignment = 0; + lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -488,6 +491,16 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); +static unsigned int lcm(unsigned int a, unsigned int b) +{ + if (a && b) + return (a * b) / gcd(a, b); + else if (b) + return b; + + return a; +} + /** * blk_stack_limits - adjust queue_limits for stacked devices * @t: the stacking driver limits (top) @@ -502,6 +515,10 @@ EXPORT_SYMBOL(blk_queue_stack_limits); int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { + int ret; + + ret = 0; + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); @@ -531,7 +548,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (offset && (offset & (b->physical_block_size - 1)) != b->alignment_offset) { t->misaligned = 1; - return -1; + ret = -1; + } + + if (offset && + (offset & (b->discard_granularity - 1)) != b->discard_alignment) { + t->discard_misaligned = 1; + ret = -1; } /* If top has no alignment offset, inherit from bottom */ @@ -539,23 +562,26 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->alignment_offset = b->alignment_offset & (b->physical_block_size - 1); + if (!t->discard_alignment) + t->discard_alignment = + b->discard_alignment & (b->discard_granularity - 1); + /* Top device aligned on logical block boundary? */ if (t->alignment_offset & (t->logical_block_size - 1)) { t->misaligned = 1; - return -1; + ret = -1; } - /* Find lcm() of optimal I/O size */ - if (t->io_opt && b->io_opt) - t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); - else if (b->io_opt) - t->io_opt = b->io_opt; + /* Find lcm() of optimal I/O size and granularity */ + t->io_opt = lcm(t->io_opt, b->io_opt); + t->discard_granularity = lcm(t->discard_granularity, + b->discard_granularity); /* Verify that optimal I/O size is a multiple of io_min */ if (t->io_min && t->io_opt % t->io_min) - return -1; + ret = -1; - return 0; + return ret; } EXPORT_SYMBOL(blk_stack_limits); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8a6d81afb28..3147145edc1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -126,6 +126,16 @@ static ssize_t queue_io_opt_show(struct request_queue *q, char *page) return queue_var_show(queue_io_opt(q), page); } +static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.discard_granularity, page); +} + +static ssize_t queue_discard_max_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_discard_sectors << 9, page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -293,6 +303,16 @@ static struct queue_sysfs_entry queue_io_opt_entry = { .show = queue_io_opt_show, }; +static struct queue_sysfs_entry queue_discard_granularity_entry = { + .attr = {.name = "discard_granularity", .mode = S_IRUGO }, + .show = queue_discard_granularity_show, +}; + +static struct queue_sysfs_entry queue_discard_max_entry = { + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, + .show = queue_discard_max_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -328,6 +348,8 @@ static struct attribute *default_attrs[] = { &queue_physical_block_size_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, + &queue_discard_granularity_entry.attr, + &queue_discard_max_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/genhd.c b/block/genhd.c index 517e4332cb3..b11a4ad7d57 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -861,12 +861,23 @@ static ssize_t disk_alignment_offset_show(struct device *dev, return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_discard_alignment_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue)); +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, + NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); @@ -887,6 +898,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7b685e10cba..64bc8998ac9 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -226,6 +226,13 @@ ssize_t part_alignment_offset_show(struct device *dev, return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); } +ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%u\n", p->discard_alignment); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -288,6 +295,8 @@ static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, part_discard_alignment_show, + NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -300,6 +309,7 @@ static struct attribute *part_attrs[] = { &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -403,6 +413,8 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, p->start_sect = start; p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); + p->discard_alignment = queue_sector_discard_alignment(disk->queue, + start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 39c601f783a..1cc02972fbe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -312,12 +312,15 @@ struct queue_limits { unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; + unsigned int discard_granularity; + unsigned int discard_alignment; unsigned short logical_block_size; unsigned short max_hw_segments; unsigned short max_phys_segments; unsigned char misaligned; + unsigned char discard_misaligned; unsigned char no_cluster; }; @@ -1121,6 +1124,21 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return q->limits.alignment_offset; } +static inline int queue_discard_alignment(struct request_queue *q) +{ + if (q->limits.discard_misaligned) + return -1; + + return q->limits.discard_alignment; +} + +static inline int queue_sector_discard_alignment(struct request_queue *q, + sector_t sector) +{ + return ((sector << 9) - q->limits.discard_alignment) + & (q->limits.discard_granularity - 1); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45ffd0..c6c0c41af35 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -91,6 +91,7 @@ struct hd_struct { sector_t start_sect; sector_t nr_sects; sector_t alignment_offset; + unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; int policy, partno; From ad5ebd2fa2557b04a653bb3c3377a47da8f9b8e9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 11 Nov 2009 13:47:45 +0100 Subject: [PATCH 044/113] block: jiffies fixes Use HZ-independent calculation of milliseconds. Add jiffies.h where it was missing since functions or macros from it are used. Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- block/blk-settings.c | 3 ++- block/bsg.c | 3 ++- block/cfq-iosched.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 7f986cafacd..1ebc1fdb914 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -8,6 +8,7 @@ #include #include /* for max_pfn/max_low_pfn */ #include +#include #include "blk.h" @@ -144,7 +145,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ + q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ if (q->unplug_delay == 0) q->unplug_delay = 1; diff --git a/block/bsg.c b/block/bsg.c index 0676301f16d..a9fd2d84b53 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq, rq->cmd_len = hdr->request_len; rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->timeout = (hdr->timeout * HZ) / 1000; + rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) rq->timeout = q->sg_timeout; if (!rq->timeout) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 829d87d3e00..1bcbd8c7989 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include From 8ba95c69fe6eb65ff36b64136ae24844ddba16a1 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Thu, 12 Nov 2009 12:49:14 -0600 Subject: [PATCH 045/113] cciss: Make device attributes static cciss: Make device attributes static Cc: Stephen M. Cameron Signed-off-by: Alex Chiang Acked-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 6399e5090df..92b126394fa 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -482,7 +482,7 @@ static ssize_t host_store_rescan(struct device *dev, return count; } -DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan); +static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan); static ssize_t dev_show_unique_id(struct device *dev, struct device_attribute *attr, @@ -512,7 +512,7 @@ static ssize_t dev_show_unique_id(struct device *dev, sn[8], sn[9], sn[10], sn[11], sn[12], sn[13], sn[14], sn[15]); } -DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL); +static DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL); static ssize_t dev_show_vendor(struct device *dev, struct device_attribute *attr, @@ -536,7 +536,7 @@ static ssize_t dev_show_vendor(struct device *dev, else return snprintf(buf, sizeof(vendor) + 1, "%s\n", drv->vendor); } -DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL); +static DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL); static ssize_t dev_show_model(struct device *dev, struct device_attribute *attr, @@ -560,7 +560,7 @@ static ssize_t dev_show_model(struct device *dev, else return snprintf(buf, sizeof(model) + 1, "%s\n", drv->model); } -DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL); +static DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL); static ssize_t dev_show_rev(struct device *dev, struct device_attribute *attr, @@ -584,7 +584,7 @@ static ssize_t dev_show_rev(struct device *dev, else return snprintf(buf, sizeof(rev) + 1, "%s\n", drv->rev); } -DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL); +static DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL); static ssize_t cciss_show_lunid(struct device *dev, struct device_attribute *attr, char *buf) @@ -609,7 +609,7 @@ static ssize_t cciss_show_lunid(struct device *dev, lunid[0], lunid[1], lunid[2], lunid[3], lunid[4], lunid[5], lunid[6], lunid[7]); } -DEVICE_ATTR(lunid, S_IRUGO, cciss_show_lunid, NULL); +static DEVICE_ATTR(lunid, S_IRUGO, cciss_show_lunid, NULL); static ssize_t cciss_show_raid_level(struct device *dev, struct device_attribute *attr, char *buf) @@ -632,7 +632,7 @@ static ssize_t cciss_show_raid_level(struct device *dev, return snprintf(buf, strlen(raid_label[raid]) + 7, "RAID %s\n", raid_label[raid]); } -DEVICE_ATTR(raid_level, S_IRUGO, cciss_show_raid_level, NULL); +static DEVICE_ATTR(raid_level, S_IRUGO, cciss_show_raid_level, NULL); static ssize_t cciss_show_usage_count(struct device *dev, struct device_attribute *attr, char *buf) @@ -651,7 +651,7 @@ static ssize_t cciss_show_usage_count(struct device *dev, spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags); return snprintf(buf, 20, "%d\n", count); } -DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL); +static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL); static struct attribute *cciss_host_attrs[] = { &dev_attr_rescan.attr, From fd8489cff419d216479655b8041b8574ed89f806 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:19 -0600 Subject: [PATCH 046/113] cciss: Fix problem with remove_from_scan_list on driver unload cciss: Fix problem with remove_from_scan_list that on driver unload it doesn't remove the controller from the scan list correctly if the controller is currently being scanned for new devices. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 92b126394fa..81c21875eb7 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3513,28 +3513,33 @@ static int add_to_scan_list(struct ctlr_info *h) * @h: Pointer to the controller. * * Removes the controller from the rescan queue if present. Blocks if - * the controller is currently conducting a rescan. + * the controller is currently conducting a rescan. The controller + * can be in one of three states: + * 1. Doesn't need a scan + * 2. On the scan list, but not scanning yet (we remove it) + * 3. Busy scanning (and not on the list). In this case we want to wait for + * the scan to complete to make sure the scanning thread for this + * controller is completely idle. **/ static void remove_from_scan_list(struct ctlr_info *h) { struct ctlr_info *test_h, *tmp_h; - int scanning = 0; mutex_lock(&scan_mutex); list_for_each_entry_safe(test_h, tmp_h, &scan_q, scan_list) { - if (test_h == h) { + if (test_h == h) { /* state 2. */ list_del(&h->scan_list); complete_all(&h->scan_wait); mutex_unlock(&scan_mutex); return; } } - if (&h->busy_scanning) - scanning = 0; - mutex_unlock(&scan_mutex); - - if (scanning) + if (h->busy_scanning) { /* state 3. */ + mutex_unlock(&scan_mutex); wait_for_completion(&h->scan_wait); + } else { /* state 1, nothing to do. */ + mutex_unlock(&scan_mutex); + } } /** From c08fac6500b658c16834aceb13a08ebddd908333 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:25 -0600 Subject: [PATCH 047/113] cciss: Retry driver initiated cmds with unit attention condition cciss: Retry driver initiated cmds with unit attention condition Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 81c21875eb7..429b9b6ff59 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2531,6 +2531,8 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c) case 0: return IO_OK; /* no sense */ case 1: return IO_OK; /* recovered error */ default: + if (check_for_unit_attention(h, c)) + return IO_NEEDS_RETRY; printk(KERN_WARNING "cciss%d: cmd 0x%02x " "check condition, sense key = 0x%02x\n", h->ctlr, c->Request.CDB[0], From 7b838bde922730b9cfeaa93ba80bd31173941495 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:30 -0600 Subject: [PATCH 048/113] cciss: Remove the "withirq" parameter from various functions where possible cciss: Remove the "withirq" parameter from various functions where possible Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 89 ++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 60 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 429b9b6ff59..4321c94b552 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -179,12 +179,12 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time, int via_ioctl); static int deregister_disk(ctlr_info_t *h, int drv_index, int clear_all, int via_ioctl); -static void cciss_read_capacity(int ctlr, int logvol, int withirq, +static void cciss_read_capacity(int ctlr, int logvol, sector_t *total_size, unsigned int *block_size); -static void cciss_read_capacity_16(int ctlr, int logvol, int withirq, +static void cciss_read_capacity_16(int ctlr, int logvol, sector_t *total_size, unsigned int *block_size); static void cciss_geometry_inquiry(int ctlr, int logvol, - int withirq, sector_t total_size, + sector_t total_size, unsigned int block_size, InquiryData_struct *inq_buff, drive_info_struct *drv); static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, @@ -1701,7 +1701,7 @@ static inline void log_unit_to_scsi3addr(ctlr_info_t *h, * via the inquiry page 0. Model, vendor, and rev are set to empty strings if * they cannot be read. */ -static void cciss_get_device_descr(int ctlr, int logvol, int withirq, +static void cciss_get_device_descr(int ctlr, int logvol, char *vendor, char *model, char *rev) { int rc; @@ -1717,14 +1717,8 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq, return; log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, - sizeof(InquiryData_struct), 0, - scsi3addr, TYPE_CMD); - else - rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf, - sizeof(InquiryData_struct), 0, - scsi3addr, TYPE_CMD); + rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf, sizeof(*inq_buf), 0, + scsi3addr, TYPE_CMD); if (rc == IO_OK) { memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN); vendor[VENDOR_LEN] = '\0'; @@ -1743,7 +1737,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq, * number cannot be had, for whatever reason, 16 bytes of 0xff * are returned instead. */ -static void cciss_get_serial_no(int ctlr, int logvol, int withirq, +static void cciss_get_serial_no(int ctlr, int logvol, unsigned char *serial_no, int buflen) { #define PAGE_83_INQ_BYTES 64 @@ -1759,12 +1753,8 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq, return; memset(serial_no, 0, buflen); log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, - PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); - else - rc = sendcmd(CISS_INQUIRY, ctlr, buf, - PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); + rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf, + PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD); if (rc == IO_OK) memcpy(serial_no, &buf[8], buflen); kfree(buf); @@ -1852,18 +1842,16 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time, /* testing to see if 16-byte CDBs are already being used */ if (h->cciss_read == CCISS_READ_16) { - cciss_read_capacity_16(h->ctlr, drv_index, 1, + cciss_read_capacity_16(h->ctlr, drv_index, &total_size, &block_size); } else { - cciss_read_capacity(ctlr, drv_index, 1, - &total_size, &block_size); - + cciss_read_capacity(ctlr, drv_index, &total_size, &block_size); /* if read_capacity returns all F's this volume is >2TB */ /* in size so we switch to 16-byte CDB's for all */ /* read/write ops */ if (total_size == 0xFFFFFFFFULL) { - cciss_read_capacity_16(ctlr, drv_index, 1, + cciss_read_capacity_16(ctlr, drv_index, &total_size, &block_size); h->cciss_read = CCISS_READ_16; h->cciss_write = CCISS_WRITE_16; @@ -1873,14 +1861,14 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time, } } - cciss_geometry_inquiry(ctlr, drv_index, 1, total_size, block_size, + cciss_geometry_inquiry(ctlr, drv_index, total_size, block_size, inq_buff, drvinfo); drvinfo->block_size = block_size; drvinfo->nr_blocks = total_size + 1; - cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor, + cciss_get_device_descr(ctlr, drv_index, drvinfo->vendor, drvinfo->model, drvinfo->rev); - cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no, + cciss_get_serial_no(ctlr, drv_index, drvinfo->serial_no, sizeof(drvinfo->serial_no)); /* Save the lunid in case we deregister the disk, below. */ memcpy(drvinfo->LunID, h->drv[drv_index]->LunID, @@ -2674,7 +2662,7 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, } static void cciss_geometry_inquiry(int ctlr, int logvol, - int withirq, sector_t total_size, + sector_t total_size, unsigned int block_size, InquiryData_struct *inq_buff, drive_info_struct *drv) @@ -2685,14 +2673,8 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, memset(inq_buff, 0, sizeof(InquiryData_struct)); log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, - inq_buff, sizeof(*inq_buff), - 0xC1, scsi3addr, TYPE_CMD); - else - return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff, - sizeof(*inq_buff), 0xC1, scsi3addr, - TYPE_CMD); + return_code = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buff, + sizeof(*inq_buff), 0xC1, scsi3addr, TYPE_CMD); if (return_code == IO_OK) { if (inq_buff->data_byte[8] == 0xFF) { printk(KERN_WARNING @@ -2725,7 +2707,7 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, } static void -cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, +cciss_read_capacity(int ctlr, int logvol, sector_t *total_size, unsigned int *block_size) { ReadCapdata_struct *buf; @@ -2739,14 +2721,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, } log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) - return_code = sendcmd_withirq(CCISS_READ_CAPACITY, - ctlr, buf, sizeof(ReadCapdata_struct), - 0, scsi3addr, TYPE_CMD); - else - return_code = sendcmd(CCISS_READ_CAPACITY, - ctlr, buf, sizeof(ReadCapdata_struct), - 0, scsi3addr, TYPE_CMD); + return_code = sendcmd_withirq(CCISS_READ_CAPACITY, ctlr, buf, + sizeof(ReadCapdata_struct), 0, scsi3addr, TYPE_CMD); if (return_code == IO_OK) { *total_size = be32_to_cpu(*(__be32 *) buf->total_size); *block_size = be32_to_cpu(*(__be32 *) buf->block_size); @@ -2758,8 +2734,8 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size, kfree(buf); } -static void -cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, unsigned int *block_size) +static void cciss_read_capacity_16(int ctlr, int logvol, + sector_t *total_size, unsigned int *block_size) { ReadCapdata_struct_16 *buf; int return_code; @@ -2772,16 +2748,9 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size, } log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol); - if (withirq) { - return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, - ctlr, buf, sizeof(ReadCapdata_struct_16), - 0, scsi3addr, TYPE_CMD); - } - else { - return_code = sendcmd(CCISS_READ_CAPACITY_16, - ctlr, buf, sizeof(ReadCapdata_struct_16), - 0, scsi3addr, TYPE_CMD); - } + return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16, + ctlr, buf, sizeof(ReadCapdata_struct_16), + 0, scsi3addr, TYPE_CMD); if (return_code == IO_OK) { *total_size = be64_to_cpu(*(__be64 *) buf->total_size); *block_size = be32_to_cpu(*(__be32 *) buf->block_size); @@ -2822,13 +2791,13 @@ static int cciss_revalidate(struct gendisk *disk) return 1; } if (h->cciss_read == CCISS_READ_10) { - cciss_read_capacity(h->ctlr, logvol, 1, + cciss_read_capacity(h->ctlr, logvol, &total_size, &block_size); } else { - cciss_read_capacity_16(h->ctlr, logvol, 1, + cciss_read_capacity_16(h->ctlr, logvol, &total_size, &block_size); } - cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, + cciss_geometry_inquiry(h->ctlr, logvol, total_size, block_size, inq_buff, drv); blk_queue_logical_block_size(drv->queue, drv->block_size); From 29009a036f2feb07d8a9b3c715a6365dddd82a7a Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:35 -0600 Subject: [PATCH 049/113] cciss: clean up code in cciss_shutdown cciss: clean up code in cciss_shutdown. Send the flush cache command down with interrupts still enabled, and do not do DMA from the stack. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 4321c94b552..f804542c1cf 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4376,30 +4376,28 @@ clean_no_release_regions: static void cciss_shutdown(struct pci_dev *pdev) { - ctlr_info_t *tmp_ptr; - int i; - char flush_buf[4]; + ctlr_info_t *h; + char *flush_buf; int return_code; - tmp_ptr = pci_get_drvdata(pdev); - if (tmp_ptr == NULL) + h = pci_get_drvdata(pdev); + flush_buf = kzalloc(4, GFP_KERNEL); + if (!flush_buf) { + printk(KERN_WARNING + "cciss:%d cache not flushed, out of memory.\n", + h->ctlr); return; - i = tmp_ptr->ctlr; - if (hba[i] == NULL) - return; - - /* Turn board interrupts off and send the flush cache command */ - /* sendcmd will turn off interrupt, and send the flush... - * To write all data in the battery backed cache to disks */ - memset(flush_buf, 0, 4); - return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0, - CTLR_LUNID, TYPE_CMD); - if (return_code == IO_OK) { - printk(KERN_INFO "Completed flushing cache on controller %d\n", i); - } else { - printk(KERN_WARNING "Error flushing cache on controller %d\n", i); } - free_irq(hba[i]->intr[2], hba[i]); + /* write all data in the battery backed cache to disk */ + memset(flush_buf, 0, 4); + return_code = sendcmd_withirq(CCISS_CACHE_FLUSH, h->ctlr, flush_buf, + 4, 0, CTLR_LUNID, TYPE_CMD); + kfree(flush_buf); + if (return_code != IO_OK) + printk(KERN_WARNING "cciss%d: Error flushing cache\n", + h->ctlr); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + free_irq(h->intr[2], h); } static void __devexit cciss_remove_one(struct pci_dev *pdev) From aa43f11147141fcd0e5f2fca45a4d71eab3fbe88 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:40 -0600 Subject: [PATCH 050/113] cciss: remove sendcmd() as it is no longer used. cciss: remove sendcmd() as it is no longer used. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 163 ------------------------------------------ 1 file changed, 163 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index f804542c1cf..23c2910aa7b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -190,8 +190,6 @@ static void cciss_geometry_inquiry(int ctlr, int logvol, static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *, __u32); static void start_io(ctlr_info_t *h); -static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, - __u8 page_code, unsigned char *scsi3addr, int cmd_type); static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size, __u8 page_code, unsigned char scsi3addr[], int cmd_type); @@ -2807,167 +2805,6 @@ static int cciss_revalidate(struct gendisk *disk) return 0; } -/* - * Wait polling for a command to complete. - * The memory mapped FIFO is polled for the completion. - * Used only at init time, interrupts from the HBA are disabled. - */ -static unsigned long pollcomplete(int ctlr) -{ - unsigned long done; - int i; - - /* Wait (up to 20 seconds) for a command to complete */ - - for (i = 20 * HZ; i > 0; i--) { - done = hba[ctlr]->access.command_completed(hba[ctlr]); - if (done == FIFO_EMPTY) - schedule_timeout_uninterruptible(1); - else - return done; - } - /* Invalid address to tell caller we ran out of time */ - return 1; -} - -/* Send command c to controller h and poll for it to complete. - * Turns interrupts off on the board. Used at driver init time - * and during SCSI error recovery. - */ -static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c) -{ - int i; - unsigned long complete; - int status = IO_ERROR; - u64bit buff_dma_handle; - -resend_cmd1: - - /* Disable interrupt on the board. */ - h->access.set_intr_mask(h, CCISS_INTR_OFF); - - /* Make sure there is room in the command FIFO */ - /* Actually it should be completely empty at this time */ - /* unless we are in here doing error handling for the scsi */ - /* tape side of the driver. */ - for (i = 200000; i > 0; i--) { - /* if fifo isn't full go */ - if (!(h->access.fifo_full(h))) - break; - udelay(10); - printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full," - " waiting!\n", h->ctlr); - } - h->access.submit_command(h, c); /* Send the cmd */ - do { - complete = pollcomplete(h->ctlr); - -#ifdef CCISS_DEBUG - printk(KERN_DEBUG "cciss: command completed\n"); -#endif /* CCISS_DEBUG */ - - if (complete == 1) { - printk(KERN_WARNING - "cciss cciss%d: SendCmd Timeout out, " - "No command list address returned!\n", h->ctlr); - status = IO_ERROR; - break; - } - - /* Make sure it's the command we're expecting. */ - if ((complete & ~CISS_ERROR_BIT) != c->busaddr) { - printk(KERN_WARNING "cciss%d: Unexpected command " - "completion.\n", h->ctlr); - continue; - } - - /* It is our command. If no error, we're done. */ - if (!(complete & CISS_ERROR_BIT)) { - status = IO_OK; - break; - } - - /* There is an error... */ - - /* if data overrun or underun on Report command ignore it */ - if (((c->Request.CDB[0] == CISS_REPORT_LOG) || - (c->Request.CDB[0] == CISS_REPORT_PHYS) || - (c->Request.CDB[0] == CISS_INQUIRY)) && - ((c->err_info->CommandStatus == CMD_DATA_OVERRUN) || - (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) { - complete = c->busaddr; - status = IO_OK; - break; - } - - if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) { - printk(KERN_WARNING "cciss%d: unsolicited abort %p\n", - h->ctlr, c); - if (c->retry_count < MAX_CMD_RETRIES) { - printk(KERN_WARNING "cciss%d: retrying %p\n", - h->ctlr, c); - c->retry_count++; - /* erase the old error information */ - memset(c->err_info, 0, sizeof(c->err_info)); - goto resend_cmd1; - } - printk(KERN_WARNING "cciss%d: retried %p too many " - "times\n", h->ctlr, c); - status = IO_ERROR; - break; - } - - if (c->err_info->CommandStatus == CMD_UNABORTABLE) { - printk(KERN_WARNING "cciss%d: command could not be " - "aborted.\n", h->ctlr); - status = IO_ERROR; - break; - } - - if (c->err_info->CommandStatus == CMD_TARGET_STATUS) { - status = check_target_status(h, c); - break; - } - - printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr); - printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n", - c->Request.CDB[0], c->err_info->CommandStatus); - status = IO_ERROR; - break; - - } while (1); - - /* unlock the data buffer from DMA */ - buff_dma_handle.val32.lower = c->SG[0].Addr.lower; - buff_dma_handle.val32.upper = c->SG[0].Addr.upper; - pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val, - c->SG[0].Len, PCI_DMA_BIDIRECTIONAL); - return status; -} - -/* - * Send a command to the controller, and wait for it to complete. - * Used at init time, and during SCSI error recovery. - */ -static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, - __u8 page_code, unsigned char *scsi3addr, int cmd_type) -{ - CommandList_struct *c; - int status; - - c = cmd_alloc(hba[ctlr], 1); - if (!c) { - printk(KERN_WARNING "cciss: unable to get memory"); - return IO_ERROR; - } - status = fill_cmd(c, cmd, ctlr, buff, size, page_code, - scsi3addr, cmd_type); - if (status == IO_OK) - status = sendcmd_core(hba[ctlr], c); - cmd_free(hba[ctlr], c, 1); - return status; -} - /* * Map (physical) PCI mem into (virtual) kernel space */ From b0e15f6db1110319cb2e747e59e1200450a5ba3e Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:45 -0600 Subject: [PATCH 051/113] cciss: fix typo that causes scsi status to be lost. cciss: fix typo that causes scsi status to be lost. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss_scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 3315268b4ec..237d2b35365 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -755,7 +755,7 @@ complete_scsi_command( CommandList_struct *cp, int timeout, __u32 tag) cp, ei->ScsiStatus); #endif - cmd->result |= (ei->ScsiStatus < 1); + cmd->result |= (ei->ScsiStatus << 1); } else { /* scsi status is zero??? How??? */ From d06dfbd236795acbb67e22e51bb8af12e953ced3 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:50 -0600 Subject: [PATCH 052/113] cciss: Remove unnecessary check in scan_thread cciss: Remove unnecessary check in scan_thread Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 23c2910aa7b..42eaddb5438 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3386,13 +3386,11 @@ static int scan_thread(void *data) h->busy_scanning = 1; mutex_unlock(&scan_mutex); - if (h) { - rebuild_lun_table(h, 0, 0); - complete_all(&h->scan_wait); - mutex_lock(&scan_mutex); - h->busy_scanning = 0; - mutex_unlock(&scan_mutex); - } + rebuild_lun_table(h, 0, 0); + complete_all(&h->scan_wait); + mutex_lock(&scan_mutex); + h->busy_scanning = 0; + mutex_unlock(&scan_mutex); } } From da0021841c3ea6a82588efae3260015847ea5d33 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:49:55 -0600 Subject: [PATCH 053/113] cciss: Do not automatically rescan on UNIT ATTENTION/LUN DATA CHANGED cciss: Do not automatically rescan on UNIT ATTENTION/LUN DATA CHANGED There are problems with doing this. If, say, several logical drives are deleted at once, several such UNIT ATTENTIONS will be encountered, often during the rescan triggered by the first such UNIT ATTENTION. The block layer may be in the midst of trying to add logical drives which were just deleted (resulting in the subsequent UNIT ATTENTION(s).) Making the rescan code robust enough to tolerate this kind of thing is too complicated for the moment. So, for now, we just don't do it. Note: This UNIT ATTENTION/LUN DATA CHANGED situation only occurs on the MSA2012. Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 42eaddb5438..bf2d1c80b78 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -3416,8 +3416,22 @@ static int check_for_unit_attention(ctlr_info_t *h, CommandList_struct *c) case REPORT_LUNS_CHANGED: printk(KERN_WARNING "cciss%d: report LUN data " "changed\n", h->ctlr); - add_to_scan_list(h); - wake_up_process(cciss_scan_thread); + /* + * Here, we could call add_to_scan_list and wake up the scan thread, + * except that it's quite likely that we will get more than one + * REPORT_LUNS_CHANGED condition in quick succession, which means + * that those which occur after the first one will likely happen + * *during* the scan_thread's rescan. And the rescan code is not + * robust enough to restart in the middle, undoing what it has already + * done, and it's not clear that it's even possible to do this, since + * part of what it does is notify the block layer, which starts + * doing it's own i/o to read partition tables and so on, and the + * driver doesn't have visibility to know what might need undoing. + * In any event, if possible, it is horribly complicated to get right + * so we just don't do it for now. + * + * Note: this REPORT_LUNS_CHANGED condition only occurs on the MSA2012. + */ return 1; break; case POWER_OR_RESET: From 5c07a311a80adb0138fc08e8279c60255d88d0b8 Mon Sep 17 00:00:00 2001 From: Don Brace Date: Thu, 12 Nov 2009 12:50:01 -0600 Subject: [PATCH 054/113] cciss: Add enhanced scatter-gather support. cciss: Add enhanced scatter-gather support. For controllers which supported, more than 512 scatter-gather elements per command may be used, and the max transfer size can be increased to 8192 blocks. Signed-off-by: Don Brace Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 183 ++++++++++++++++++++++++++++++++++---- drivers/block/cciss.h | 18 +++- drivers/block/cciss_cmd.h | 7 +- 3 files changed, 188 insertions(+), 20 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index bf2d1c80b78..1bd313dcf6a 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1655,9 +1655,11 @@ static void cciss_softirq_done(struct request *rq) { CommandList_struct *cmd = rq->completion_data; ctlr_info_t *h = hba[cmd->ctlr]; + SGDescriptor_struct *curr_sg = cmd->SG; unsigned long flags; u64bit temp64; int i, ddir; + int sg_index = 0; if (cmd->Request.Type.Direction == XFER_READ) ddir = PCI_DMA_FROMDEVICE; @@ -1667,9 +1669,22 @@ static void cciss_softirq_done(struct request *rq) /* command did not need to be retried */ /* unmap the DMA mapping for all the scatter gather elements */ for (i = 0; i < cmd->Header.SGList; i++) { - temp64.val32.lower = cmd->SG[i].Addr.lower; - temp64.val32.upper = cmd->SG[i].Addr.upper; - pci_unmap_page(h->pdev, temp64.val, cmd->SG[i].Len, ddir); + if (curr_sg[sg_index].Ext == CCISS_SG_CHAIN) { + temp64.val32.lower = cmd->SG[i].Addr.lower; + temp64.val32.upper = cmd->SG[i].Addr.upper; + pci_dma_sync_single_for_cpu(h->pdev, temp64.val, + cmd->SG[i].Len, ddir); + pci_unmap_single(h->pdev, temp64.val, + cmd->SG[i].Len, ddir); + /* Point to the next block */ + curr_sg = h->cmd_sg_list[cmd->cmdindex]->sgchain; + sg_index = 0; + } + temp64.val32.lower = curr_sg[sg_index].Addr.lower; + temp64.val32.upper = curr_sg[sg_index].Addr.upper; + pci_unmap_page(h->pdev, temp64.val, curr_sg[sg_index].Len, + ddir); + ++sg_index; } #ifdef CCISS_DEBUG @@ -1781,10 +1796,10 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); /* This is a hardware imposed limit. */ - blk_queue_max_hw_segments(disk->queue, MAXSGENTRIES); + blk_queue_max_hw_segments(disk->queue, h->maxsgentries); /* This is a limit in the driver and could be eliminated. */ - blk_queue_max_phys_segments(disk->queue, MAXSGENTRIES); + blk_queue_max_phys_segments(disk->queue, h->maxsgentries); blk_queue_max_sectors(disk->queue, h->cciss_max_sectors); @@ -3063,9 +3078,13 @@ static void do_cciss_request(struct request_queue *q) int seg; struct request *creq; u64bit temp64; - struct scatterlist tmp_sg[MAXSGENTRIES]; + struct scatterlist *tmp_sg; + SGDescriptor_struct *curr_sg; drive_info_struct *drv; int i, dir; + int nseg = 0; + int sg_index = 0; + int chained = 0; /* We call start_io here in case there is a command waiting on the * queue that has not been sent. @@ -3078,13 +3097,14 @@ static void do_cciss_request(struct request_queue *q) if (!creq) goto startio; - BUG_ON(creq->nr_phys_segments > MAXSGENTRIES); + BUG_ON(creq->nr_phys_segments > h->maxsgentries); if ((c = cmd_alloc(h, 1)) == NULL) goto full; blk_start_request(creq); + tmp_sg = h->scatter_list[c->cmdindex]; spin_unlock_irq(q->queue_lock); c->cmd_type = CMD_RWREQ; @@ -3113,7 +3133,7 @@ static void do_cciss_request(struct request_queue *q) (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); #endif /* CCISS_DEBUG */ - sg_init_table(tmp_sg, MAXSGENTRIES); + sg_init_table(tmp_sg, h->maxsgentries); seg = blk_rq_map_sg(q, creq, tmp_sg); /* get the DMA records for the setup */ @@ -3122,25 +3142,70 @@ static void do_cciss_request(struct request_queue *q) else dir = PCI_DMA_TODEVICE; + curr_sg = c->SG; + sg_index = 0; + chained = 0; + for (i = 0; i < seg; i++) { - c->SG[i].Len = tmp_sg[i].length; + if (((sg_index+1) == (h->max_cmd_sgentries)) && + !chained && ((seg - i) > 1)) { + nseg = seg - i; + curr_sg[sg_index].Len = (nseg) * + sizeof(SGDescriptor_struct); + curr_sg[sg_index].Ext = CCISS_SG_CHAIN; + + /* Point to next chain block. */ + curr_sg = h->cmd_sg_list[c->cmdindex]->sgchain; + sg_index = 0; + chained = 1; + } + curr_sg[sg_index].Len = tmp_sg[i].length; temp64.val = (__u64) pci_map_page(h->pdev, sg_page(&tmp_sg[i]), - tmp_sg[i].offset, - tmp_sg[i].length, dir); - c->SG[i].Addr.lower = temp64.val32.lower; - c->SG[i].Addr.upper = temp64.val32.upper; - c->SG[i].Ext = 0; // we are not chaining + tmp_sg[i].offset, + tmp_sg[i].length, dir); + curr_sg[sg_index].Addr.lower = temp64.val32.lower; + curr_sg[sg_index].Addr.upper = temp64.val32.upper; + curr_sg[sg_index].Ext = 0; /* we are not chaining */ + + ++sg_index; } + + if (chained) { + int len; + curr_sg = c->SG; + sg_index = h->max_cmd_sgentries - 1; + len = curr_sg[sg_index].Len; + /* Setup pointer to next chain block. + * Fill out last element in current chain + * block with address of next chain block. + */ + temp64.val = pci_map_single(h->pdev, + h->cmd_sg_list[c->cmdindex]->sgchain, + len, dir); + + h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val; + curr_sg[sg_index].Addr.lower = temp64.val32.lower; + curr_sg[sg_index].Addr.upper = temp64.val32.upper; + + pci_dma_sync_single_for_device(h->pdev, + h->cmd_sg_list[c->cmdindex]->sg_chain_dma, + len, dir); + } + /* track how many SG entries we are using */ if (seg > h->maxSG) h->maxSG = seg; #ifdef CCISS_DEBUG - printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n", - blk_rq_sectors(creq), seg); + printk(KERN_DEBUG "cciss: Submitting %ld sectors in %d segments " + "chained[%d]\n", + blk_rq_sectors(creq), seg, chained); #endif /* CCISS_DEBUG */ - c->Header.SGList = c->Header.SGTotal = seg; + c->Header.SGList = c->Header.SGTotal = seg + chained; + if (seg > h->max_cmd_sgentries) + c->Header.SGList = h->max_cmd_sgentries; + if (likely(blk_fs_request(creq))) { if(h->cciss_read == CCISS_READ_10) { c->Request.CDB[1] = 0; @@ -3713,6 +3778,23 @@ static int __devinit cciss_pci_init(ctlr_info_t *c, struct pci_dev *pdev) * leave a little room for ioctl calls. */ c->max_commands = readl(&(c->cfgtable->CmdsOutMax)); + c->maxsgentries = readl(&(c->cfgtable->MaxSGElements)); + + /* + * Limit native command to 32 s/g elements to save dma'able memory. + * Howvever spec says if 0, use 31 + */ + + c->max_cmd_sgentries = 31; + if (c->maxsgentries > 512) { + c->max_cmd_sgentries = 32; + c->chainsize = c->maxsgentries - c->max_cmd_sgentries + 1; + c->maxsgentries -= 1; /* account for chain pointer */ + } else { + c->maxsgentries = 31; /* Default to traditional value */ + c->chainsize = 0; /* traditional */ + } + c->product_name = products[prod_index].product_name; c->access = *(products[prod_index].access); c->nr_cmds = c->max_commands - 4; @@ -4039,6 +4121,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, { int i; int j = 0; + int k = 0; int rc; int dac, return_code; InquiryData_struct *inq_buff; @@ -4142,6 +4225,53 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, printk(KERN_ERR "cciss: out of memory"); goto clean4; } + + /* Need space for temp scatter list */ + hba[i]->scatter_list = kmalloc(hba[i]->max_commands * + sizeof(struct scatterlist *), + GFP_KERNEL); + for (k = 0; k < hba[i]->nr_cmds; k++) { + hba[i]->scatter_list[k] = kmalloc(sizeof(struct scatterlist) * + hba[i]->maxsgentries, + GFP_KERNEL); + if (hba[i]->scatter_list[k] == NULL) { + printk(KERN_ERR "cciss%d: could not allocate " + "s/g lists\n", i); + goto clean4; + } + } + hba[i]->cmd_sg_list = kmalloc(sizeof(struct Cmd_sg_list *) * + hba[i]->nr_cmds, + GFP_KERNEL); + if (!hba[i]->cmd_sg_list) { + printk(KERN_ERR "cciss%d: Cannot get memory for " + "s/g chaining.\n", i); + goto clean4; + } + /* Build up chain blocks for each command */ + if (hba[i]->chainsize > 0) { + for (j = 0; j < hba[i]->nr_cmds; j++) { + hba[i]->cmd_sg_list[j] = + kmalloc(sizeof(struct Cmd_sg_list), + GFP_KERNEL); + if (!hba[i]->cmd_sg_list[j]) { + printk(KERN_ERR "cciss%d: Cannot get memory " + "for chain block.\n", i); + goto clean4; + } + /* Need a block of chainsized s/g elements. */ + hba[i]->cmd_sg_list[j]->sgchain = + kmalloc((hba[i]->chainsize * + sizeof(SGDescriptor_struct)), + GFP_KERNEL); + if (!hba[i]->cmd_sg_list[j]->sgchain) { + printk(KERN_ERR "cciss%d: Cannot get memory " + "for s/g chains\n", i); + goto clean4; + } + } + } + spin_lock_init(&hba[i]->lock); /* Initialize the pdev driver private data. @@ -4187,7 +4317,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, cciss_procinit(i); - hba[i]->cciss_max_sectors = 2048; + hba[i]->cciss_max_sectors = 8192; rebuild_lun_table(hba[i], 1, 0); hba[i]->busy_initializing = 0; @@ -4195,6 +4325,15 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, clean4: kfree(hba[i]->cmd_pool_bits); + /* Free up sg elements */ + for (k = 0; k < hba[i]->nr_cmds; k++) + kfree(hba[i]->scatter_list[k]); + kfree(hba[i]->scatter_list); + for (j = 0; j < hba[i]->nr_cmds; j++) { + if (hba[i]->cmd_sg_list[j]) + kfree(hba[i]->cmd_sg_list[j]->sgchain); + kfree(hba[i]->cmd_sg_list[j]); + } if (hba[i]->cmd_pool) pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(CommandList_struct), @@ -4308,6 +4447,14 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev) pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct), hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle); kfree(hba[i]->cmd_pool_bits); + /* Free up sg elements */ + for (j = 0; j < hba[i]->nr_cmds; j++) + kfree(hba[i]->scatter_list[j]); + kfree(hba[i]->scatter_list); + for (j = 0; j < hba[i]->nr_cmds; j++) { + kfree(hba[i]->cmd_sg_list[j]->sgchain); + kfree(hba[i]->cmd_sg_list[j]); + } /* * Deliberately omit pci_disable_device(): it does something nasty to * Smart Array controllers that pci_enable_device does not undo diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 31524cf42c7..e5c63e579ff 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -55,7 +55,13 @@ typedef struct _drive_info_struct char device_initialized; /* indicates whether dev is initialized */ } drive_info_struct; -struct ctlr_info +struct Cmd_sg_list { + SGDescriptor_struct *sgchain; + dma64_addr_t sg_chain_dma; + int chain_block_size; +}; + +struct ctlr_info { int ctlr; char devname[8]; @@ -75,6 +81,16 @@ struct ctlr_info int num_luns; int highest_lun; int usage_count; /* number of opens all all minor devices */ + /* Need space for temp sg list + * number of scatter/gathers supported + * number of scatter/gathers in chained block + */ + struct scatterlist **scatter_list; + int maxsgentries; + int chainsize; + int max_cmd_sgentries; + struct Cmd_sg_list **cmd_sg_list; + # define DOORBELL_INT 0 # define PERF_MODE_INT 1 # define SIMPLE_MODE_INT 2 diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index dbaed1ea0da..b50a9b261b8 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -7,7 +7,8 @@ //general boundary defintions #define SENSEINFOBYTES 32//note that this value may vary between host implementations -#define MAXSGENTRIES 31 +#define MAXSGENTRIES 32 +#define CCISS_SG_CHAIN 0x80000000 #define MAXREPLYQS 256 //Command Status value @@ -319,6 +320,10 @@ typedef struct _CfgTable_struct { BYTE ServerName[16]; DWORD HeartBeat; DWORD SCSI_Prefetch; + DWORD MaxSGElements; + DWORD MaxLogicalUnits; + DWORD MaxPhysicalDrives; + DWORD MaxPhysicalDrivesPerLogicalUnit; } CfgTable_struct; #pragma pack() #endif // CCISS_CMD_H From 8721c81f6480e2c9acbf92078383953f825d1057 Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Thu, 12 Nov 2009 12:50:06 -0600 Subject: [PATCH 055/113] cciss: Fix weird usage of ENXIO in cciss_scsi.c cciss: Fix weird usage of ENXIO in cciss_scsi.c Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 7 ++----- drivers/block/cciss_scsi.c | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 1bd313dcf6a..eab81c6c0ca 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -422,12 +422,9 @@ cciss_proc_write(struct file *file, const char __user *buf, if (strncmp(ENGAGE_SCSI, buffer, sizeof ENGAGE_SCSI - 1) == 0) { struct seq_file *seq = file->private_data; ctlr_info_t *h = seq->private; - int rc; - rc = cciss_engage_scsi(h->ctlr); - if (rc != 0) - err = -rc; - else + err = cciss_engage_scsi(h->ctlr); + if (err == 0) err = length; } else #endif /* CONFIG_CISS_SCSI_TAPE */ diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index 237d2b35365..5d0e46dc363 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -1547,7 +1547,7 @@ cciss_engage_scsi(int ctlr) if (sa->registered) { printk("cciss%d: SCSI subsystem already engaged.\n", ctlr); spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); - return ENXIO; + return -ENXIO; } sa->registered = 1; spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags); From 7d13af3279985f554784a45cc961f706dbcdbdd1 Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Mon, 23 Nov 2009 09:29:13 +0100 Subject: [PATCH 056/113] partitions: use sector size for EFI GPT Currently, kernel uses strictly 512-byte sectors for EFI GPT parsing. That's wrong. UEFI standard (version 2.3, May 2009, 5.3.1 GUID Format overview, page 95) defines that LBA is always based on the logical block size. It means bdev_logical_block_size() (aka BLKSSZGET) for Linux. This patch removes static sector size from EFI GPT parser. The problem is reproducible with the latest GNU Parted: # modprobe scsi_debug dev_size_mb=50 sector_size=4096 # ./parted /dev/sdb print Model: Linux scsi_debug (scsi) Disk /dev/sdb: 52.4MB Sector size (logical/physical): 4096B/4096B Partition Table: gpt Number Start End Size File system Name Flags 1 24.6kB 3002kB 2978kB primary 2 3002kB 6001kB 2998kB primary 3 6001kB 9003kB 3002kB primary # blockdev --rereadpt /dev/sdb # dmesg | tail -1 sdb: unknown partition table <---- !!! with this patch: # blockdev --rereadpt /dev/sdb # dmesg | tail -1 sdb: sdb1 sdb2 sdb3 Signed-off-by: Karel Zak Signed-off-by: Jens Axboe --- fs/partitions/efi.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 038a6022152..80eeff5fdfe 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -1,7 +1,9 @@ /************************************************************ * EFI GUID Partition Table handling - * Per Intel EFI Specification v1.02 - * http://developer.intel.com/technology/efi/efi.htm + * + * http://www.uefi.org/specs/ + * http://www.intel.com/technology/efi/ + * * efi.[ch] by Matt Domsch * Copyright 2000,2001,2002,2004 Dell Inc. * @@ -92,6 +94,7 @@ * ************************************************************/ #include +#include #include "check.h" #include "efi.h" @@ -141,7 +144,8 @@ last_lba(struct block_device *bdev) { if (!bdev || !bdev->bd_inode) return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; + return div_u64(bdev->bd_inode->i_size, + bdev_logical_block_size(bdev)) - 1ULL; } static inline int @@ -188,6 +192,7 @@ static size_t read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) { size_t totalreadcount = 0; + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); if (!bdev || !buffer || lba > last_lba(bdev)) return 0; @@ -195,7 +200,7 @@ read_lba(struct block_device *bdev, u64 lba, u8 * buffer, size_t count) while (count) { int copied = 512; Sector sect; - unsigned char *data = read_dev_sector(bdev, lba++, §); + unsigned char *data = read_dev_sector(bdev, n++, §); if (!data) break; if (copied > count) @@ -601,6 +606,7 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; + unsigned ssz = bdev_logical_block_size(bdev) / 512; if (!find_valid_gpt(bdev, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -611,13 +617,14 @@ efi_partition(struct parsed_partitions *state, struct block_device *bdev) pr_debug("GUID Partition Table is valid! Yea!\n"); for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + u64 start = le64_to_cpu(ptes[i].starting_lba); + u64 size = le64_to_cpu(ptes[i].ending_lba) - + le64_to_cpu(ptes[i].starting_lba) + 1ULL; + if (!is_pte_valid(&ptes[i], last_lba(bdev))) continue; - put_partition(state, i+1, le64_to_cpu(ptes[i].starting_lba), - (le64_to_cpu(ptes[i].ending_lba) - - le64_to_cpu(ptes[i].starting_lba) + - 1ULL)); + put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ if (!efi_guidcmp(ptes[i].partition_type_guid, From 87038c2d5bda2418fda8b1456a0ae81cc3ff5bd8 Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Mon, 23 Nov 2009 09:29:58 +0100 Subject: [PATCH 057/113] partitions: read whole sector with EFI GPT header The size of EFI GPT header is not static, but whole sector is allocated for the header. The HeaderSize field must be greater than 92 (= sizeof(struct gpt_header) and must be less than or equal to the logical block size. It means we have to read whole sector with the header, because the header crc32 checksum is calculated according to HeaderSize. For more details see UEFI standard (version 2.3, May 2009): - 5.3.1 GUID Format overview, page 93 - Table 13. GUID Partition Table Header, page 96 Signed-off-by: Karel Zak Signed-off-by: Jens Axboe --- fs/partitions/efi.c | 7 ++++--- fs/partitions/efi.h | 8 ++++++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/partitions/efi.c b/fs/partitions/efi.c index 80eeff5fdfe..49cfd5f5423 100644 --- a/fs/partitions/efi.c +++ b/fs/partitions/efi.c @@ -262,15 +262,16 @@ static gpt_header * alloc_read_gpt_header(struct block_device *bdev, u64 lba) { gpt_header *gpt; + unsigned ssz = bdev_logical_block_size(bdev); + if (!bdev) return NULL; - gpt = kzalloc(sizeof (gpt_header), GFP_KERNEL); + gpt = kzalloc(ssz, GFP_KERNEL); if (!gpt) return NULL; - if (read_lba(bdev, lba, (u8 *) gpt, - sizeof (gpt_header)) < sizeof (gpt_header)) { + if (read_lba(bdev, lba, (u8 *) gpt, ssz) < ssz) { kfree(gpt); gpt=NULL; return NULL; diff --git a/fs/partitions/efi.h b/fs/partitions/efi.h index 2cc89d0475b..6998b589abf 100644 --- a/fs/partitions/efi.h +++ b/fs/partitions/efi.h @@ -37,7 +37,6 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE -#define GPT_BLOCK_SIZE 512 #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 @@ -79,7 +78,12 @@ typedef struct _gpt_header { __le32 num_partition_entries; __le32 sizeof_partition_entry; __le32 partition_entry_array_crc32; - u8 reserved2[GPT_BLOCK_SIZE - 92]; + + /* The rest of the logical block is reserved by UEFI and must be zero. + * EFI standard handles this by: + * + * uint8_t reserved2[ BlockSize - 92 ]; + */ } __attribute__ ((packed)) gpt_header; typedef struct _gpt_entry_attributes { From d61c42690c6e2ff093a3d01338dad49f35b1e27b Mon Sep 17 00:00:00 2001 From: "Stephen M. Cameron" Date: Mon, 23 Nov 2009 09:31:48 +0100 Subject: [PATCH 058/113] cciss: fix scatter gather cleanup problems On driver unload, only free up the extra scatter gather data if they were allocated in the first place (the controller supports it) and don't forget to free up the sg_cmd_list array of pointers. Signed-off-by: Don Brace Signed-off-by: Stephen M. Cameron Signed-off-by: Jens Axboe --- drivers/block/cciss.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index eab81c6c0ca..873e594860d 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4326,10 +4326,15 @@ clean4: for (k = 0; k < hba[i]->nr_cmds; k++) kfree(hba[i]->scatter_list[k]); kfree(hba[i]->scatter_list); - for (j = 0; j < hba[i]->nr_cmds; j++) { - if (hba[i]->cmd_sg_list[j]) - kfree(hba[i]->cmd_sg_list[j]->sgchain); - kfree(hba[i]->cmd_sg_list[j]); + /* Only free up extra s/g lists if controller supports them */ + if (hba[i]->chainsize > 0) { + for (j = 0; j < hba[i]->nr_cmds; j++) { + if (hba[i]->cmd_sg_list[j]) { + kfree(hba[i]->cmd_sg_list[j]->sgchain); + kfree(hba[i]->cmd_sg_list[j]); + } + } + kfree(hba[i]->cmd_sg_list); } if (hba[i]->cmd_pool) pci_free_consistent(hba[i]->pdev, @@ -4448,9 +4453,15 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev) for (j = 0; j < hba[i]->nr_cmds; j++) kfree(hba[i]->scatter_list[j]); kfree(hba[i]->scatter_list); - for (j = 0; j < hba[i]->nr_cmds; j++) { - kfree(hba[i]->cmd_sg_list[j]->sgchain); - kfree(hba[i]->cmd_sg_list[j]); + /* Only free up extra s/g lists if controller supports them */ + if (hba[i]->chainsize > 0) { + for (j = 0; j < hba[i]->nr_cmds; j++) { + if (hba[i]->cmd_sg_list[j]) { + kfree(hba[i]->cmd_sg_list[j]->sgchain); + kfree(hba[i]->cmd_sg_list[j]); + } + } + kfree(hba[i]->cmd_sg_list); } /* * Deliberately omit pci_disable_device(): it does something nasty to From 32a87c0114f37871aefb12a30de3e0c3300e3646 Mon Sep 17 00:00:00 2001 From: Alex Chiang Date: Mon, 23 Nov 2009 09:35:06 +0100 Subject: [PATCH 059/113] cciss: change Cmd_sg_list.sg_chain_dma type to dma_addr_t A recent commit broke the ia64 build: Author: Don Brace Date: Thu Nov 12 12:50:01 2009 -0600 cciss: Add enhanced scatter-gather support. because of this hunk: --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h +struct Cmd_sg_list { + SGDescriptor_struct *sgchain; + dma64_addr_t sg_chain_dma; + int chain_block_size; +}; The issue is that dma64_addr_t isn't #define'd on ia64. The way that we're using Cmd_sg_list.sg_chain_dma is to hold an address returned from pci_map_single(). + temp64.val = pci_map_single(h->pdev, + h->cmd_sg_list[c->cmdindex]->sgchain, + len, dir); + + h->cmd_sg_list[c->cmdindex]->sg_chain_dma = temp64.val; pci_map_single() returns a dma_addr_t too. This code will still work even on a 32-bit x86 build, where dma_addr_t is defined to be a u32 because it will simply be promoted to the __u64 that temp64.val is defined as. Thus, declaring Cmd_sg_list.sg_chain_dma as dma_addr_t is safe. Cc: Don Brace Cc: Stephen M. Cameron Signed-off-by: Alex Chiang Signed-off-by: Jens Axboe --- drivers/block/cciss.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index e5c63e579ff..1d95db25406 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -57,7 +57,7 @@ typedef struct _drive_info_struct struct Cmd_sg_list { SGDescriptor_struct *sgchain; - dma64_addr_t sg_chain_dma; + dma_addr_t sg_chain_dma; int chain_block_size; }; From 6c6c7951be7652f86109f2193651b78d90907c0d Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 16 Nov 2009 15:48:54 +0100 Subject: [PATCH 060/113] fix in-kernel configuration serialization this is uncritical, as we still also serialize in userland, but to correctly serialize on the CONFIG_PENDING bit, it must be wait_event(state_wait, \!test_and_set_bit) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_nl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e2a5875a07b..436a090b532 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -733,7 +733,7 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu */ static void drbd_reconfig_start(struct drbd_conf *mdev) { - wait_event(mdev->state_wait, test_and_set_bit(CONFIG_PENDING, &mdev->flags)); + wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags)); wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags)); drbd_thread_start(&mdev->worker); } From 0b33a9164aca6332bf4a117af5528dea9675d782 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 16 Nov 2009 15:58:04 +0100 Subject: [PATCH 061/113] add missing state change on corrupt packet header in drbd_recv_header Otherwise the 'state fixup' in the receiver will change to Unconnected, but the receiver will terminate itself, and any attempt at 'down'ing that drbd later will block forever. see also Bugz. #259 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 9bbc509443e..fb29a75053e 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -3499,8 +3499,10 @@ static void drbdd(struct drbd_conf *mdev) while (get_t_state(&mdev->receiver) == Running) { drbd_thread_current_set_cpu(mdev); - if (!drbd_recv_header(mdev, header)) + if (!drbd_recv_header(mdev, header)) { + drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)); break; + } if (header->command < P_MAX_CMD) handler = drbd_cmd_handler[header->command]; From d8c2a36b774defd4e230353d91f0f609c128bd78 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 18 Nov 2009 15:52:51 +0100 Subject: [PATCH 062/113] Fixed a regression in resync decission code drbd_uuid_compare() [Bugz 260] Since 8.3.3 we fail to do the resync when a partial resynch is not possible, but a full synch is necessary. This regression was introduced with 7101539930c0a89146959e7a39c09ad9c3516434 Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_receiver.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fb29a75053e..c548f24f54a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2400,6 +2400,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l *rule_nr = 80; + peer = mdev->p_uuid[UI_CURRENT] & ~((u64)1); for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) { self = mdev->ldev->md.uuid[i] & ~((u64)1); if (self == peer) From ad85dfe67bbf13d5fa20764e4ce801a1e6e526d8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 18 Nov 2009 15:52:51 +0100 Subject: [PATCH 063/113] DRBD: Now the code is 8.3.6 + 3 fixes (without compat crap) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 18942ad115d..99a4d76694e 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.5" +#define REL_VERSION "8.3.6" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 91 From 35a8a3fdcd4f973a5430e868f2f2a5c363803a5b Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 Nov 2009 17:50:00 +0100 Subject: [PATCH 064/113] drbd: moved CN_IDX_DRBD and CN_VAL_DRBD to the right file Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/connector.h | 2 ++ include/linux/drbd.h | 7 ------- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/connector.h b/include/linux/connector.h index 3a14615fd35..72ba63eb83c 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -43,6 +43,8 @@ #define CN_DST_VAL 0x1 #define CN_IDX_DM 0x7 /* Device Mapper */ #define CN_VAL_DM_USERSPACE_LOG 0x1 +#define CN_IDX_DRBD 0x8 +#define CN_VAL_DRBD 0x1 #define CN_NETLINK_USERS 8 diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 99a4d76694e..e84f4733cb5 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -322,13 +322,6 @@ enum drbd_timeout_flag { #define DRBD_NL_CREATE_DEVICE 0x01 #define DRBD_NL_SET_DEFAULTS 0x02 -/* The following line should be moved over to linux/connector.h - * when the time comes */ -#ifndef CN_IDX_DRBD -# define CN_IDX_DRBD 0x4 -/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */ -#endif -#define CN_VAL_DRBD 0x1 /* For searching a vacant cn_idx value */ #define CN_IDX_STEP 6977 From 3586e917f2c7df769d173c4ec99554cb40a911e5 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Thu, 26 Nov 2009 09:14:11 +0100 Subject: [PATCH 065/113] cfq: Make use of service count to estimate the rb_key offset For the moment, different workload cfq queues are put into different service trees. But CFQ still uses "busy_queues" to estimate rb_key offset when inserting a cfq queue into a service tree. I think this isn't appropriate, and it should make use of service tree count to do this estimation. This patch is for for-2.6.33 branch. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1bcbd8c7989..467981e19d7 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -600,11 +600,15 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static unsigned long cfq_slice_offset(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_rb_root *service_tree; + + service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); + /* * just an approximation, should be ok. */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); + return service_tree->count * (cfq_prio_slice(cfqd, 1, 0) - + cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } /* From 2d4dc890b5c8fabd818a8586607e6843c4375e62 Mon Sep 17 00:00:00 2001 From: Ilya Loginov Date: Thu, 26 Nov 2009 09:16:19 +0100 Subject: [PATCH 066/113] block: add helpers to run flush_dcache_page() against a bio and a request's pages Mtdblock driver doesn't call flush_dcache_page for pages in request. So, this causes problems on architectures where the icache doesn't fill from the dcache or with dcache aliases. The patch fixes this. The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid pointless empty cache-thrashing loops on architectures for which flush_dcache_page() is a no-op. Every architecture was provided with this flush pages on architectires where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is equal 1 or do nothing otherwise. See "fix mtd_blkdevs problem with caches on some architectures" discussion on LKML for more information. Signed-off-by: Ilya Loginov Cc: Ingo Molnar Cc: David Woodhouse Cc: Peter Horton Cc: "Ed L. Cashin" Signed-off-by: Jens Axboe --- arch/alpha/include/asm/cacheflush.h | 1 + arch/arm/include/asm/cacheflush.h | 1 + arch/avr32/include/asm/cacheflush.h | 1 + arch/blackfin/include/asm/cacheflush.h | 2 ++ arch/cris/include/asm/cacheflush.h | 1 + arch/frv/include/asm/cacheflush.h | 1 + arch/h8300/include/asm/cacheflush.h | 1 + arch/ia64/include/asm/cacheflush.h | 1 + arch/m32r/include/asm/cacheflush.h | 3 +++ arch/m68k/include/asm/cacheflush_mm.h | 1 + arch/m68k/include/asm/cacheflush_no.h | 1 + arch/microblaze/include/asm/cacheflush.h | 1 + arch/mips/include/asm/cacheflush.h | 1 + arch/mn10300/include/asm/cacheflush.h | 1 + arch/parisc/include/asm/cacheflush.h | 1 + arch/powerpc/include/asm/cacheflush.h | 1 + arch/s390/include/asm/cacheflush.h | 1 + arch/score/include/asm/cacheflush.h | 1 + arch/sh/include/asm/cacheflush.h | 1 + arch/sparc/include/asm/cacheflush_32.h | 1 + arch/sparc/include/asm/cacheflush_64.h | 1 + arch/x86/include/asm/cacheflush.h | 1 + arch/xtensa/include/asm/cacheflush.h | 1 + block/blk-core.c | 19 +++++++++++++++++++ drivers/mtd/mtd_blkdevs.c | 2 ++ fs/bio.c | 12 ++++++++++++ include/asm-generic/cacheflush.h | 1 + include/linux/bio.h | 12 ++++++++++++ include/linux/blkdev.h | 11 +++++++++++ 29 files changed, 83 insertions(+) diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index b686cc7fc44..01d71e1c8a9 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -9,6 +9,7 @@ #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index fd03fb63a33..247b7b0adc2 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -408,6 +408,7 @@ extern void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, * about to change to user space. This is the same method as used on SPARC64. * See update_mmu_cache for the user space part. */ +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); extern void __flush_dcache_page(struct address_space *mapping, struct page *page); diff --git a/arch/avr32/include/asm/cacheflush.h b/arch/avr32/include/asm/cacheflush.h index 670674749b2..96e53820bbb 100644 --- a/arch/avr32/include/asm/cacheflush.h +++ b/arch/avr32/include/asm/cacheflush.h @@ -107,6 +107,7 @@ extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); * do something here, but only for certain configurations. No such * configurations exist at this time. */ +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(page) do { } while (0) #define flush_dcache_mmap_unlock(page) do { } while (0) diff --git a/arch/blackfin/include/asm/cacheflush.h b/arch/blackfin/include/asm/cacheflush.h index af03a36c7a4..417eaac7fe9 100644 --- a/arch/blackfin/include/asm/cacheflush.h +++ b/arch/blackfin/include/asm/cacheflush.h @@ -68,9 +68,11 @@ do { memcpy(dst, src, len); \ #endif #if defined(CONFIG_BFIN_EXTMEM_WRITEBACK) || defined(CONFIG_BFIN_L2_WRITEBACK) # define flush_dcache_range(start,end) blackfin_dcache_flush_range((start), (end)) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 # define flush_dcache_page(page) blackfin_dflush_page(page_address(page)) #else # define flush_dcache_range(start,end) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 # define flush_dcache_page(page) do { } while (0) #endif diff --git a/arch/cris/include/asm/cacheflush.h b/arch/cris/include/asm/cacheflush.h index cf60e3f69f8..36795bca605 100644 --- a/arch/cris/include/asm/cacheflush.h +++ b/arch/cris/include/asm/cacheflush.h @@ -12,6 +12,7 @@ #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/frv/include/asm/cacheflush.h b/arch/frv/include/asm/cacheflush.h index 432a69e7f3d..edbac54ae01 100644 --- a/arch/frv/include/asm/cacheflush.h +++ b/arch/frv/include/asm/cacheflush.h @@ -47,6 +47,7 @@ static inline void __flush_cache_all(void) } /* dcache/icache coherency... */ +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #ifdef CONFIG_MMU extern void flush_dcache_page(struct page *page); #else diff --git a/arch/h8300/include/asm/cacheflush.h b/arch/h8300/include/asm/cacheflush.h index 5ffdca217b9..4cf2df20c1c 100644 --- a/arch/h8300/include/asm/cacheflush.h +++ b/arch/h8300/include/asm/cacheflush.h @@ -15,6 +15,7 @@ #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma,a,b) #define flush_cache_page(vma,p,pfn) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) #define flush_dcache_mmap_lock(mapping) #define flush_dcache_mmap_unlock(mapping) diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index c8ce2719fee..429eefc93ee 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -25,6 +25,7 @@ #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define flush_dcache_page(page) \ do { \ clear_bit(PG_arch_1, &(page)->flags); \ diff --git a/arch/m32r/include/asm/cacheflush.h b/arch/m32r/include/asm/cacheflush.h index 78587c95814..8e8e04516c3 100644 --- a/arch/m32r/include/asm/cacheflush.h +++ b/arch/m32r/include/asm/cacheflush.h @@ -12,6 +12,7 @@ extern void _flush_cache_copyback_all(void); #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) @@ -33,6 +34,7 @@ extern void smp_flush_cache_all(void); #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) @@ -46,6 +48,7 @@ extern void smp_flush_cache_all(void); #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 16bf375fdbe..73de7c89d8e 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -128,6 +128,7 @@ static inline void __flush_page_to_ram(void *vaddr) } } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define flush_dcache_page(page) __flush_page_to_ram(page_address(page)) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/m68k/include/asm/cacheflush_no.h b/arch/m68k/include/asm/cacheflush_no.h index c65f00a9455..89f195656be 100644 --- a/arch/m68k/include/asm/cacheflush_no.h +++ b/arch/m68k/include/asm/cacheflush_no.h @@ -12,6 +12,7 @@ #define flush_cache_range(vma, start, end) __flush_cache_all() #define flush_cache_page(vma, vmaddr) do { } while (0) #define flush_dcache_range(start,len) __flush_cache_all() +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h index f989d6aad64..088076e657b 100644 --- a/arch/microblaze/include/asm/cacheflush.h +++ b/arch/microblaze/include/asm/cacheflush.h @@ -37,6 +37,7 @@ #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) #define flush_dcache_range(start, end) __invalidate_dcache_range(start, end) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 03b1d69b142..40bb9fde205 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -38,6 +38,7 @@ extern void (*flush_cache_range)(struct vm_area_struct *vma, extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); extern void __flush_dcache_page(struct page *page); +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_page(struct page *page) { if (cpu_has_dc_aliases || !cpu_has_ic_fills_f_dc) diff --git a/arch/mn10300/include/asm/cacheflush.h b/arch/mn10300/include/asm/cacheflush.h index 1a55d61f0d0..29e692f7f03 100644 --- a/arch/mn10300/include/asm/cacheflush.h +++ b/arch/mn10300/include/asm/cacheflush.h @@ -26,6 +26,7 @@ #define flush_cache_page(vma, vmaddr, pfn) do {} while (0) #define flush_cache_vmap(start, end) do {} while (0) #define flush_cache_vunmap(start, end) do {} while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do {} while (0) #define flush_dcache_mmap_lock(mapping) do {} while (0) #define flush_dcache_mmap_unlock(mapping) do {} while (0) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 724395143f2..7a73b615c23 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -42,6 +42,7 @@ void flush_cache_mm(struct mm_struct *mm); #define flush_cache_vmap(start, end) flush_cache_all() #define flush_cache_vunmap(start, end) flush_cache_all() +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) \ diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index ba667a383b8..ab9e402518e 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -25,6 +25,7 @@ #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/s390/include/asm/cacheflush.h b/arch/s390/include/asm/cacheflush.h index 49d5af916d0..405cc97c624 100644 --- a/arch/s390/include/asm/cacheflush.h +++ b/arch/s390/include/asm/cacheflush.h @@ -10,6 +10,7 @@ #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/score/include/asm/cacheflush.h b/arch/score/include/asm/cacheflush.h index 07cc8fc457c..caaba24036e 100644 --- a/arch/score/include/asm/cacheflush.h +++ b/arch/score/include/asm/cacheflush.h @@ -16,6 +16,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end); extern void flush_dcache_range(unsigned long start, unsigned long end); #define flush_cache_dup_mm(mm) do {} while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do {} while (0) #define flush_dcache_mmap_lock(mapping) do {} while (0) #define flush_dcache_mmap_unlock(mapping) do {} while (0) diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index c29918f3c81..dda96eb3e7c 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -42,6 +42,7 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); extern void flush_icache_range(unsigned long start, unsigned long end); extern void flush_icache_page(struct vm_area_struct *vma, diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index 68ac1091027..2e468773f25 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -75,6 +75,7 @@ BTFIXUPDEF_CALL(void, flush_sig_insns, struct mm_struct *, unsigned long) extern void sparc_flush_page_to_ram(struct page *page); +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define flush_dcache_page(page) sparc_flush_page_to_ram(page) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index c43321729b3..b95384033e8 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -37,6 +37,7 @@ extern void flush_dcache_page_all(struct mm_struct *mm, struct page *page); #endif extern void __flush_dcache_range(unsigned long start, unsigned long end); +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); #define flush_icache_page(vma, pg) do { } while(0) diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index b54f6afe7ec..9076add593a 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { } static inline void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn) { } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 static inline void flush_dcache_page(struct page *page) { } static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index b7b8fbe47c7..a508f2f73bd 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -101,6 +101,7 @@ static inline void __invalidate_icache_page_alias(unsigned long virt, #define flush_cache_vmap(start,end) flush_cache_all() #define flush_cache_vunmap(start,end) flush_cache_all() +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page*); extern void flush_cache_range(struct vm_area_struct*, ulong, ulong); extern void flush_cache_page(struct vm_area_struct*, unsigned long, unsigned long); diff --git a/block/blk-core.c b/block/blk-core.c index 71da5111120..718897e6d37 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2358,6 +2358,25 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->rq_disk = bio->bi_bdev->bd_disk; } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +/** + * rq_flush_dcache_pages - Helper function to flush all pages in a request + * @rq: the request to be flushed + * + * Description: + * Flush all pages in @rq. + */ +void rq_flush_dcache_pages(struct request *rq) +{ + struct req_iterator iter; + struct bio_vec *bvec; + + rq_for_each_segment(bvec, rq, iter) + flush_dcache_page(bvec->bv_page); +} +EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); +#endif + /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 8ca17a3e96e..64e2b379a35 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -59,12 +59,14 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->readsect(dev, block, buf)) return -EIO; + rq_flush_dcache_pages(req); return 0; case WRITE: if (!tr->writesect) return -EIO; + rq_flush_dcache_pages(req); for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->writesect(dev, block, buf)) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index 12da5db8682..e23a63f4f7d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1393,6 +1393,18 @@ void bio_check_pages_dirty(struct bio *bio) } } +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +void bio_flush_dcache_pages(struct bio *bi) +{ + int i; + struct bio_vec *bvec; + + bio_for_each_segment(bvec, bi, i) + flush_dcache_page(bvec->bv_page); +} +EXPORT_SYMBOL(bio_flush_dcache_pages); +#endif + /** * bio_endio - end I/O on a bio * @bio: bio diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index ba4ec39a113..57b5c3c82e8 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -13,6 +13,7 @@ #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/include/linux/bio.h b/include/linux/bio.h index 474792b825d..7fc5606e6ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -391,6 +391,18 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, gfp_t, int); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); + +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" +#endif +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +extern void bio_flush_dcache_pages(struct bio *bi); +#else +static inline void bio_flush_dcache_pages(struct bio *bi) +{ +} +#endif + extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1cc02972fbe..e727f6c44c4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -752,6 +752,17 @@ struct req_iterator { #define rq_iter_last(rq, _iter) \ (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" +#endif +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +extern void rq_flush_dcache_pages(struct request *rq); +#else +static inline void rq_flush_dcache_pages(struct request *rq) +{ +} +#endif + extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern void register_disk(struct gendisk *dev); From c16632bab1a17e357cec66920ceb3f0630009360 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 09:41:21 +0100 Subject: [PATCH 067/113] cfq-iosched: cleanup unreachable code cfq_should_idle returns false for no-idle queues that are not the last, so the control flow will never reach the removed code in a state that satisfies the if condition. The unreachable code was added to emulate previous cfq behaviour for non-NCQ rotational devices. My tests show that even without it, the performances and fairness are comparable with previous cfq, thanks to the fact that all seeky queues are grouped together, and that we idle at the end of the tree. Signed-off-by: Corrado Zoccolo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 467981e19d7..c2ef5d17608 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1269,19 +1269,6 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) cfq_mark_cfqq_wait_request(cfqq); sl = cfqd->cfq_slice_idle; - /* are we servicing noidle tree, and there are more queues? - * non-rotational or NCQ: no idle - * non-NCQ rotational : very small idle, to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. - */ - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && - service_tree_for(cfqd->serving_prio, SYNC_NOIDLE_WORKLOAD, cfqd) - ->count > 0) { - if (blk_queue_nonrot(cfqd->queue) || cfqd->hw_tag) - return; - sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); - } mod_timer(&cfqd->idle_slice_timer, jiffies + sl); cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); From d9449ce35a1e8fb58dd2d419f9215562a14ecca0 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 26 Nov 2009 09:45:40 +0100 Subject: [PATCH 068/113] Fix regression in direct writes performance due to WRITE_ODIRECT flag removal There seems to be a regression in direct write path due to following commit in for-2.6.33 branch of block tree. commit 1af60fbd759d31f565552fea315c2033947cfbe6 Author: Jeff Moyer Date: Fri Oct 2 18:56:53 2009 -0400 block: get rid of the WRITE_ODIRECT flag Marking direct writes as WRITE_SYNC_PLUG instead of WRITE_ODIRECT, sets the NOIDLE flag in bio and hence in request. This tells CFQ to not expect more request from the queue and not idle on it (despite the fact that queue's think time is less and it is not seeky). So direct writers lose big time when competing with sequential readers. Using fio, I have run one direct writer and two sequential readers and following are the results with 2.6.32-rc7 kernel and with for-2.6.33 branch. Test ==== 1 direct writer and 2 sequential reader running simultaneously. [global] directory=/mnt/sdc/fio/ runtime=10 [seqwrite] rw=write size=4G direct=1 [seqread] rw=read size=2G numjobs=2 2.6.32-rc7 ========== direct writes: aggrb=2,968KB/s readers : aggrb=101MB/s for-2.6.33 branch ================= direct write: aggrb=19KB/s readers aggrb=137MB/s This patch brings back the WRITE_ODIRECT flag, with the difference that we don't set the BIO_RW_UNPLUG flag so that device is not unplugged after submission of request and an explicit unplug from submitter is required. That way we fix the jeff's issue of not enough merging taking place in aio path as well as make sure direct writes get their fair share. After the fix ============= for-2.6.33 + fix ---------------- direct writes: aggrb=2,728KB/s reads: aggrb=103MB/s Thanks Vivek Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- fs/direct-io.c | 2 +- include/linux/fs.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 3af761c8c5c..b912270942f 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1124,7 +1124,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, int acquire_i_mutex = 0; if (rw & WRITE) - rw = WRITE_SYNC_PLUG; + rw = WRITE_ODIRECT_PLUG; if (bdev) bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); diff --git a/include/linux/fs.h b/include/linux/fs.h index 2f5fca4147c..79cea805173 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -129,6 +129,7 @@ struct inodes_stat_t { * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device * immediately after submission. The write equivalent * of READ_SYNC. + * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. * SWRITE_SYNC * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. * See SWRITE. @@ -150,6 +151,7 @@ struct inodes_stat_t { #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) +#define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) From e459dd08f45d2aa68abb0c02f8ab045cf8a598b8 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:57 +0100 Subject: [PATCH 069/113] cfq-iosched: fix ncq detection code CFQ's detection of queueing devices initially assumes a queuing device and detects if the queue depth reaches a certain threshold. However, it will reconsider this choice periodically. Unfortunately, if device is considered not queuing, CFQ will force a unit queue depth for some workloads, thus defeating the detection logic. This leads to poor performance on queuing hardware, since the idle window remains enabled. Given this premise, switching to hw_tag = 0 after we have proved at least once that the device is NCQ capable is not a good choice. The new detection code starts in an indeterminate state, in which CFQ behaves as if hw_tag = 1, and then, if for a long observation period we never saw large depth, we switch to hw_tag = 0, otherwise we stick to hw_tag = 1, without reconsidering it again. Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index c2ef5d17608..47abd24617b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -191,8 +191,14 @@ struct cfq_data { */ int rq_queued; int hw_tag; - int hw_tag_samples; - int rq_in_driver_peak; + /* + * hw_tag can be + * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) + * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) + * 0 => no NCQ + */ + int hw_tag_est_depth; + unsigned int hw_tag_samples; /* * idle window management @@ -2518,8 +2524,11 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; - if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = rq_in_driver(cfqd); + if (rq_in_driver(cfqd) > cfqd->hw_tag_est_depth) + cfqd->hw_tag_est_depth = rq_in_driver(cfqd); + + if (cfqd->hw_tag == 1) + return; if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) @@ -2538,13 +2547,10 @@ static void cfq_update_hw_tag(struct cfq_data *cfqd) if (cfqd->hw_tag_samples++ < 50) return; - if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) + if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) cfqd->hw_tag = 1; else cfqd->hw_tag = 0; - - cfqd->hw_tag_samples = 0; - cfqd->rq_in_driver_peak = 0; } static void cfq_completed_request(struct request_queue *q, struct request *rq) @@ -2951,7 +2957,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_latency = 1; - cfqd->hw_tag = 1; + cfqd->hw_tag = -1; cfqd->last_end_sync_rq = jiffies; return cfqd; } From e4a229196a7c676514c78f6783f8994f64bf681c Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:58 +0100 Subject: [PATCH 070/113] cfq-iosched: fix no-idle preemption logic An incoming no-idle queue should preempt the active no-idle queue only if the active queue is idling due to service tree empty. Previous code was buggy in two ways: * it relied on service_tree field to be set on the active queue, while it is not set when the code is idling for a new request * it didn't check for the service tree empty condition, so could lead to LIFO behaviour if multiple queues with depth > 1 were preempting each other on an non-NCQ device. Reported-by: Vivek Goyal Signed-off-by: Corrado Zoccolo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 47abd24617b..2c1086acddf 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2392,8 +2392,9 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD - && new_cfqq->service_tree == cfqq->service_tree) + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && + new_cfqq->service_tree->count == 1) return true; /* From 76280aff1c7e9ae761cac4b48591c43cd7d69159 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:58 +0100 Subject: [PATCH 071/113] cfq-iosched: idling on deep seeky sync queues Seeky sync queues with large depth can gain unfairly big share of disk time, at the expense of other seeky queues. This patch ensures that idling will be enabled for queues with I/O depth at least 4, and small think time. The decision to enable idling is sticky, until an idle window times out without seeing a new request. The reasoning behind the decision is that, if an application is using large I/O depth, it is already optimized to make full utilization of the hardware, and therefore we reserve a slice of exclusive use for it. Reported-by: Vivek Goyal Signed-off-by: Corrado Zoccolo Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 2c1086acddf..15f7238f527 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -260,6 +260,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ }; #define CFQ_CFQQ_FNS(name) \ @@ -286,6 +287,7 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); +CFQ_CFQQ_FNS(deep); #undef CFQ_CFQQ_FNS #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ @@ -2350,8 +2352,12 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); + if (cfqq->queued[0] + cfqq->queued[1] >= 4) + cfq_mark_cfqq_deep(cfqq); + if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (sample_valid(cfqq->seek_samples) && CFQQ_SEEKY(cfqq))) + (!cfq_cfqq_deep(cfqq) && sample_valid(cfqq->seek_samples) + && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { if (cic->ttime_mean > cfqd->cfq_slice_idle) @@ -2849,6 +2855,11 @@ static void cfq_idle_slice_timer(unsigned long data) */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto out_kick; + + /* + * Queue depth flag is reset only when the idle didn't succeed + */ + cfq_clear_cfqq_deep(cfqq); } expire: cfq_slice_expired(cfqd, timed_out); From 8e550632cccae34e265cb066691945515eaa7fb5 Mon Sep 17 00:00:00 2001 From: Corrado Zoccolo Date: Thu, 26 Nov 2009 10:02:58 +0100 Subject: [PATCH 072/113] cfq-iosched: fix corner cases in idling logic Idling logic was disabled in some corner cases, leading to unfair share for noidle queues. * the idle timer was not armed if there were other requests in the driver. unfortunately, those requests could come from other workloads, or queues for which we don't enable idling. So we will check only pending requests from the active queue * rq_noidle check on no-idle queue could disable the end of tree idle if the last completed request was rq_noidle. Now, we will disable that idle only if all the queues served in the no-idle tree had rq_noidle requests. Reported-by: Vivek Goyal Signed-off-by: Corrado Zoccolo Acked-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 15f7238f527..a5de31f76d3 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -172,6 +172,7 @@ struct cfq_data { enum wl_prio_t serving_prio; enum wl_type_t serving_type; unsigned long workload_expires; + bool noidle_tree_requires_idle; /* * Each priority tree is sorted by next_request position. These @@ -1253,9 +1254,9 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd) return; /* - * still requests with the driver, don't idle + * still active requests from this queue, don't idle */ - if (rq_in_driver(cfqd)) + if (cfqq->dispatched) return; /* @@ -1478,6 +1479,7 @@ static void choose_service_tree(struct cfq_data *cfqd) slice = max_t(unsigned, slice, CFQ_MIN_TT); cfqd->workload_expires = jiffies + slice; + cfqd->noidle_tree_requires_idle = false; } /* @@ -2597,17 +2599,27 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_clear_cfqq_slice_new(cfqq); } /* - * If there are no requests waiting in this queue, and - * there are other queues ready to issue requests, AND - * those other queues are issuing requests within our - * mean seek distance, give them a chance to run instead - * of idling. + * Idling is not enabled on: + * - expired queues + * - idle-priority queues + * - async queues + * - queues with still some requests queued + * - when there is a close cooperator */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq) && - sync && !rq_noidle(rq)) - cfq_arm_slice_timer(cfqd); + else if (sync && cfqq_empty && + !cfq_close_cooperator(cfqd, cfqq)) { + cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); + /* + * Idling is enabled for SYNC_WORKLOAD. + * SYNC_NOIDLE_WORKLOAD idles at the end of the tree + * only if we processed at least one !rq_noidle request + */ + if (cfqd->serving_type == SYNC_WORKLOAD + || cfqd->noidle_tree_requires_idle) + cfq_arm_slice_timer(cfqd); + } } if (!rq_in_driver(cfqd)) From 464191c65b85a8ec68a6e1a6293af625287c807e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 30 Nov 2009 09:38:13 +0100 Subject: [PATCH 073/113] Revert "cfq: Make use of service count to estimate the rb_key offset" This reverts commit 3586e917f2c7df769d173c4ec99554cb40a911e5. Corrado Zoccolo correctly points out, that we need consistency of rb_key offset across groups. This means we cannot properly use the per-service_tree service count. Revert this change. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a5de31f76d3..71446497d7b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -609,15 +609,11 @@ cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static unsigned long cfq_slice_offset(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - struct cfq_rb_root *service_tree; - - service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); - /* * just an approximation, should be ok. */ - return service_tree->count * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); + return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - + cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } /* From 98262f2762f0067375f83824d81ea929e37e6bfe Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 3 Dec 2009 09:24:48 +0100 Subject: [PATCH 074/113] block: Allow devices to indicate whether discarded blocks are zeroed The discard ioctl is used by mkfs utilities to clear a block device prior to putting metadata down. However, not all devices return zeroed blocks after a discard. Some drives return stale data, potentially containing old superblocks. It is therefore important to know whether discarded blocks are properly zeroed. Both ATA and SCSI drives have configuration bits that indicate whether zeroes are returned after a discard operation. Implement a block level interface that allows this information to be bubbled up the stack and queried via a new block device ioctl. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 2 ++ block/blk-sysfs.c | 11 +++++++++++ block/compat_ioctl.c | 2 ++ block/ioctl.c | 2 ++ include/linux/blkdev.h | 14 ++++++++++++++ include/linux/fs.h | 1 + 6 files changed, 32 insertions(+) diff --git a/block/blk-settings.c b/block/blk-settings.c index 1ebc1fdb914..dd1f1e0e196 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -101,6 +101,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->discard_granularity = 0; lim->discard_alignment = 0; lim->discard_misaligned = 0; + lim->discard_zeroes_data = -1; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -544,6 +545,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->no_cluster |= b->no_cluster; + t->discard_zeroes_data &= b->discard_zeroes_data; /* Bottom device offset aligned? */ if (offset && diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3147145edc1..8606c9543fd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -136,6 +136,11 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page) return queue_var_show(q->limits.max_discard_sectors << 9, page); } +static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_discard_zeroes_data(q), page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -313,6 +318,11 @@ static struct queue_sysfs_entry queue_discard_max_entry = { .show = queue_discard_max_show, }; +static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { + .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO }, + .show = queue_discard_zeroes_data_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -350,6 +360,7 @@ static struct attribute *default_attrs[] = { &queue_io_opt_entry.attr, &queue_discard_granularity_entry.attr, &queue_discard_max_entry.attr, + &queue_discard_zeroes_data_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 9bd086c1a4d..4eb8e9ea4af 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return compat_put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKFLSBUF: case BLKROSET: case BLKDISCARD: diff --git a/block/ioctl.c b/block/ioctl.c index 1f4d1de12b0..be48ea51fae 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -280,6 +280,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKSECTGET: return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e727f6c44c4..784a919aa0d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -322,6 +322,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char no_cluster; + signed char discard_zeroes_data; }; struct request_queue @@ -1150,6 +1151,19 @@ static inline int queue_sector_discard_alignment(struct request_queue *q, & (q->limits.discard_granularity - 1); } +static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) +{ + if (q->limits.discard_zeroes_data == 1) + return 1; + + return 0; +} + +static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) +{ + return queue_discard_zeroes_data(bdev_get_queue(bdev)); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/fs.h b/include/linux/fs.h index 79cea805173..891f7d642e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -304,6 +304,7 @@ struct inodes_stat_t { #define BLKIOOPT _IO(0x12,121) #define BLKALIGNOFF _IO(0x12,122) #define BLKPBSZGET _IO(0x12,123) +#define BLKDISCARDZEROES _IO(0x12,124) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ From 474b18ccc264c472abeec50f48469b6477202699 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 3 Dec 2009 12:58:05 +0100 Subject: [PATCH 075/113] cfq-iosched: no dispatch limit for single queue Since commit 2f5cb7381b737e24c8046fd4aeab571fb71315f5, each queue can send up to 4 * 4 requests if only one queue exists. I wonder why we have such limit. Device supports tag can send more requests. For example, AHCI can send 31 requests. Test (direct aio randread) shows the limits reduce about 4% disk thoughput. On the other hand, since we send one request one time, if other queue pop when current is sending more than cfq_quantum requests, current queue will stop send requests soon after one request, so sounds there is no big latency. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 71446497d7b..f5b59e18ebd 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1618,9 +1618,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) return false; /* - * Sole queue user, allow bigger slice + * Sole queue user, no limit */ - max_dispatch *= 4; + max_dispatch = -1; } /* From bf7ec5bb6114b2f086e536e24486fdacd1c0d339 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 3 Dec 2009 13:49:43 +0100 Subject: [PATCH 076/113] flusher: Fix PF_FROZEN race To touch task->flags directly is racy. thaw_process() still has race (changing non_current->flags, but this is another issue) though, I think it's much better off. So, use thaw_process() instead. Signed-off-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 67a33a5a1a9..0e8ca034770 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -609,7 +609,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) * it would never exet if it is currently stuck in the refrigerator. */ list_for_each_entry(wb, &bdi->wb_list, list) { - wb->task->flags &= ~PF_FROZEN; + thaw_process(wb->task); kthread_stop(wb->task); } } From 951c30d135390a108f102b0f6e3cfa6241f2a1aa Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 3 Dec 2009 13:54:25 +0100 Subject: [PATCH 077/113] writeback: remove the always false bdi_cap_writeback_dirty() test This is dead code because no bdi flush thread will be started for !bdi_cap_writeback_dirty bdi. Signed-off-by: Wu Fengguang Cc: Jens Axboe Cc: Trond Myklebust Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9d5360c4c2a..0306c8e7d6b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -614,7 +614,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { struct super_block *sb = wbc->sb, *pin_sb = NULL; - const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ spin_lock(&inode_lock); @@ -635,23 +634,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, continue; } - if (!bdi_cap_writeback_dirty(wb->bdi)) { - redirty_tail(inode); - if (is_blkdev_sb) { - /* - * Dirty memory-backed blockdev: the ramdisk - * driver does this. Skip just this inode - */ - continue; - } - /* - * Dirty memory-backed inode against a filesystem other - * than the kernel-internal bdev filesystem. Skip the - * entire superblock. - */ - break; - } - if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; From b17621fed6aa039387e35f9b4d34d98f213e5673 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 3 Dec 2009 13:54:25 +0100 Subject: [PATCH 078/113] writeback: introduce wbc.for_background It will lower the flush priority for NFS, and maybe more in future. Signed-off-by: Wu Fengguang Cc: Trond Myklebust Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 1 + fs/nfs/write.c | 2 +- include/linux/writeback.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0306c8e7d6b..0793961f769 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -738,6 +738,7 @@ static long wb_writeback(struct bdi_writeback *wb, .sync_mode = args->sync_mode, .older_than_this = NULL, .for_kupdate = args->for_kupdate, + .for_background = args->for_background, .range_cyclic = args->range_cyclic, }; unsigned long oldest_jif; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53eb26c16b5..c84b5cc1a94 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -178,7 +178,7 @@ static int wb_priority(struct writeback_control *wbc) { if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; - if (wbc->for_kupdate) + if (wbc->for_kupdate || wbc->for_background) return FLUSH_LOWPRI; return 0; } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 66ebddcff66..705f01fe413 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -49,6 +49,7 @@ struct writeback_control { unsigned nonblocking:1; /* Don't get stuck on request queues */ unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ + unsigned for_background:1; /* A background writeback */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ From 0d99519efef15fd0cf84a849492c7b1deee1e4b7 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 3 Dec 2009 13:54:25 +0100 Subject: [PATCH 079/113] writeback: remove unused nonblocking and congestion checks - no one is calling wb_writeback and write_cache_pages with wbc.nonblocking=1 any more - lumpy pageout will want to do nonblocking writeback without the congestion wait So remove the congestion checks as suggested by Chris. Signed-off-by: Wu Fengguang Cc: Chris Mason Cc: Jens Axboe Cc: Trond Myklebust Cc: Christoph Hellwig Cc: Dave Chinner Cc: Evgeniy Polyakov Cc: Alex Elder Signed-off-by: Jens Axboe --- drivers/staging/pohmelfs/inode.c | 10 ---------- fs/fs-writeback.c | 9 --------- fs/xfs/linux-2.6/xfs_aops.c | 9 +-------- mm/page-writeback.c | 12 ------------ 4 files changed, 1 insertion(+), 39 deletions(-) diff --git a/drivers/staging/pohmelfs/inode.c b/drivers/staging/pohmelfs/inode.c index c94de313922..f69b7783027 100644 --- a/drivers/staging/pohmelfs/inode.c +++ b/drivers/staging/pohmelfs/inode.c @@ -143,7 +143,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c struct inode *inode = mapping->host; struct pohmelfs_inode *pi = POHMELFS_I(inode); struct pohmelfs_sb *psb = POHMELFS_SB(inode->i_sb); - struct backing_dev_info *bdi = mapping->backing_dev_info; int err = 0; int done = 0; int nr_pages; @@ -152,11 +151,6 @@ static int pohmelfs_writepages(struct address_space *mapping, struct writeback_c int scanned = 0; int range_whole = 0; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -248,10 +242,6 @@ retry: if (wbc->nr_to_write <= 0) done = 1; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - } continue; out_continue: diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 0793961f769..49bc1b8e8f1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -639,14 +639,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, continue; } - if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { - wbc->encountered_congestion = 1; - if (!is_blkdev_sb) - break; /* Skip a congested fs */ - requeue_io(inode); - continue; /* Skip a congested blockdev */ - } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -770,7 +762,6 @@ static long wb_writeback(struct bdi_writeback *wb, break; wbc.more_io = 0; - wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; writeback_inodes_wb(wb, &wbc); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c2e30eea74d..70f989895d1 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -904,16 +904,9 @@ xfs_convert_page( if (startio) { if (count) { - struct backing_dev_info *bdi; - - bdi = inode->i_mapping->backing_dev_info; wbc->nr_to_write--; - if (bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; + if (wbc->nr_to_write <= 0) done = 1; - } else if (wbc->nr_to_write <= 0) { - done = 1; - } } xfs_start_page_writeback(page, !page_dirty, count); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2c5d79236ea..0b19943ecf8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -821,7 +821,6 @@ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; struct pagevec pvec; @@ -834,11 +833,6 @@ int write_cache_pages(struct address_space *mapping, int range_whole = 0; long nr_to_write = wbc->nr_to_write; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - pagevec_init(&pvec, 0); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ @@ -957,12 +951,6 @@ continue_unlock: break; } } - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - done = 1; - break; - } } pagevec_release(&pvec); cond_resched(); From 753c89130c52b96e66e5ceff19bd1336de9a5ce8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 18 Nov 2009 15:52:51 +0100 Subject: [PATCH 080/113] drbd_req.c: use part_[inc|dec]_in_flight() Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index d09aac4a84e..de81ab7b462 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -39,8 +39,8 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req cpu = part_stat_lock(); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); + part_inc_in_flight(&mdev->vdisk->part0, rw); part_stat_unlock(); - mdev->vdisk->part0.in_flight[rw]++; } /* Update disk stats when completing request upwards */ @@ -52,8 +52,8 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req) cpu = part_stat_lock(); part_stat_add(cpu, &mdev->vdisk->part0, ticks[rw], duration); part_round_stats(cpu, &mdev->vdisk->part0); + part_dec_in_flight(&mdev->vdisk->part0, rw); part_stat_unlock(); - mdev->vdisk->part0.in_flight[rw]--; } static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw) From bf7919371025412978268efca4b09dd847acb395 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:37 -0500 Subject: [PATCH 081/113] blkio: Set must_dispatch only if we decided to not dispatch the request o must_dispatch flag should be set only if we decided not to run the queue and dispatch the request. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f5b59e18ebd..15b53616516 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2490,9 +2490,9 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); - __blk_run_queue(cfqd->queue); - } - cfq_mark_cfqq_must_dispatch(cfqq); + __blk_run_queue(cfqd->queue); + } else + cfq_mark_cfqq_must_dispatch(cfqq); } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* From cdb16e8f739985b8a5c9f4569b026583bbcd01a5 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:38 -0500 Subject: [PATCH 082/113] blkio: Introduce the notion of cfq groups o This patch introduce the notion of cfq groups. Soon we will can have multiple groups of different weights in the system. o Various service trees (prioclass and workload type trees), will become per cfq group. So hierarchy looks as follows. cfq_groups | workload type | cfq queue o When an scheduling decision has to be taken, first we select the cfq group then workload with-in the group and then cfq queue with-in the workload type. o This patch just makes various workload service tree per cfq group and introduce the function to be able to choose a group for scheduling. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 112 ++++++++++++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 35 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 15b53616516..a4d17265411 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -132,6 +132,7 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; + struct cfq_group *cfqg; }; /* @@ -153,25 +154,30 @@ enum wl_type_t { SYNC_WORKLOAD = 2 }; - -/* - * Per block device queue structure - */ -struct cfq_data { - struct request_queue *queue; - +/* This is per cgroup per device grouping structure */ +struct cfq_group { /* * rr lists of queues with requests, onle rr for each priority class. * Counts are embedded in the cfq_rb_root */ struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; +}; + +/* + * Per block device queue structure + */ +struct cfq_data { + struct request_queue *queue; + struct cfq_group root_group; + /* * The priority currently being served */ enum wl_prio_t serving_prio; enum wl_type_t serving_type; unsigned long workload_expires; + struct cfq_group *serving_group; bool noidle_tree_requires_idle; /* @@ -240,14 +246,15 @@ struct cfq_data { unsigned long last_end_sync_rq; }; -static struct cfq_rb_root *service_tree_for(enum wl_prio_t prio, +static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, + enum wl_prio_t prio, enum wl_type_t type, struct cfq_data *cfqd) { if (prio == IDLE_WORKLOAD) - return &cfqd->service_tree_idle; + return &cfqg->service_tree_idle; - return &cfqd->service_trees[prio][type]; + return &cfqg->service_trees[prio][type]; } enum cfqq_state_flags { @@ -317,12 +324,14 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) { - if (wl == IDLE_WORKLOAD) - return cfqd->service_tree_idle.count; + struct cfq_group *cfqg = &cfqd->root_group; - return cfqd->service_trees[wl][ASYNC_WORKLOAD].count - + cfqd->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count - + cfqd->service_trees[wl][SYNC_WORKLOAD].count; + if (wl == IDLE_WORKLOAD) + return cfqg->service_tree_idle.count; + + return cfqg->service_trees[wl][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_WORKLOAD].count; } static void cfq_dispatch_insert(struct request_queue *, struct request *); @@ -612,7 +621,7 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, /* * just an approximation, should be ok. */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - + return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } @@ -630,7 +639,8 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; - service_tree = service_tree_for(cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); + service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), + cfqq_type(cfqq), cfqd); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; parent = rb_last(&service_tree->rb); @@ -1066,7 +1076,8 @@ static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { struct cfq_rb_root *service_tree = - service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd); + service_tree_for(cfqd->serving_group, cfqd->serving_prio, + cfqd->serving_type, cfqd); if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; @@ -1218,7 +1229,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * in their service tree. */ if (!service_tree) - service_tree = service_tree_for(prio, cfqq_type(cfqq), cfqd); + service_tree = service_tree_for(cfqq->cfqg, prio, + cfqq_type(cfqq), cfqd); if (service_tree->count == 0) return true; @@ -1377,8 +1389,9 @@ static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) } } -static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, - bool prio_changed) +static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, + struct cfq_group *cfqg, enum wl_prio_t prio, + bool prio_changed) { struct cfq_queue *queue; int i; @@ -1392,10 +1405,10 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, * from SYNC_NOIDLE (first choice), or just SYNC * over ASYNC */ - if (service_tree_for(prio, cur_best, cfqd)->count) + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) return cur_best; cur_best = SYNC_WORKLOAD; - if (service_tree_for(prio, cur_best, cfqd)->count) + if (service_tree_for(cfqg, prio, cur_best, cfqd)->count) return cur_best; return ASYNC_WORKLOAD; @@ -1403,7 +1416,7 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, for (i = 0; i < 3; ++i) { /* otherwise, select the one with lowest rb_key */ - queue = cfq_rb_first(service_tree_for(prio, i, cfqd)); + queue = cfq_rb_first(service_tree_for(cfqg, prio, i, cfqd)); if (queue && (!key_valid || time_before(queue->rb_key, lowest_key))) { lowest_key = queue->rb_key; @@ -1415,12 +1428,13 @@ static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, enum wl_prio_t prio, return cur_best; } -static void choose_service_tree(struct cfq_data *cfqd) +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) { enum wl_prio_t previous_prio = cfqd->serving_prio; bool prio_changed; unsigned slice; unsigned count; + struct cfq_rb_root *st; /* Choose next priority. RT > BE > IDLE */ if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) @@ -1439,8 +1453,9 @@ static void choose_service_tree(struct cfq_data *cfqd) * expiration time */ prio_changed = (cfqd->serving_prio != previous_prio); - count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) - ->count; + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, + cfqd); + count = st->count; /* * If priority didn't change, check workload expiration, @@ -1452,9 +1467,10 @@ static void choose_service_tree(struct cfq_data *cfqd) /* otherwise select new workload type */ cfqd->serving_type = - cfq_choose_wl(cfqd, cfqd->serving_prio, prio_changed); - count = service_tree_for(cfqd->serving_prio, cfqd->serving_type, cfqd) - ->count; + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio, prio_changed); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type, + cfqd); + count = st->count; /* * the workload slice is computed as a fraction of target latency @@ -1478,6 +1494,12 @@ static void choose_service_tree(struct cfq_data *cfqd) cfqd->noidle_tree_requires_idle = false; } +static void cfq_choose_cfqg(struct cfq_data *cfqd) +{ + cfqd->serving_group = &cfqd->root_group; + choose_service_tree(cfqd, &cfqd->root_group); +} + /* * Select a queue for service. If we have a current active queue, * check whether to continue servicing it, or retrieve and set a new one. @@ -1535,7 +1557,7 @@ new_queue: * service tree */ if (!new_cfqq) - choose_service_tree(cfqd); + cfq_choose_cfqg(cfqd); cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: @@ -1564,13 +1586,15 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) struct cfq_queue *cfqq; int dispatched = 0; int i, j; + struct cfq_group *cfqg = &cfqd->root_group; + for (i = 0; i < 2; ++i) for (j = 0; j < 3; ++j) - while ((cfqq = cfq_rb_first(&cfqd->service_trees[i][j])) + while ((cfqq = cfq_rb_first(&cfqg->service_trees[i][j])) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); - while ((cfqq = cfq_rb_first(&cfqd->service_tree_idle)) != NULL) + while ((cfqq = cfq_rb_first(&cfqg->service_tree_idle)) != NULL) dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); @@ -2041,14 +2065,26 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + cfqq->cfqg = cfqg; +} + +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct cfq_group *cfqg; retry: + cfqg = cfq_get_cfqg(cfqd, 1); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -2079,6 +2115,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else cfqq = &cfqd->oom_cfqq; @@ -2931,15 +2968,19 @@ static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; int i, j; + struct cfq_group *cfqg; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + /* Init root group */ + cfqg = &cfqd->root_group; + for (i = 0; i < 2; ++i) for (j = 0; j < 3; ++j) - cfqd->service_trees[i][j] = CFQ_RB_ROOT; - cfqd->service_tree_idle = CFQ_RB_ROOT; + cfqg->service_trees[i][j] = CFQ_RB_ROOT; + cfqg->service_tree_idle = CFQ_RB_ROOT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -2956,6 +2997,7 @@ static void *cfq_init_queue(struct request_queue *q) */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); atomic_inc(&cfqd->oom_cfqq.ref); + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); From 615f0259e6940293359a189f4881bb28c2fea40b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:39 -0500 Subject: [PATCH 083/113] blkio: Implement macro to traverse each service tree in group o Implement a macro to traverse each service tree in the group. This avoids usage of double for loop and special condition for idle tree 4 times. o Macro is little twisted because of special handling of idle class service tree. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a4d17265411..fab2be0fa21 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -140,9 +140,9 @@ struct cfq_queue { * IDLE is handled separately, so it has negative index */ enum wl_prio_t { - IDLE_WORKLOAD = -1, BE_WORKLOAD = 0, - RT_WORKLOAD = 1 + RT_WORKLOAD = 1, + IDLE_WORKLOAD = 2, }; /* @@ -303,6 +303,17 @@ CFQ_CFQQ_FNS(deep); #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) +/* Traverses through cfq group service trees */ +#define for_each_cfqg_st(cfqg, i, j, st) \ + for (i = 0; i <= IDLE_WORKLOAD; i++) \ + for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ + : &cfqg->service_tree_idle; \ + (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ + (i == IDLE_WORKLOAD && j == 0); \ + j++, st = i < IDLE_WORKLOAD ? \ + &cfqg->service_trees[i][j]: NULL) \ + + static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) { if (cfq_class_idle(cfqq)) @@ -565,6 +576,10 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, */ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) { + /* Service tree is empty */ + if (!root->count) + return NULL; + if (!root->left) root->left = rb_first(&root->rb); @@ -1587,18 +1602,14 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) int dispatched = 0; int i, j; struct cfq_group *cfqg = &cfqd->root_group; + struct cfq_rb_root *st; - for (i = 0; i < 2; ++i) - for (j = 0; j < 3; ++j) - while ((cfqq = cfq_rb_first(&cfqg->service_trees[i][j])) - != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - - while ((cfqq = cfq_rb_first(&cfqg->service_tree_idle)) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); + for_each_cfqg_st(cfqg, i, j, st) { + while ((cfqq = cfq_rb_first(st)) != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } cfq_slice_expired(cfqd, 0); - BUG_ON(cfqd->busy_queues); cfq_log(cfqd, "forced_dispatch=%d", dispatched); @@ -2969,6 +2980,7 @@ static void *cfq_init_queue(struct request_queue *q) struct cfq_data *cfqd; int i, j; struct cfq_group *cfqg; + struct cfq_rb_root *st; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) @@ -2976,11 +2988,8 @@ static void *cfq_init_queue(struct request_queue *q) /* Init root group */ cfqg = &cfqd->root_group; - - for (i = 0; i < 2; ++i) - for (j = 0; j < 3; ++j) - cfqg->service_trees[i][j] = CFQ_RB_ROOT; - cfqg->service_tree_idle = CFQ_RB_ROOT; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; /* * Not strictly needed (since RB_ROOT just clears the node and we From f04a64246344ad50e4b4b4186174a0912d07f30b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:40 -0500 Subject: [PATCH 084/113] blkio: Keep queue on service tree until we expire it o Currently cfqq deletes a queue from service tree if it is empty (even if we might idle on the queue). This patch keeps the queue on service tree hence associated group remains on the service tree until we decide that we are not going to idle on the queue and expire it. o This just helps in time accounting for queue/group and in implementation of rest of the patches. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 70 +++++++++++++++++++++++++++++++-------------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index fab2be0fa21..7f5646ac9f5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -393,7 +393,7 @@ static int cfq_queue_empty(struct request_queue *q) { struct cfq_data *cfqd = q->elevator->elevator_data; - return !cfqd->busy_queues; + return !cfqd->rq_queued; } /* @@ -842,7 +842,6 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -850,8 +849,17 @@ static void cfq_del_rq_rb(struct request *rq) elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { + /* + * Queue will be deleted from service tree when we actually + * expire it later. Right now just remove it from prio tree + * as it is empty. + */ + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) @@ -1065,6 +1073,9 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_del_cfqq_rr(cfqd, cfqq); + cfq_resort_rr_list(cfqd, cfqq); if (cfqq == cfqd->active_queue) @@ -1094,11 +1105,30 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) service_tree_for(cfqd->serving_group, cfqd->serving_prio, cfqd->serving_type, cfqd); + if (!cfqd->rq_queued) + return NULL; + if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; return cfq_rb_first(service_tree); } +static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = &cfqd->root_group; + struct cfq_queue *cfqq; + int i, j; + struct cfq_rb_root *st; + + if (!cfqd->rq_queued) + return NULL; + + for_each_cfqg_st(cfqg, i, j, st) + if ((cfqq = cfq_rb_first(st)) != NULL) + return cfqq; + return NULL; +} + /* * Get and set a new active queue for service. */ @@ -1231,6 +1261,9 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) enum wl_prio_t prio = cfqq_prio(cfqq); struct cfq_rb_root *service_tree = cfqq->service_tree; + BUG_ON(!service_tree); + BUG_ON(!service_tree->count); + /* We never do for idle class queues. */ if (prio == IDLE_WORKLOAD) return false; @@ -1243,14 +1276,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - if (!service_tree) - service_tree = service_tree_for(cfqq->cfqg, prio, - cfqq_type(cfqq), cfqd); - - if (service_tree->count == 0) - return true; - - return (service_tree->count == 1 && cfq_rb_first(service_tree) == cfqq); + return service_tree->count == 1; } static void cfq_arm_slice_timer(struct cfq_data *cfqd) @@ -1527,6 +1553,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) if (!cfqq) goto new_queue; + if (!cfqd->rq_queued) + return NULL; /* * The active queue has run out of time, expire it and select new. */ @@ -1589,6 +1617,9 @@ static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) } BUG_ON(!list_empty(&cfqq->fifo)); + + /* By default cfqq is not expired if it is empty. Do it explicitly */ + __cfq_slice_expired(cfqq->cfqd, cfqq, 0); return dispatched; } @@ -1600,14 +1631,9 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd) { struct cfq_queue *cfqq; int dispatched = 0; - int i, j; - struct cfq_group *cfqg = &cfqd->root_group; - struct cfq_rb_root *st; - for_each_cfqg_st(cfqg, i, j, st) { - while ((cfqq = cfq_rb_first(st)) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - } + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) + dispatched += __cfq_forced_dispatch_cfqq(cfqq); cfq_slice_expired(cfqd, 0); BUG_ON(cfqd->busy_queues); @@ -1776,13 +1802,13 @@ static void cfq_put_queue(struct cfq_queue *cfqq) cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); } @@ -2444,9 +2470,11 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; + /* Allow preemption only if we are idling on sync-noidle tree */ if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - new_cfqq->service_tree->count == 1) + new_cfqq->service_tree->count == 2 && + RB_EMPTY_ROOT(&cfqq->sort_list)) return true; /* From 1fa8f6d68b5c8ca0a608fd8d296c5f07ac788cd6 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:41 -0500 Subject: [PATCH 085/113] blkio: Introduce the root service tree for cfq groups o So far we just had one cfq_group in cfq_data. To create space for more than one cfq_group, we need to have a service tree of groups where all the groups can be queued if they have active cfq queues backlogged in these. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 137 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 134 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7f5646ac9f5..e1f822ac469 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -66,6 +66,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock); #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) +#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* * Most of our rbtree usage is for sorting with min extraction, so @@ -77,8 +78,9 @@ struct cfq_rb_root { struct rb_root rb; struct rb_node *left; unsigned count; + u64 min_vdisktime; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } /* * Per process-grouping structure @@ -156,6 +158,16 @@ enum wl_type_t { /* This is per cgroup per device grouping structure */ struct cfq_group { + /* group service_tree member */ + struct rb_node rb_node; + + /* group service_tree key */ + u64 vdisktime; + bool on_st; + + /* number of cfqq currently on this group */ + int nr_cfqq; + /* * rr lists of queues with requests, onle rr for each priority class. * Counts are embedded in the cfq_rb_root @@ -169,6 +181,8 @@ struct cfq_group { */ struct cfq_data { struct request_queue *queue; + /* Root service tree for cfq_groups */ + struct cfq_rb_root grp_service_tree; struct cfq_group root_group; /* @@ -251,6 +265,9 @@ static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, enum wl_type_t type, struct cfq_data *cfqd) { + if (!cfqg) + return NULL; + if (prio == IDLE_WORKLOAD) return &cfqg->service_tree_idle; @@ -589,6 +606,17 @@ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) return NULL; } +static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) +{ + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_cfqg(root->left); + + return NULL; +} + static void rb_erase_init(struct rb_node *n, struct rb_root *root) { rb_erase(n, root); @@ -640,6 +668,83 @@ static unsigned long cfq_slice_offset(struct cfq_data *cfqd, cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); } +static inline s64 +cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + return cfqg->vdisktime - st->min_vdisktime; +} + +static void +__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct cfq_group *__cfqg; + s64 key = cfqg_key(st, cfqg); + int left = 1; + + while (*node != NULL) { + parent = *node; + __cfqg = rb_entry_cfqg(parent); + + if (key < cfqg_key(st, __cfqg)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &cfqg->rb_node; + + rb_link_node(&cfqg->rb_node, parent, node); + rb_insert_color(&cfqg->rb_node, &st->rb); +} + +static void +cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *__cfqg; + struct rb_node *n; + + cfqg->nr_cfqq++; + if (cfqg->on_st) + return; + + /* + * Currently put the group at the end. Later implement something + * so that groups get lesser vtime based on their weights, so that + * if group does not loose all if it was not continously backlogged. + */ + n = rb_last(&st->rb); + if (n) { + __cfqg = rb_entry_cfqg(n); + cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; + } else + cfqg->vdisktime = st->min_vdisktime; + + __cfq_group_service_tree_add(st, cfqg); + cfqg->on_st = true; +} + +static void +cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + BUG_ON(cfqg->nr_cfqq < 1); + cfqg->nr_cfqq--; + /* If there are other cfq queues under this group, don't delete it */ + if (cfqg->nr_cfqq) + return; + + cfqg->on_st = false; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); +} + /* * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. It is sorted in the order that @@ -722,6 +827,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; + cfq_group_service_tree_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -832,6 +938,7 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) cfqq->p_root = NULL; } + cfq_group_service_tree_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; } @@ -1108,6 +1215,9 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) if (!cfqd->rq_queued) return NULL; + /* There is nothing to dispatch */ + if (!service_tree) + return NULL; if (RB_EMPTY_ROOT(&service_tree->rb)) return NULL; return cfq_rb_first(service_tree); @@ -1477,6 +1587,12 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned count; struct cfq_rb_root *st; + if (!cfqg) { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + /* Choose next priority. RT > BE > IDLE */ if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) cfqd->serving_prio = RT_WORKLOAD; @@ -1535,10 +1651,21 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) cfqd->noidle_tree_requires_idle = false; } +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + if (RB_EMPTY_ROOT(&st->rb)) + return NULL; + return cfq_rb_first_group(st); +} + static void cfq_choose_cfqg(struct cfq_data *cfqd) { - cfqd->serving_group = &cfqd->root_group; - choose_service_tree(cfqd, &cfqd->root_group); + struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); + + cfqd->serving_group = cfqg; + choose_service_tree(cfqd, cfqg); } /* @@ -3014,10 +3141,14 @@ static void *cfq_init_queue(struct request_queue *q) if (!cfqd) return NULL; + /* Init root service tree */ + cfqd->grp_service_tree = CFQ_RB_ROOT; + /* Init root group */ cfqg = &cfqd->root_group; for_each_cfqg_st(cfqg, i, j, st) *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); /* * Not strictly needed (since RB_ROOT just clears the node and we From 31e4c28d95e64f2d5d3c497a3ecf37c62de635b4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:42 -0500 Subject: [PATCH 086/113] blkio: Introduce blkio controller cgroup interface o This is basic implementation of blkio controller cgroup interface. This is the common interface visible to user space and should be used by different IO control policies as we implement those. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig | 13 +++ block/Kconfig.iosched | 1 + block/Makefile | 1 + block/blk-cgroup.c | 177 ++++++++++++++++++++++++++++++++++ block/blk-cgroup.h | 58 +++++++++++ include/linux/cgroup_subsys.h | 6 ++ include/linux/iocontext.h | 4 + 7 files changed, 260 insertions(+) create mode 100644 block/blk-cgroup.c create mode 100644 block/blk-cgroup.h diff --git a/block/Kconfig b/block/Kconfig index 9be0b56eaee..6ba1a8e3388 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -77,6 +77,19 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_CGROUP + bool + depends on CGROUPS + default n + ---help--- + Generic block IO controller cgroup interface. This is the common + cgroup interface which should be used by various IO controlling + policies. + + Currently, CFQ IO scheduler uses it to recognize task groups and + control disk bandwidth allocation (proportional time slice allocation) + to such task groups. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 8bd105115a6..be0280deec2 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,6 +23,7 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + select BLK_CGROUP default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally diff --git a/block/Makefile b/block/Makefile index 7914108952f..cb2d515ebd6 100644 --- a/block/Makefile +++ b/block/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c new file mode 100644 index 00000000000..4f6afd76ec5 --- /dev/null +++ b/block/blk-cgroup.c @@ -0,0 +1,177 @@ +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ +#include +#include "blk-cgroup.h" + +struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; + +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkio_cgroup, css); +} + +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + rcu_assign_pointer(blkg->key, key); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + spin_unlock_irqrestore(&blkcg->lock, flags); +} + +int blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + /* Implemented later */ + return 0; +} + +/* called under rcu_read_lock(). */ +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +{ + struct blkio_group *blkg; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + __key = blkg->key; + if (__key == key) + return blkg; + } + + return NULL; +} + +#define SHOW_FUNCTION(__VAR) \ +static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype) \ +{ \ + struct blkio_cgroup *blkcg; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + return (u64)blkcg->__VAR; \ +} + +SHOW_FUNCTION(weight); +#undef SHOW_FUNCTION + +static int +blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + blkcg = cgroup_to_blkio_cgroup(cgroup); + blkcg->weight = (unsigned int)val; + return 0; +} + +struct cftype blkio_files[] = { + { + .name = "weight", + .read_u64 = blkiocg_weight_read, + .write_u64 = blkiocg_weight_write, + }, +}; + +static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, blkio_files, + ARRAY_SIZE(blkio_files)); +} + +static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + + free_css_id(&blkio_subsys, &blkcg->css); + kfree(blkcg); +} + +static struct cgroup_subsys_state * +blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg, *parent_blkcg; + + if (!cgroup->parent) { + blkcg = &blkio_root_cgroup; + goto done; + } + + /* Currently we do not support hierarchy deeper than two level (0,1) */ + parent_blkcg = cgroup_to_blkio_cgroup(cgroup->parent); + if (css_depth(&parent_blkcg->css) > 0) + return ERR_PTR(-EINVAL); + + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); + + blkcg->weight = BLKIO_WEIGHT_DEFAULT; +done: + spin_lock_init(&blkcg->lock); + INIT_HLIST_HEAD(&blkcg->blkg_list); + + return &blkcg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. + */ +static int blkiocg_can_attach(struct cgroup_subsys *subsys, + struct cgroup *cgroup, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, + struct cgroup *prev, struct task_struct *tsk, + bool threadgroup) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, + .subsys_id = blkio_subsys_id, + .use_id = 1, +}; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 00000000000..ba5703f69b4 --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,58 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include + +struct blkio_cgroup { + struct cgroup_subsys_state css; + unsigned int weight; + spinlock_t lock; + struct hlist_head blkg_list; +}; + +struct blkio_group { + /* An rcu protected unique identifier for the group */ + void *key; + struct hlist_node blkcg_node; +}; + +#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_BLK_CGROUP +extern struct blkio_cgroup blkio_root_cgroup; +extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key); +extern int blkiocg_del_blkio_group(struct blkio_group *blkg); +extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, + void *key); +#else +static inline struct blkio_cgroup * +cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } + +static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key) +{ +} + +static inline int +blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } + +static inline struct blkio_group * +blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +#endif +#endif /* _BLK_CGROUP_H */ diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31bacf4..ccefff02b6c 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -60,3 +60,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_BLK_CGROUP +SUBSYS(blkio) +#endif + +/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index eb73632440f..d61b0b8b5cd 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -68,6 +68,10 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_BLK_CGROUP + unsigned short cgroup_changed; +#endif + /* * For request batching */ From 25bc6b07767fe77422312eda2af99c9477f76191 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:43 -0500 Subject: [PATCH 087/113] blkio: Introduce per cfq group weights and vdisktime calculations o Bring in the per cfq group weight and how vdisktime is calculated for the group. Also bring in the functionality of updating the min_vdisktime of the group service tree. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 9 ++++++- block/cfq-iosched.c | 62 ++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index be0280deec2..fa95fa77057 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,7 +23,6 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" - select BLK_CGROUP default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -33,6 +32,14 @@ config IOSCHED_CFQ This is the default I/O scheduler. +config CFQ_GROUP_IOSCHED + bool "CFQ Group Scheduling support" + depends on IOSCHED_CFQ && CGROUPS + select BLK_CGROUP + default n + ---help--- + Enable group IO scheduling in CFQ. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e1f822ac469..019f28eea9d 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -13,6 +13,7 @@ #include #include #include +#include "blk-cgroup.h" /* * tunables @@ -49,6 +50,7 @@ static const int cfq_hist_divisor = 4; #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) +#define CFQ_SERVICE_SHIFT 12 #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) @@ -79,6 +81,7 @@ struct cfq_rb_root { struct rb_node *left; unsigned count; u64 min_vdisktime; + struct rb_node *active; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } @@ -163,6 +166,7 @@ struct cfq_group { /* group service_tree key */ u64 vdisktime; + unsigned int weight; bool on_st; /* number of cfqq currently on this group */ @@ -434,6 +438,51 @@ cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +{ + u64 d = delta << CFQ_SERVICE_SHIFT; + + d = d * BLKIO_WEIGHT_DEFAULT; + do_div(d, cfqg->weight); + return d; +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct cfq_rb_root *st) +{ + u64 vdisktime = st->min_vdisktime; + struct cfq_group *cfqg; + + if (st->active) { + cfqg = rb_entry_cfqg(st->active); + vdisktime = cfqg->vdisktime; + } + + if (st->left) { + cfqg = rb_entry_cfqg(st->left); + vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + /* * get averaged number of queues of RT/BE priority. * average is updated, with a formula that gives more weight to higher numbers, @@ -734,8 +783,12 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) { struct cfq_rb_root *st = &cfqd->grp_service_tree; + if (st->active == &cfqg->rb_node) + st->active = NULL; + BUG_ON(cfqg->nr_cfqq < 1); cfqg->nr_cfqq--; + /* If there are other cfq queues under this group, don't delete it */ if (cfqg->nr_cfqq) return; @@ -1654,10 +1707,14 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) { struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *cfqg; if (RB_EMPTY_ROOT(&st->rb)) return NULL; - return cfq_rb_first_group(st); + cfqg = cfq_rb_first_group(st); + st->active = &cfqg->rb_node; + update_min_vdisktime(st); + return cfqg; } static void cfq_choose_cfqg(struct cfq_data *cfqd) @@ -3150,6 +3207,9 @@ static void *cfq_init_queue(struct request_queue *q) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); + /* Give preference to root group over other groups */ + cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; + /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides From 58ff82f34cded3812af5b6c69b6aa626b6be2490 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:44 -0500 Subject: [PATCH 088/113] blkio: Implement per cfq group latency target and busy queue avg o So far we had 300ms soft target latency system wide. Now with the introduction of cfq groups, divide that latency by number of groups so that one can come up with group target latency which will be helpful in determining the workload slice with-in group and also the dynamic slice length of the cfq queue. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 65 +++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 20 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 019f28eea9d..84887e2eb21 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -82,6 +82,7 @@ struct cfq_rb_root { unsigned count; u64 min_vdisktime; struct rb_node *active; + unsigned total_weight; }; #define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, } @@ -172,6 +173,8 @@ struct cfq_group { /* number of cfqq currently on this group */ int nr_cfqq; + /* Per group busy queus average. Useful for workload slice calc. */ + unsigned int busy_queues_avg[2]; /* * rr lists of queues with requests, onle rr for each priority class. * Counts are embedded in the cfq_rb_root @@ -188,6 +191,8 @@ struct cfq_data { /* Root service tree for cfq_groups */ struct cfq_rb_root grp_service_tree; struct cfq_group root_group; + /* Number of active cfq groups on group service tree */ + int nr_groups; /* * The priority currently being served @@ -206,7 +211,6 @@ struct cfq_data { struct rb_root prio_trees[CFQ_PRIO_LISTS]; unsigned int busy_queues; - unsigned int busy_queues_avg[2]; int rq_in_driver[2]; int sync_flight; @@ -354,10 +358,10 @@ static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) return SYNC_WORKLOAD; } -static inline int cfq_busy_queues_wl(enum wl_prio_t wl, struct cfq_data *cfqd) +static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + struct cfq_data *cfqd, + struct cfq_group *cfqg) { - struct cfq_group *cfqg = &cfqd->root_group; - if (wl == IDLE_WORKLOAD) return cfqg->service_tree_idle.count; @@ -489,18 +493,27 @@ static void update_min_vdisktime(struct cfq_rb_root *st) * to quickly follows sudden increases and decrease slowly */ -static inline unsigned cfq_get_avg_queues(struct cfq_data *cfqd, bool rt) +static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg, bool rt) { unsigned min_q, max_q; unsigned mult = cfq_hist_divisor - 1; unsigned round = cfq_hist_divisor / 2; - unsigned busy = cfq_busy_queues_wl(rt, cfqd); + unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); - min_q = min(cfqd->busy_queues_avg[rt], busy); - max_q = max(cfqd->busy_queues_avg[rt], busy); - cfqd->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + min_q = min(cfqg->busy_queues_avg[rt], busy); + max_q = max(cfqg->busy_queues_avg[rt], busy); + cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / cfq_hist_divisor; - return cfqd->busy_queues_avg[rt]; + return cfqg->busy_queues_avg[rt]; +} + +static inline unsigned +cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + return cfq_target_latency * cfqg->weight / st->total_weight; } static inline void @@ -508,12 +521,17 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned slice = cfq_prio_to_slice(cfqd, cfqq); if (cfqd->cfq_latency) { - /* interested queues (we consider only the ones with the same - * priority class) */ - unsigned iq = cfq_get_avg_queues(cfqd, cfq_class_rt(cfqq)); + /* + * interested queues (we consider only the ones with the same + * priority class in the cfq group) + */ + unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, + cfq_class_rt(cfqq)); unsigned sync_slice = cfqd->cfq_slice[1]; unsigned expect_latency = sync_slice * iq; - if (expect_latency > cfq_target_latency) { + unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); + + if (expect_latency > group_slice) { unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; /* scale low_slice according to IO priority * and sync vs async */ @@ -521,7 +539,7 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) min(slice, base_low_slice * slice / sync_slice); /* the adapted slice value is scaled to fit all iqs * into the target latency */ - slice = max(slice * cfq_target_latency / expect_latency, + slice = max(slice * group_slice / expect_latency, low_slice); } } @@ -776,6 +794,8 @@ cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) __cfq_group_service_tree_add(st, cfqg); cfqg->on_st = true; + cfqd->nr_groups++; + st->total_weight += cfqg->weight; } static void @@ -794,6 +814,8 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) return; cfqg->on_st = false; + cfqd->nr_groups--; + st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); } @@ -1639,6 +1661,7 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) unsigned slice; unsigned count; struct cfq_rb_root *st; + unsigned group_slice; if (!cfqg) { cfqd->serving_prio = IDLE_WORKLOAD; @@ -1647,9 +1670,9 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) } /* Choose next priority. RT > BE > IDLE */ - if (cfq_busy_queues_wl(RT_WORKLOAD, cfqd)) + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) cfqd->serving_prio = RT_WORKLOAD; - else if (cfq_busy_queues_wl(BE_WORKLOAD, cfqd)) + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) cfqd->serving_prio = BE_WORKLOAD; else { cfqd->serving_prio = IDLE_WORKLOAD; @@ -1687,9 +1710,11 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) * proportional to the number of queues in that workload, over * all the queues in the same priority class */ - slice = cfq_target_latency * count / - max_t(unsigned, cfqd->busy_queues_avg[cfqd->serving_prio], - cfq_busy_queues_wl(cfqd->serving_prio, cfqd)); + group_slice = cfq_group_slice(cfqd, cfqg); + + slice = group_slice * count / + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], + cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); if (cfqd->serving_type == ASYNC_WORKLOAD) /* async workload slice is scaled down according to From dae739ebc4c590630039533a5bbd05865966094f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:45 -0500 Subject: [PATCH 089/113] blkio: Group time used accounting and workload context save restore o This patch introduces the functionality to do the accounting of group time when a queue expires. This time used decides which is the group to go next. o Also introduce the functionlity to save and restore the workload type context with-in group. It might happen that once we expire the cfq queue and group, a different group will schedule in and we will lose the context of the workload type. Hence save and restore it upon queue expiry. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 84887e2eb21..55d2a21f7f0 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -115,6 +115,10 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; + /* time when queue got scheduled in to dispatch first request. */ + unsigned long dispatch_start; + /* time when first request from queue completed and slice started. */ + unsigned long slice_start; unsigned long slice_end; long slice_resid; unsigned int slice_dispatch; @@ -181,6 +185,10 @@ struct cfq_group { */ struct cfq_rb_root service_trees[2][3]; struct cfq_rb_root service_tree_idle; + + unsigned long saved_workload_slice; + enum wl_type_t saved_workload; + enum wl_prio_t saved_serving_prio; }; /* @@ -543,6 +551,7 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) low_slice); } } + cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -818,6 +827,58 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) st->total_weight -= cfqg->weight; if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); + cfqg->saved_workload_slice = 0; +} + +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +{ + unsigned int slice_used, allocated_slice; + + /* + * Queue got expired before even a single request completed or + * got expired immediately after first request completion. + */ + if (!cfqq->slice_start || cfqq->slice_start == jiffies) { + /* + * Also charge the seek time incurred to the group, otherwise + * if there are mutiple queues in the group, each can dispatch + * a single request on seeky media and cause lots of seek time + * and group will never know it. + */ + slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), + 1); + } else { + slice_used = jiffies - cfqq->slice_start; + allocated_slice = cfqq->slice_end - cfqq->slice_start; + if (slice_used > allocated_slice) + slice_used = allocated_slice; + } + + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); + return slice_used; +} + +static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, + struct cfq_queue *cfqq) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + unsigned int used_sl; + + used_sl = cfq_cfqq_slice_usage(cfqq); + + /* Can't update vdisktime while group is on service tree */ + cfq_rb_erase(&cfqg->rb_node, st); + cfqg->vdisktime += cfq_scale_slice(used_sl, cfqg); + __cfq_group_service_tree_add(st, cfqg); + + /* This group is being expired. Save the context */ + if (time_after(cfqd->workload_expires, jiffies)) { + cfqg->saved_workload_slice = cfqd->workload_expires + - jiffies; + cfqg->saved_workload = cfqd->serving_type; + cfqg->saved_serving_prio = cfqd->serving_prio; + } else + cfqg->saved_workload_slice = 0; } /* @@ -833,6 +894,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, unsigned long rb_key; struct cfq_rb_root *service_tree; int left; + int new_cfqq = 1; service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); @@ -861,6 +923,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, } if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + new_cfqq = 0; /* * same position, nothing more to do */ @@ -902,6 +965,8 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; + if (add_front || !new_cfqq) + return; cfq_group_service_tree_add(cfqd, cfqq->cfqg); } @@ -1218,6 +1283,8 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, { if (cfqq) { cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; @@ -1255,6 +1322,8 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + cfq_group_served(cfqd, cfqq->cfqg, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) cfq_del_cfqq_rr(cfqd, cfqq); @@ -1263,6 +1332,9 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; + if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) + cfqd->grp_service_tree.active = NULL; + if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -1747,6 +1819,13 @@ static void cfq_choose_cfqg(struct cfq_data *cfqd) struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); cfqd->serving_group = cfqg; + + /* Restore the workload type data */ + if (cfqg->saved_workload_slice) { + cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; + cfqd->serving_type = cfqg->saved_workload; + cfqd->serving_prio = cfqg->saved_serving_prio; + } choose_service_tree(cfqd, cfqg); } From 25fb5169d4c9d4255107abbb7c08ab712434efc8 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:46 -0500 Subject: [PATCH 090/113] blkio: Dynamic cfq group creation based on cgroup tasks belongs to o Determine the cgroup IO submitting task belongs to and create the cfq group if it does not exist already. o Also link cfqq and associated cfq group. o Currently all async IO is mapped to root group. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 111 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 11 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 55d2a21f7f0..a877eeee80a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -189,6 +189,10 @@ struct cfq_group { unsigned long saved_workload_slice; enum wl_type_t saved_workload; enum wl_prio_t saved_serving_prio; + struct blkio_group blkg; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct hlist_node cfqd_node; +#endif }; /* @@ -274,8 +278,13 @@ struct cfq_data { struct cfq_queue oom_cfqq; unsigned long last_end_sync_rq; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; }; +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); + static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, enum wl_prio_t prio, enum wl_type_t type, @@ -881,6 +890,89 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfqg->saved_workload_slice = 0; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +static struct cfq_group * +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct cfq_group *cfqg = NULL; + void *key = cfqd; + int i, j; + struct cfq_rb_root *st; + + /* Do we need to take this reference */ + if (!css_tryget(&blkcg->css)) + return NULL;; + + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + if (cfqg || !create) + goto done; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); + if (!cfqg) + goto done; + + cfqg->weight = blkcg->weight; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* Add group onto cgroup list */ + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + +done: + css_put(&blkcg->css); + return cfqg; +} + +/* + * Search for the cfq group current task belongs to. If create = 1, then also + * create the cfq group if it does not exist. request_queue lock must be held. + */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + struct cgroup *cgroup; + struct cfq_group *cfqg = NULL; + + rcu_read_lock(); + cgroup = task_cgroup(current, blkio_subsys_id); + cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); + if (!cfqg && create) + cfqg = &cfqd->root_group; + rcu_read_unlock(); + return cfqg; +} + +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + /* Currently, all async queues are mapped to root group */ + if (!cfq_cfqq_sync(cfqq)) + cfqg = &cfqq->cfqd->root_group; + + cfqq->cfqg = cfqg; +} +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) +{ + return &cfqd->root_group; +} +static inline void +cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { + cfqq->cfqg = cfqg; +} + +#endif /* GROUP_IOSCHED */ + /* * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. It is sorted in the order that @@ -1372,7 +1464,7 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) { - struct cfq_group *cfqg = &cfqd->root_group; + struct cfq_group *cfqg; struct cfq_queue *cfqq; int i, j; struct cfq_rb_root *st; @@ -1380,6 +1472,10 @@ static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) if (!cfqd->rq_queued) return NULL; + cfqg = cfq_get_next_cfqg(cfqd); + if (!cfqg) + return NULL; + for_each_cfqg_st(cfqg, i, j, st) if ((cfqq = cfq_rb_first(st)) != NULL) return cfqq; @@ -2390,16 +2486,6 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } -static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) -{ - cfqq->cfqg = cfqg; -} - -static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) -{ - return &cfqd->root_group; -} - static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) @@ -3314,6 +3400,9 @@ static void *cfq_init_queue(struct request_queue *q) /* Give preference to root group over other groups */ cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd); +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides From b1c3576961847da26c91b1e97f226bb66be5fa3f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:47 -0500 Subject: [PATCH 091/113] blkio: Take care of cgroup deletion and cfq group reference counting o One can choose to change elevator or delete a cgroup. Implement group reference counting so that both elevator exit and cgroup deletion can take place gracefully. Signed-off-by: Vivek Goyal Signed-off-by: Nauman Rafique Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 66 ++++++++++++++++++++++++++++++- block/blk-cgroup.h | 1 + block/cfq-iosched.c | 95 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 160 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4f6afd76ec5..0426ab692fd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -13,6 +13,8 @@ #include #include "blk-cgroup.h" +extern void cfq_unlink_blkio_group(void *, struct blkio_group *); + struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) @@ -28,14 +30,43 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, spin_lock_irqsave(&blkcg->lock, flags); rcu_assign_pointer(blkg->key, key); + blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); } +static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + hlist_del_init_rcu(&blkg->blkcg_node); + blkg->blkcg_id = 0; +} + +/* + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 + * indicating that blk_group was unhashed by the time we got to it. + */ int blkiocg_del_blkio_group(struct blkio_group *blkg) { - /* Implemented later */ - return 0; + struct blkio_cgroup *blkcg; + unsigned long flags; + struct cgroup_subsys_state *css; + int ret = 1; + + rcu_read_lock(); + css = css_lookup(&blkio_subsys, blkg->blkcg_id); + if (!css) + goto out; + + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); +out: + rcu_read_unlock(); + return ret; } /* called under rcu_read_lock(). */ @@ -97,8 +128,39 @@ static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) { struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + unsigned long flags; + struct blkio_group *blkg; + void *key; + rcu_read_lock(); +remove_entry: + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + goto done; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); + + spin_unlock_irqrestore(&blkcg->lock, flags); + + /* + * This blkio_group is being unlinked as associated cgroup is going + * away. Let all the IO controlling policies know about this event. + * + * Currently this is static call to one io controlling policy. Once + * we have more policies in place, we need some dynamic registration + * of callback function. + */ + cfq_unlink_blkio_group(key, blkg); + goto remove_entry; +done: free_css_id(&blkio_subsys, &blkcg->css); + rcu_read_unlock(); kfree(blkcg); } diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index ba5703f69b4..cd50a2f8733 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -26,6 +26,7 @@ struct blkio_group { /* An rcu protected unique identifier for the group */ void *key; struct hlist_node blkcg_node; + unsigned short blkcg_id; }; #define BLKIO_WEIGHT_MIN 100 diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a877eeee80a..8bc31a50a57 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -192,6 +192,7 @@ struct cfq_group { struct blkio_group blkg; #ifdef CONFIG_CFQ_GROUP_IOSCHED struct hlist_node cfqd_node; + atomic_t ref; #endif }; @@ -924,6 +925,14 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) *st = CFQ_RB_ROOT; RB_CLEAR_NODE(&cfqg->rb_node); + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&cfqg->ref, 1); + /* Add group onto cgroup list */ blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd); @@ -960,7 +969,77 @@ static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) cfqg = &cfqq->cfqd->root_group; cfqq->cfqg = cfqg; + /* cfqq reference on cfqg */ + atomic_inc(&cfqq->cfqg->ref); } + +static void cfq_put_cfqg(struct cfq_group *cfqg) +{ + struct cfq_rb_root *st; + int i, j; + + BUG_ON(atomic_read(&cfqg->ref) <= 0); + if (!atomic_dec_and_test(&cfqg->ref)) + return; + for_each_cfqg_st(cfqg, i, j, st) + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + kfree(cfqg); +} + +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + + hlist_del_init(&cfqg->cfqd_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + cfq_put_cfqg(cfqg); +} + +static void cfq_release_cfq_groups(struct cfq_data *cfqd) +{ + struct hlist_node *pos, *n; + struct cfq_group *cfqg; + + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); + } +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu + * read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deltion + * path got to it first. + */ +void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct cfq_data *cfqd = key; + + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} + #else /* GROUP_IOSCHED */ static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) { @@ -971,6 +1050,9 @@ cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { cfqq->cfqg = cfqg; } +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} + #endif /* GROUP_IOSCHED */ /* @@ -2172,11 +2254,13 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) * task holds one reference to the queue, dropped when task exits. each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * + * Each cfq queue took a reference on the parent group. Drop it now. * queue lock must be held here. */ static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; + struct cfq_group *cfqg; BUG_ON(atomic_read(&cfqq->ref) <= 0); @@ -2186,6 +2270,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); + cfqg = cfqq->cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2194,6 +2279,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); + cfq_put_cfqg(cfqg); } /* @@ -3369,11 +3455,15 @@ static void cfq_exit_queue(struct elevator_queue *e) } cfq_put_async_queues(cfqd); + cfq_release_cfq_groups(cfqd); + blkiocg_del_blkio_group(&cfqd->root_group.blkg); spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); + /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ + synchronize_rcu(); kfree(cfqd); } @@ -3401,6 +3491,11 @@ static void *cfq_init_queue(struct request_queue *q) cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; #ifdef CONFIG_CFQ_GROUP_IOSCHED + /* + * Take a reference to root group which we never drop. This is just + * to make sure that cfq_put_cfqg() does not try to kfree root group + */ + atomic_set(&cfqg->ref, 1); blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd); #endif /* From 2868ef7b39490e6b41c2c61cd9a5cd891e778b54 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:48 -0500 Subject: [PATCH 092/113] blkio: Some debugging aids for CFQ o Some debugging aids for CFQ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig | 9 +++++++++ block/Kconfig.iosched | 9 +++++++++ block/blk-cgroup.c | 4 ++++ block/blk-cgroup.h | 13 +++++++++++++ block/cfq-iosched.c | 19 ++++++++++++++++++- 5 files changed, 53 insertions(+), 1 deletion(-) diff --git a/block/Kconfig b/block/Kconfig index 6ba1a8e3388..e20fbde0875 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -90,6 +90,15 @@ config BLK_CGROUP control disk bandwidth allocation (proportional time slice allocation) to such task groups. +config DEBUG_BLK_CGROUP + bool + depends on BLK_CGROUP + default n + ---help--- + Enable some debugging help. Currently it stores the cgroup path + in the blk group which can be used by cfq for tracing various + group related activity. + endif # BLOCK config BLOCK_COMPAT diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index fa95fa77057..b71abfb0d72 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -40,6 +40,15 @@ config CFQ_GROUP_IOSCHED ---help--- Enable group IO scheduling in CFQ. +config DEBUG_CFQ_IOSCHED + bool "Debug CFQ Scheduling" + depends on CFQ_GROUP_IOSCHED + select DEBUG_BLK_CGROUP + default n + ---help--- + Enable CFQ IO scheduling debugging in CFQ. Currently it makes + blktrace output more verbose. + choice prompt "Default I/O scheduler" default DEFAULT_CFQ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0426ab692fd..6bc99a3865b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -33,6 +33,10 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, blkg->blkcg_id = css_id(&blkcg->css); hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); spin_unlock_irqrestore(&blkcg->lock, flags); +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Need to take css reference ? */ + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); +#endif } static void __blkiocg_del_blkio_group(struct blkio_group *blkg) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index cd50a2f8733..3573199b298 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -27,12 +27,25 @@ struct blkio_group { void *key; struct hlist_node blkcg_node; unsigned short blkcg_id; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Store cgroup path */ + char path[128]; +#endif }; #define BLKIO_WEIGHT_MIN 100 #define BLKIO_WEIGHT_MAX 1000 #define BLKIO_WEIGHT_DEFAULT 500 +#ifdef CONFIG_DEBUG_BLK_CGROUP +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} +#else +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +#endif + #ifdef CONFIG_BLK_CGROUP extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 8bc31a50a57..662d4e55b3c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -341,8 +341,21 @@ CFQ_CFQQ_FNS(coop); CFQ_CFQQ_FNS(deep); #undef CFQ_CFQQ_FNS +#ifdef CONFIG_DEBUG_CFQ_IOSCHED +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&(cfqq)->cfqg->blkg), ##args); + +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args); \ + +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +#endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) @@ -832,6 +845,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (cfqg->nr_cfqq) return; + cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); cfqg->on_st = false; cfqd->nr_groups--; st->total_weight -= cfqg->weight; @@ -889,6 +903,9 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfqg->saved_serving_prio = cfqd->serving_prio; } else cfqg->saved_workload_slice = 0; + + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, + st->min_vdisktime); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -3102,7 +3119,7 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) unsigned long now; now = jiffies; - cfq_log_cfqq(cfqd, cfqq, "complete"); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); cfq_update_hw_tag(cfqd); From 220841906fccafaf4094e87bdb6d252e20cf8c7c Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:49 -0500 Subject: [PATCH 093/113] blkio: Export disk time and sectors used by a group to user space o Export disk time and sector used by a group to user space through cgroup interface. o Also export a "dequeue" interface to cgroup which keeps track of how many a times a group was deleted from service tree. Helps in debugging. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 64 ++++++++++++++++++++++++++++++++++++++++++++- block/blk-cgroup.h | 22 ++++++++++++++-- block/cfq-iosched.c | 19 +++++++++++--- 3 files changed, 99 insertions(+), 6 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6bc99a3865b..4ef78d35cbd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -11,6 +11,8 @@ * Nauman Rafique */ #include +#include +#include #include "blk-cgroup.h" extern void cfq_unlink_blkio_group(void *, struct blkio_group *); @@ -23,8 +25,15 @@ struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) struct blkio_cgroup, css); } +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ + blkg->time += time; + blkg->sectors += sectors; +} + void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key) + struct blkio_group *blkg, void *key, dev_t dev) { unsigned long flags; @@ -37,6 +46,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, /* Need to take css reference ? */ cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); #endif + blkg->dev = dev; } static void __blkiocg_del_blkio_group(struct blkio_group *blkg) @@ -115,12 +125,64 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return 0; } +#define SHOW_FUNCTION_PER_GROUP(__VAR) \ +static int blkiocg_##__VAR##_read(struct cgroup *cgroup, \ + struct cftype *cftype, struct seq_file *m) \ +{ \ + struct blkio_cgroup *blkcg; \ + struct blkio_group *blkg; \ + struct hlist_node *n; \ + \ + if (!cgroup_lock_live_group(cgroup)) \ + return -ENODEV; \ + \ + blkcg = cgroup_to_blkio_cgroup(cgroup); \ + rcu_read_lock(); \ + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) {\ + if (blkg->dev) \ + seq_printf(m, "%u:%u %lu\n", MAJOR(blkg->dev), \ + MINOR(blkg->dev), blkg->__VAR); \ + } \ + rcu_read_unlock(); \ + cgroup_unlock(); \ + return 0; \ +} + +SHOW_FUNCTION_PER_GROUP(time); +SHOW_FUNCTION_PER_GROUP(sectors); +#ifdef CONFIG_DEBUG_BLK_CGROUP +SHOW_FUNCTION_PER_GROUP(dequeue); +#endif +#undef SHOW_FUNCTION_PER_GROUP + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->dequeue += dequeue; +} +#endif + struct cftype blkio_files[] = { { .name = "weight", .read_u64 = blkiocg_weight_read, .write_u64 = blkiocg_weight_write, }, + { + .name = "time", + .read_seq_string = blkiocg_time_read, + }, + { + .name = "sectors", + .read_seq_string = blkiocg_sectors_read, + }, +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "dequeue", + .read_seq_string = blkiocg_dequeue_read, + }, +#endif }; static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 3573199b298..b24ab71db82 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -30,7 +30,15 @@ struct blkio_group { #ifdef CONFIG_DEBUG_BLK_CGROUP /* Store cgroup path */ char path[128]; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; #endif + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + + /* total disk time and nr sectors dispatched by this group */ + unsigned long time; + unsigned long sectors; }; #define BLKIO_WEIGHT_MIN 100 @@ -42,24 +50,30 @@ static inline char *blkg_path(struct blkio_group *blkg) { return blkg->path; } +void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue); #else static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } +static inline void blkiocg_update_blkio_group_dequeue_stats( + struct blkio_group *blkg, unsigned long dequeue) {} #endif #ifdef CONFIG_BLK_CGROUP extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key); + struct blkio_group *blkg, void *key, dev_t dev); extern int blkiocg_del_blkio_group(struct blkio_group *blkg); extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key); +void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors); #else static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, - struct blkio_group *blkg, void *key) + struct blkio_group *blkg, void *key, dev_t dev) { } @@ -68,5 +82,9 @@ blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } static inline struct blkio_group * blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +static inline void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, + unsigned long time, unsigned long sectors) +{ +} #endif #endif /* _BLK_CGROUP_H */ diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 662d4e55b3c..7d345e772d8 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -143,6 +143,8 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; + /* Sectors dispatched in current dispatch round */ + unsigned long nr_sectors; }; /* @@ -852,6 +854,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) if (!RB_EMPTY_NODE(&cfqg->rb_node)) cfq_rb_erase(&cfqg->rb_node, st); cfqg->saved_workload_slice = 0; + blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); } static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) @@ -878,7 +881,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) slice_used = allocated_slice; } - cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, + cfqq->nr_sectors); return slice_used; } @@ -906,6 +910,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, st->min_vdisktime); + blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, + cfqq->nr_sectors); } #ifdef CONFIG_CFQ_GROUP_IOSCHED @@ -924,6 +930,8 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) void *key = cfqd; int i, j; struct cfq_rb_root *st; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; /* Do we need to take this reference */ if (!css_tryget(&blkcg->css)) @@ -951,7 +959,9 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) atomic_set(&cfqg->ref, 1); /* Add group onto cgroup list */ - blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd); + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, + MKDEV(major, minor)); /* Add group on cfqd list */ hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); @@ -1478,6 +1488,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfqq->dispatch_start = jiffies; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -1801,6 +1812,7 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) if (cfq_cfqq_sync(cfqq)) cfqd->sync_flight++; + cfqq->nr_sectors += blk_rq_sectors(rq); } /* @@ -3513,7 +3525,8 @@ static void *cfq_init_queue(struct request_queue *q) * to make sure that cfq_put_cfqg() does not try to kfree root group */ atomic_set(&cfqg->ref, 1); - blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd); + blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, + 0); #endif /* * Not strictly needed (since RB_ROOT just clears the node and we From 8682e1f15f26dae9a9e8af794d179055fbd81166 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:50 -0500 Subject: [PATCH 094/113] blkio: Provide some isolation between groups o Do not allow following three operations across groups for isolation. - selection of co-operating queues - preemtpions across groups - request merging across groups. o Async queues are currently global and not per group. Allow preemption of an async queue if a sync queue in other group gets backlogged. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7d345e772d8..3a62ce95dae 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1461,6 +1461,9 @@ static int cfq_allow_merge(struct request_queue *q, struct request *rq, struct cfq_io_context *cic; struct cfq_queue *cfqq; + /* Deny merge if bio and rq don't belong to same cfq group */ + if ((RQ_CFQQ(rq))->cfqg != cfq_get_cfqg(cfqd, 0)) + return false; /* * Disallow merge of a sync bio into an async request. */ @@ -1698,6 +1701,10 @@ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, if (!cfqq) return NULL; + /* If new queue belongs to different cfq_group, don't choose it */ + if (cur_cfqq->cfqg != cfqq->cfqg) + return NULL; + /* * It only makes sense to merge sync queues. */ @@ -2950,22 +2957,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (!cfqq) return false; - if (cfq_slice_used(cfqq)) - return true; - if (cfq_class_idle(new_cfqq)) return false; if (cfq_class_idle(cfqq)) return true; - /* Allow preemption only if we are idling on sync-noidle tree */ - if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && - cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - new_cfqq->service_tree->count == 2 && - RB_EMPTY_ROOT(&cfqq->sort_list)) - return true; - /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. @@ -2973,6 +2970,19 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; + if (new_cfqq->cfqg != cfqq->cfqg) + return false; + + if (cfq_slice_used(cfqq)) + return true; + + /* Allow preemption only if we are idling on sync-noidle tree */ + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && + new_cfqq->service_tree->count == 2 && + RB_EMPTY_ROOT(&cfqq->sort_list)) + return true; + /* * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. From 24610333d578478d354144ab4709a203684afc5f Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:51 -0500 Subject: [PATCH 095/113] blkio: Drop the reference to queue once the task changes cgroup o If a task changes cgroup, drop reference to the cfqq associated with io context and set cfqq pointer stored in ioc to NULL so that upon next request arrival we will allocate a new queue in new group. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3a62ce95dae..3d99e45789b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2608,6 +2608,41 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->pid = pid; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + if (sync_cfqq) { + /* + * Drop reference to sync queue. A new sync queue will be + * assigned in new group upon arrival of a fresh request. + */ + cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) @@ -2840,6 +2875,10 @@ out: if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); From f8d461d692c341add957fb973fb5ee1f62039dc7 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:52 -0500 Subject: [PATCH 096/113] blkio: Propagate cgroup weight updation to cfq groups o Propagate blkio cgroup weight updation to associated cfq groups. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +++++++ block/cfq-iosched.c | 6 ++++++ 2 files changed, 13 insertions(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4ef78d35cbd..179ddfaebc5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -16,6 +16,7 @@ #include "blk-cgroup.h" extern void cfq_unlink_blkio_group(void *, struct blkio_group *); +extern void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; @@ -116,12 +117,18 @@ static int blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) { struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct hlist_node *n; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + cfq_update_blkio_group_weight(blkg, blkcg->weight); + spin_unlock_irq(&blkcg->lock); return 0; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3d99e45789b..f7364621613 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -922,6 +922,12 @@ static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) return NULL; } +void +cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + cfqg_of_blkg(blkg)->weight = weight; +} + static struct cfq_group * cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) { From f75edf2dc828802d358393be80a6c89e919f8273 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:53 -0500 Subject: [PATCH 097/113] blkio: Wait for cfq queue to get backlogged if group is empty o If a queue consumes its slice and then gets deleted from service tree, its associated group will also get deleted from service tree if this was the only queue in the group. That will make group loose its share. o For the queues on which we have idling on and if these have used their slice, wait a bit for these queues to get backlogged again and then expire these queues so that group does not loose its share. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index f7364621613..1cc10489eaf 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -117,6 +117,7 @@ struct cfq_queue { /* time when queue got scheduled in to dispatch first request. */ unsigned long dispatch_start; + unsigned int allocated_slice; /* time when first request from queue completed and slice started. */ unsigned long slice_start; unsigned long slice_end; @@ -314,6 +315,8 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ + CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ + CFQ_CFQQ_FLAG_wait_busy_done, /* Got new request. Expire the queue */ }; #define CFQ_CFQQ_FNS(name) \ @@ -341,6 +344,8 @@ CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); CFQ_CFQQ_FNS(deep); +CFQ_CFQQ_FNS(wait_busy); +CFQ_CFQQ_FNS(wait_busy_done); #undef CFQ_CFQQ_FNS #ifdef CONFIG_DEBUG_CFQ_IOSCHED @@ -578,6 +583,7 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) } cfqq->slice_start = jiffies; cfqq->slice_end = jiffies + slice; + cfqq->allocated_slice = slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -859,7 +865,7 @@ cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) { - unsigned int slice_used, allocated_slice; + unsigned int slice_used; /* * Queue got expired before even a single request completed or @@ -876,9 +882,8 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 1); } else { slice_used = jiffies - cfqq->slice_start; - allocated_slice = cfqq->slice_end - cfqq->slice_start; - if (slice_used > allocated_slice) - slice_used = allocated_slice; + if (slice_used > cfqq->allocated_slice) + slice_used = cfqq->allocated_slice; } cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, @@ -1495,6 +1500,7 @@ static void __cfq_set_active_queue(struct cfq_data *cfqd, cfq_log_cfqq(cfqd, cfqq, "set_active"); cfqq->slice_start = 0; cfqq->dispatch_start = jiffies; + cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; cfqq->nr_sectors = 0; @@ -1524,6 +1530,8 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, del_timer(&cfqd->idle_slice_timer); cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_wait_busy(cfqq); + cfq_clear_cfqq_wait_busy_done(cfqq); /* * store what was left of this slice, if the queue idled/timed out @@ -2066,7 +2074,8 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) /* * The active queue has run out of time, expire it and select new. */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) + if ((cfq_slice_used(cfqq) || cfq_cfqq_wait_busy_done(cfqq)) + && !cfq_cfqq_must_dispatch(cfqq)) goto expire; /* @@ -3096,6 +3105,10 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { + if (cfq_cfqq_wait_busy(cfqq)) { + cfq_clear_cfqq_wait_busy(cfqq); + cfq_mark_cfqq_wait_busy_done(cfqq); + } /* * Remember that we saw a request from this process, but * don't start queuing just yet. Otherwise we risk seeing lots @@ -3214,6 +3227,17 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) cfq_set_prio_slice(cfqd, cfqq); cfq_clear_cfqq_slice_new(cfqq); } + + /* + * If this queue consumed its slice and this is last queue + * in the group, wait for next request before we expire + * the queue + */ + if (cfq_slice_used(cfqq) && cfqq->cfqg->nr_cfqq == 1) { + cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; + cfq_mark_cfqq_wait_busy(cfqq); + } + /* * Idling is not enabled on: * - expired queues From f26bd1f0a3a31bc5e16d285f5e1b00a56abf6238 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:54 -0500 Subject: [PATCH 098/113] blkio: Determine async workload length based on total number of queues o Async queues are not per group. Instead these are system wide and maintained in root group. Hence their workload slice length should be calculated based on total number of queues in the system and not just queues in the root group. o As root group's default weight is 1000, make sure to charge async queue more in terms of vtime so that it does not get more time on disk because root group has higher weight. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 1cc10489eaf..b9e483d9031 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -408,6 +408,13 @@ static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + cfqg->service_trees[wl][SYNC_WORKLOAD].count; } +static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; +} + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); @@ -895,13 +902,19 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, struct cfq_queue *cfqq) { struct cfq_rb_root *st = &cfqd->grp_service_tree; - unsigned int used_sl; + unsigned int used_sl, charge_sl; + int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) + - cfqg->service_tree_idle.count; - used_sl = cfq_cfqq_slice_usage(cfqq); + BUG_ON(nr_sync < 0); + used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); + + if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge_sl = cfqq->allocated_slice; /* Can't update vdisktime while group is on service tree */ cfq_rb_erase(&cfqg->rb_node, st); - cfqg->vdisktime += cfq_scale_slice(used_sl, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); __cfq_group_service_tree_add(st, cfqg); /* This group is being expired. Save the context */ @@ -2016,11 +2029,24 @@ static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); - if (cfqd->serving_type == ASYNC_WORKLOAD) + if (cfqd->serving_type == ASYNC_WORKLOAD) { + unsigned int tmp; + + /* + * Async queues are currently system wide. Just taking + * proportion of queues with-in same group will lead to higher + * async ratio system wide as generally root group is going + * to have higher weight. A more accurate thing would be to + * calculate system wide asnc/sync ratio. + */ + tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); + tmp = tmp/cfqd->busy_queues; + slice = min_t(unsigned, slice, tmp); + /* async workload slice is scaled down according to * the sync/async slice ratio. */ slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; - else + } else /* sync workload slice is at least 2 * cfq_slice_idle */ slice = max(slice, 2 * cfqd->cfq_slice_idle); From ae30c286553c91c49af5cbc0265a05a6543d0c52 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:55 -0500 Subject: [PATCH 099/113] blkio: Implement group_isolation tunable o If a group is running only a random reader, then it will not have enough traffic to keep disk busy and we will reduce overall throughput. This should result in better latencies for random reader though. If we don't idle on random reader service tree, then this random reader will experience large latencies if there are other groups present in system with sequential readers running in these. o One solution suggested by corrado is that by default keep the random readers or sync-noidle workload in root group so that during one dispatch round we idle only once on sync-noidle tree. This means that all the sync-idle workload queues will be in their respective group and we will see service differentiation in those but not on sync-noidle workload. o Provide a tunable group_isolation. If set, this will make sure that even sync-noidle queues go in their respective group and we wait on these. This provides stronger isolation between groups but at the expense of throughput if group does not have enough traffic to keep the disk busy. o By default group_isolation = 0 Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b9e483d9031..063dcbb714e 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -144,6 +144,7 @@ struct cfq_queue { struct cfq_rb_root *service_tree; struct cfq_queue *new_cfqq; struct cfq_group *cfqg; + struct cfq_group *orig_cfqg; /* Sectors dispatched in current dispatch round */ unsigned long nr_sectors; }; @@ -273,6 +274,7 @@ struct cfq_data { unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; unsigned int cfq_latency; + unsigned int cfq_group_isolation; struct list_head cic_list; @@ -1120,6 +1122,33 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct cfq_rb_root *service_tree; int left; int new_cfqq = 1; + int group_changed = 0; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD + && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { + /* Move this cfq to root group */ + cfq_log_cfqq(cfqd, cfqq, "moving to root group"); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfqq->orig_cfqg = cfqq->cfqg; + cfqq->cfqg = &cfqd->root_group; + atomic_inc(&cfqd->root_group.ref); + group_changed = 1; + } else if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { + /* cfqq is sequential now needs to go to its original group */ + BUG_ON(cfqq->cfqg != &cfqd->root_group); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_service_tree_del(cfqd, cfqq->cfqg); + cfq_put_cfqg(cfqq->cfqg); + cfqq->cfqg = cfqq->orig_cfqg; + cfqq->orig_cfqg = NULL; + group_changed = 1; + cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); + } +#endif service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), cfqq_type(cfqq), cfqd); @@ -1190,7 +1219,7 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, rb_link_node(&cfqq->rb_node, parent, p); rb_insert_color(&cfqq->rb_node, &service_tree->rb); service_tree->count++; - if (add_front || !new_cfqq) + if ((add_front || !new_cfqq) && !group_changed) return; cfq_group_service_tree_add(cfqd, cfqq->cfqg); } @@ -2357,6 +2386,8 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); cfq_put_cfqg(cfqg); + if (cfqq->orig_cfqg) + cfq_put_cfqg(cfqq->orig_cfqg); } /* @@ -3670,6 +3701,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; cfqd->cfq_latency = 1; + cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; cfqd->last_end_sync_rq = jiffies; return cfqd; @@ -3740,6 +3772,7 @@ SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); +SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -3772,6 +3805,7 @@ STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); +STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -3788,6 +3822,7 @@ static struct elv_fs_entry cfq_attrs[] = { CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), CFQ_ATTR(low_latency), + CFQ_ATTR(group_isolation), __ATTR_NULL }; From c04645e592d4dd60c58def40c913699d4c806727 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:56 -0500 Subject: [PATCH 100/113] blkio: Wait on sync-noidle queue even if rq_noidle = 1 o rq_noidle() is supposed to tell cfq that do not expect a request after this one, hence don't idle. But this does not seem to work very well. For example for direct random readers, rq_noidle = 1 but there is next request coming after this. Not idling, leads to a group not getting its share even if group_isolation=1. o The right solution for this issue is to scan the higher layers and set right flag (WRITE_SYNC or WRITE_ODIRECT). For the time being, this single line fix helps. This should not have any significant impact when we are not using cgroups. I will later figure out IO paths in higher layer and fix it. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 063dcbb714e..08b057b1b3b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3314,7 +3314,8 @@ static void cfq_completed_request(struct request_queue *q, struct request *rq) * only if we processed at least one !rq_noidle request */ if (cfqd->serving_type == SYNC_WORKLOAD - || cfqd->noidle_tree_requires_idle) + || cfqd->noidle_tree_requires_idle + || cfqq->cfqg->nr_cfqq == 1) cfq_arm_slice_timer(cfqd); } } From 72f924f62a6eb375c7c237ecc911f95be0531d1a Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:57 -0500 Subject: [PATCH 101/113] blkio: Documentation Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- Documentation/cgroups/blkio-controller.txt | 135 +++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 Documentation/cgroups/blkio-controller.txt diff --git a/Documentation/cgroups/blkio-controller.txt b/Documentation/cgroups/blkio-controller.txt new file mode 100644 index 00000000000..630879cd9a4 --- /dev/null +++ b/Documentation/cgroups/blkio-controller.txt @@ -0,0 +1,135 @@ + Block IO Controller + =================== +Overview +======== +cgroup subsys "blkio" implements the block io controller. There seems to be +a need of various kinds of IO control policies (like proportional BW, max BW) +both at leaf nodes as well as at intermediate nodes in a storage hierarchy. +Plan is to use the same cgroup based management interface for blkio controller +and based on user options switch IO policies in the background. + +In the first phase, this patchset implements proportional weight time based +division of disk policy. It is implemented in CFQ. Hence this policy takes +effect only on leaf nodes when CFQ is being used. + +HOWTO +===== +You can do a very simple testing of running two dd threads in two different +cgroups. Here is what you can do. + +- Enable group scheduling in CFQ + CONFIG_CFQ_GROUP_IOSCHED=y + +- Compile and boot into kernel and mount IO controller (blkio). + + mount -t cgroup -o blkio none /cgroup + +- Create two cgroups + mkdir -p /cgroup/test1/ /cgroup/test2 + +- Set weights of group test1 and test2 + echo 1000 > /cgroup/test1/blkio.weight + echo 500 > /cgroup/test2/blkio.weight + +- Create two same size files (say 512MB each) on same disk (file1, file2) and + launch two dd threads in different cgroup to read those files. + + sync + echo 3 > /proc/sys/vm/drop_caches + + dd if=/mnt/sdb/zerofile1 of=/dev/null & + echo $! > /cgroup/test1/tasks + cat /cgroup/test1/tasks + + dd if=/mnt/sdb/zerofile2 of=/dev/null & + echo $! > /cgroup/test2/tasks + cat /cgroup/test2/tasks + +- At macro level, first dd should finish first. To get more precise data, keep + on looking at (with the help of script), at blkio.disk_time and + blkio.disk_sectors files of both test1 and test2 groups. This will tell how + much disk time (in milli seconds), each group got and how many secotors each + group dispatched to the disk. We provide fairness in terms of disk time, so + ideally io.disk_time of cgroups should be in proportion to the weight. + +Various user visible config options +=================================== +CONFIG_CFQ_GROUP_IOSCHED + - Enables group scheduling in CFQ. Currently only 1 level of group + creation is allowed. + +CONFIG_DEBUG_CFQ_IOSCHED + - Enables some debugging messages in blktrace. Also creates extra + cgroup file blkio.dequeue. + +Config options selected automatically +===================================== +These config options are not user visible and are selected/deselected +automatically based on IO scheduler configuration. + +CONFIG_BLK_CGROUP + - Block IO controller. Selected by CONFIG_CFQ_GROUP_IOSCHED. + +CONFIG_DEBUG_BLK_CGROUP + - Debug help. Selected by CONFIG_DEBUG_CFQ_IOSCHED. + +Details of cgroup files +======================= +- blkio.weight + - Specifies per cgroup weight. + + Currently allowed range of weights is from 100 to 1000. + +- blkio.time + - disk time allocated to cgroup per device in milliseconds. First + two fields specify the major and minor number of the device and + third field specifies the disk time allocated to group in + milliseconds. + +- blkio.sectors + - number of sectors transferred to/from disk by the group. First + two fields specify the major and minor number of the device and + third field specifies the number of sectors transferred by the + group to/from the device. + +- blkio.dequeue + - Debugging aid only enabled if CONFIG_DEBUG_CFQ_IOSCHED=y. This + gives the statistics about how many a times a group was dequeued + from service tree of the device. First two fields specify the major + and minor number of the device and third field specifies the number + of times a group was dequeued from a particular device. + +CFQ sysfs tunable +================= +/sys/block//queue/iosched/group_isolation + +If group_isolation=1, it provides stronger isolation between groups at the +expense of throughput. By default group_isolation is 0. In general that +means that if group_isolation=0, expect fairness for sequential workload +only. Set group_isolation=1 to see fairness for random IO workload also. + +Generally CFQ will put random seeky workload in sync-noidle category. CFQ +will disable idling on these queues and it does a collective idling on group +of such queues. Generally these are slow moving queues and if there is a +sync-noidle service tree in each group, that group gets exclusive access to +disk for certain period. That means it will bring the throughput down if +group does not have enough IO to drive deeper queue depths and utilize disk +capacity to the fullest in the slice allocated to it. But the flip side is +that even a random reader should get better latencies and overall throughput +if there are lots of sequential readers/sync-idle workload running in the +system. + +If group_isolation=0, then CFQ automatically moves all the random seeky queues +in the root group. That means there will be no service differentiation for +that kind of workload. This leads to better throughput as we do collective +idling on root sync-noidle tree. + +By default one should run with group_isolation=0. If that is not sufficient +and one wants stronger isolation between groups, then set group_isolation=1 +but this will come at cost of reduced throughput. + +What works +========== +- Currently only sync IO queues are support. All the buffered writes are + still system wide and not per group. Hence we will not see service + differentiation between buffered writes between groups. From 2f5ea47712489a9d2d3cb832eb06062e4e64e0ec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Dec 2009 21:06:43 +0100 Subject: [PATCH 102/113] cfq-iosched: fix compile problem with !CONFIG_CGROUP Signed-off-by: Jens Axboe --- block/blk-cgroup.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b24ab71db82..257dc6a956b 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,6 +15,8 @@ #include +#ifdef CONFIG_BLK_CGROUP + struct blkio_cgroup { struct cgroup_subsys_state css; unsigned int weight; @@ -41,6 +43,13 @@ struct blkio_group { unsigned long sectors; }; +#else + +struct blkio_group { +}; + +#endif + #define BLKIO_WEIGHT_MIN 100 #define BLKIO_WEIGHT_MAX 1000 #define BLKIO_WEIGHT_DEFAULT 500 @@ -69,6 +78,7 @@ extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, unsigned long time, unsigned long sectors); #else +struct cgroup; static inline struct blkio_cgroup * cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } From f2eecb91522686edf8199947b77f435a4031d92f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Dec 2009 10:06:35 +0100 Subject: [PATCH 103/113] cfq-iosched: move IO controller declerations to a header file They should not be declared inside some other file that's not related to CFQ. Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 +--- block/cfq-iosched.c | 1 + block/cfq-iosched.h | 7 +++++++ 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 block/cfq-iosched.h diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 179ddfaebc5..73a5525cc0a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -14,9 +14,7 @@ #include #include #include "blk-cgroup.h" - -extern void cfq_unlink_blkio_group(void *, struct blkio_group *); -extern void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); +#include "cfq-iosched.h" struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 08b057b1b3b..43ec3340cb5 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,6 +14,7 @@ #include #include #include "blk-cgroup.h" +#include "cfq-iosched.h" /* * tunables diff --git a/block/cfq-iosched.h b/block/cfq-iosched.h new file mode 100644 index 00000000000..ef7b4798a34 --- /dev/null +++ b/block/cfq-iosched.h @@ -0,0 +1,7 @@ +#ifndef CFQ_IOSCHED_H +#define CFQ_IOSCHED_H + +void cfq_unlink_blkio_group(void *, struct blkio_group *); +void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); + +#endif From 237e5bc4e51813e9d8ba9da0f63e7acc608882d7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 4 Dec 2009 10:07:38 +0100 Subject: [PATCH 104/113] io controller: quick fix for blk-cgroup and modular CFQ It's currently not an allowed configuration, so express that in Kconfig. Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b71abfb0d72..5368d74207e 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -34,7 +34,7 @@ config IOSCHED_CFQ config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && CGROUPS + depends on IOSCHED_CFQ=y && CGROUPS select BLK_CGROUP default n ---help--- From 3c764b7a654668dd04905841d6024f7b6aa843a5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 4 Dec 2009 13:12:06 +0100 Subject: [PATCH 105/113] cfq-iosched: make nonrot check logic consistent cfq_arm_slice_timer() has logic to disable idle window for SSD device. The same thing should be done at cfq_select_queue() too, otherwise we will still see idle window. This makes the nonrot check logic consistent in cfq. Tests in a intel SSD with low_latency knob close, below patch can triple disk thoughput for muti-thread sequential read. Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 43ec3340cb5..b00ca4c86e2 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1796,7 +1796,8 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) return false; /* We do for queues that were marked with idle window flag. */ - if (cfq_cfqq_idle_window(cfqq)) + if (cfq_cfqq_idle_window(cfqq) && + !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) return true; /* From 61cc74fbb87af6aa551a06a370590c9bc07e29d9 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Fri, 4 Dec 2009 14:52:41 +0100 Subject: [PATCH 106/113] block: Fix io_context leak after clone with CLONE_IO With CLONE_IO, copy_io() increments both ioc->refcount and ioc->nr_tasks. However exit_io_context() only decrements ioc->refcount if ioc->nr_tasks reaches 0. Always call put_io_context() in exit_io_context(). Signed-off-by: Louis Rilling Signed-off-by: Jens Axboe --- block/blk-ioc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index d4ed6000147..dcd041290b2 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -80,8 +80,8 @@ void exit_io_context(void) ioc->aic->exit(ioc->aic); cfq_exit(ioc); - put_io_context(ioc); } + put_io_context(ioc); } struct io_context *alloc_io_context(gfp_t gfp_flags, int node) From b69f2292063d2caf37ca9aec7d63ded203701bf3 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Fri, 4 Dec 2009 14:52:42 +0100 Subject: [PATCH 107/113] block: Fix io_context leak after failure of clone with CLONE_IO With CLONE_IO, parent's io_context->nr_tasks is incremented, but never decremented whenever copy_process() fails afterwards, which prevents exit_io_context() from calling IO schedulers exit functions. Give a task_struct to exit_io_context(), and call exit_io_context() instead of put_io_context() in copy_process() cleanup path. Signed-off-by: Louis Rilling Signed-off-by: Jens Axboe --- block/blk-ioc.c | 10 +++++----- include/linux/iocontext.h | 5 +++-- kernel/exit.c | 2 +- kernel/fork.c | 3 ++- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index dcd041290b2..cbdabb0dd6d 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -66,14 +66,14 @@ static void cfq_exit(struct io_context *ioc) } /* Called by the exitting task */ -void exit_io_context(void) +void exit_io_context(struct task_struct *task) { struct io_context *ioc; - task_lock(current); - ioc = current->io_context; - current->io_context = NULL; - task_unlock(current); + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); if (atomic_dec_and_test(&ioc->nr_tasks)) { if (ioc->aic && ioc->aic->exit) diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index d61b0b8b5cd..a6323599630 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -98,14 +98,15 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) return NULL; } +struct task_struct; #ifdef CONFIG_BLOCK int put_io_context(struct io_context *ioc); -void exit_io_context(void); +void exit_io_context(struct task_struct *task); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); void copy_io_context(struct io_context **pdst, struct io_context **psrc); #else -static inline void exit_io_context(void) +static inline void exit_io_context(struct task_struct *task) { } diff --git a/kernel/exit.c b/kernel/exit.c index f7864ac2ecc..2544000125d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1004,7 +1004,7 @@ NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITPIDONE; if (tsk->io_context) - exit_io_context(); + exit_io_context(tsk); if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); diff --git a/kernel/fork.c b/kernel/fork.c index 166b8c49257..607353425bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1310,7 +1310,8 @@ bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_io: - put_io_context(p->io_context); + if (p->io_context) + exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: From 9d6a986c0b276085f7944cd8ad65f4f82aff7536 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 4 Dec 2009 10:36:41 -0500 Subject: [PATCH 108/113] blkio: Export some symbols from blkio as its user CFQ can be a module o blkio controller is inside the kernel and cfq makes use of interfaces exported by blkio. CFQ can be a module too, hence export symbols used by CFQ. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 22 ++++++++++++++++++++++ block/blk-cgroup.h | 3 +++ block/cfq-iosched.c | 4 ++-- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 73a5525cc0a..4d4a277b290 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -13,16 +13,33 @@ #include #include #include +#include #include "blk-cgroup.h" #include "cfq-iosched.h" struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; +EXPORT_SYMBOL_GPL(blkio_root_cgroup); + +bool blkiocg_css_tryget(struct blkio_cgroup *blkcg) +{ + if (!css_tryget(&blkcg->css)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(blkiocg_css_tryget); + +void blkiocg_css_put(struct blkio_cgroup *blkcg) +{ + css_put(&blkcg->css); +} +EXPORT_SYMBOL_GPL(blkiocg_css_put); struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), struct blkio_cgroup, css); } +EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, unsigned long time, unsigned long sectors) @@ -30,6 +47,7 @@ void blkiocg_update_blkio_group_stats(struct blkio_group *blkg, blkg->time += time; blkg->sectors += sectors; } +EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_stats); void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, struct blkio_group *blkg, void *key, dev_t dev) @@ -47,6 +65,7 @@ void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, #endif blkg->dev = dev; } +EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); static void __blkiocg_del_blkio_group(struct blkio_group *blkg) { @@ -81,6 +100,7 @@ out: rcu_read_unlock(); return ret; } +EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); /* called under rcu_read_lock(). */ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) @@ -97,6 +117,7 @@ struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) return NULL; } +EXPORT_SYMBOL_GPL(blkiocg_lookup_group); #define SHOW_FUNCTION(__VAR) \ static u64 blkiocg_##__VAR##_read(struct cgroup *cgroup, \ @@ -166,6 +187,7 @@ void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg, { blkg->dequeue += dequeue; } +EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats); #endif struct cftype blkio_files[] = { diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 257dc6a956b..4f89b967467 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -43,6 +43,9 @@ struct blkio_group { unsigned long sectors; }; +extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); +extern void blkiocg_css_put(struct blkio_cgroup *blkcg); + #else struct blkio_group { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index b00ca4c86e2..7f3f343b0c6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -961,7 +961,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) unsigned int major, minor; /* Do we need to take this reference */ - if (!css_tryget(&blkcg->css)) + if (!blkiocg_css_tryget(blkcg)) return NULL;; cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); @@ -994,7 +994,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); done: - css_put(&blkcg->css); + blkiocg_css_put(blkcg); return cfqg; } From 3e2520668970aab5a764044a298e987aafc1f63d Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 4 Dec 2009 10:36:42 -0500 Subject: [PATCH 109/113] blkio: Implement dynamic io controlling policy registration o One of the goals of block IO controller is that it should be able to support mulitple io control policies, some of which be operational at higher level in storage hierarchy. o To begin with, we had one io controlling policy implemented by CFQ, and I hard coded the CFQ functions called by blkio. This created issues when CFQ is compiled as module. o This patch implements a basic dynamic io controlling policy registration functionality in blkio. This is similar to elevator functionality where ioschedulers register the functions dynamically. o Now in future, when more IO controlling policies are implemented, these can dynakically register with block IO controller. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 36 ++++++++++++++++++++++++++++++++---- block/blk-cgroup.h | 24 ++++++++++++++++++++++++ block/cfq-iosched.c | 14 +++++++++++++- block/cfq-iosched.h | 7 ------- 4 files changed, 69 insertions(+), 12 deletions(-) delete mode 100644 block/cfq-iosched.h diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4d4a277b290..3ad497f4eed 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,7 +15,9 @@ #include #include #include "blk-cgroup.h" -#include "cfq-iosched.h" + +static DEFINE_SPINLOCK(blkio_list_lock); +static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); @@ -138,6 +140,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) struct blkio_cgroup *blkcg; struct blkio_group *blkg; struct hlist_node *n; + struct blkio_policy_type *blkiop; if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) return -EINVAL; @@ -145,8 +148,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) blkcg = cgroup_to_blkio_cgroup(cgroup); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; - hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) - cfq_update_blkio_group_weight(blkg, blkcg->weight); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_update_group_weight_fn(blkg, + blkcg->weight); + spin_unlock(&blkio_list_lock); + } spin_unlock_irq(&blkcg->lock); return 0; } @@ -224,6 +232,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) unsigned long flags; struct blkio_group *blkg; void *key; + struct blkio_policy_type *blkiop; rcu_read_lock(); remove_entry: @@ -249,7 +258,10 @@ remove_entry: * we have more policies in place, we need some dynamic registration * of callback function. */ - cfq_unlink_blkio_group(key, blkg); + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) + blkiop->ops.blkio_unlink_group_fn(key, blkg); + spin_unlock(&blkio_list_lock); goto remove_entry; done: free_css_id(&blkio_subsys, &blkcg->css); @@ -330,3 +342,19 @@ struct cgroup_subsys blkio_subsys = { .subsys_id = blkio_subsys_id, .use_id = 1, }; + +void blkio_policy_register(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_add_tail(&blkiop->list, &blkio_list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_register); + +void blkio_policy_unregister(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_del_init(&blkiop->list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_unregister); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 4f89b967467..4d316df863b 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -46,11 +46,35 @@ struct blkio_group { extern bool blkiocg_css_tryget(struct blkio_cgroup *blkcg); extern void blkiocg_css_put(struct blkio_cgroup *blkcg); +typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); +typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg, + unsigned int weight); + +struct blkio_policy_ops { + blkio_unlink_group_fn *blkio_unlink_group_fn; + blkio_update_group_weight_fn *blkio_update_group_weight_fn; +}; + +struct blkio_policy_type { + struct list_head list; + struct blkio_policy_ops ops; +}; + +/* Blkio controller policy registration */ +extern void blkio_policy_register(struct blkio_policy_type *); +extern void blkio_policy_unregister(struct blkio_policy_type *); + #else struct blkio_group { }; +struct blkio_policy_type { +}; + +static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } +static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } + #endif #define BLKIO_WEIGHT_MIN 100 diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 7f3f343b0c6..78f4829895b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,7 +14,6 @@ #include #include #include "blk-cgroup.h" -#include "cfq-iosched.h" /* * tunables @@ -3855,6 +3854,17 @@ static struct elevator_type iosched_cfq = { .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static struct blkio_policy_type blkio_policy_cfq = { + .ops = { + .blkio_unlink_group_fn = cfq_unlink_blkio_group, + .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, + }, +}; +#else +static struct blkio_policy_type blkio_policy_cfq; +#endif + static int __init cfq_init(void) { /* @@ -3869,6 +3879,7 @@ static int __init cfq_init(void) return -ENOMEM; elv_register(&iosched_cfq); + blkio_policy_register(&blkio_policy_cfq); return 0; } @@ -3876,6 +3887,7 @@ static int __init cfq_init(void) static void __exit cfq_exit(void) { DECLARE_COMPLETION_ONSTACK(all_gone); + blkio_policy_unregister(&blkio_policy_cfq); elv_unregister(&iosched_cfq); ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ diff --git a/block/cfq-iosched.h b/block/cfq-iosched.h deleted file mode 100644 index ef7b4798a34..00000000000 --- a/block/cfq-iosched.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef CFQ_IOSCHED_H -#define CFQ_IOSCHED_H - -void cfq_unlink_blkio_group(void *, struct blkio_group *); -void cfq_update_blkio_group_weight(struct blkio_group *, unsigned int); - -#endif From 846954b0a32f4ae953e082eabd178e7a98dd2efd Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Fri, 4 Dec 2009 10:36:43 -0500 Subject: [PATCH 110/113] blkio: Allow CFQ group IO scheduling even when CFQ is a module o Now issues of blkio controller and CFQ in module mode should be fixed. Enable the cfq group scheduling support in module mode. Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/Kconfig.iosched | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 5368d74207e..b71abfb0d72 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -34,7 +34,7 @@ config IOSCHED_CFQ config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ=y && CGROUPS + depends on IOSCHED_CFQ && CGROUPS select BLK_CGROUP default n ---help--- From bb729bc98c0f3e6a898d8730df3e2830bf68751a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 6 Dec 2009 09:54:19 +0100 Subject: [PATCH 111/113] cfq-iosched: use call_rcu() instead of doing grace period stall on queue exit After the merge of the IO controller patches, booting on my megaraid box ran much slower. Vivek Goyal traced it down to megaraid discovery creating tons of devices, each suffering a grace period when they later kill that queue (if no device is found). So lets use call_rcu() to batch these deferred frees, instead of taking the grace period hit for each one. Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 78f4829895b..3815f9789b6 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -287,6 +287,7 @@ struct cfq_data { /* List of cfq groups being managed on this device*/ struct hlist_head cfqg_list; + struct rcu_head rcu; }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); @@ -3601,6 +3602,11 @@ static void cfq_put_async_queues(struct cfq_data *cfqd) cfq_put_queue(cfqd->async_idle_cfqq); } +static void cfq_cfqd_free(struct rcu_head *head) +{ + kfree(container_of(head, struct cfq_data, rcu)); +} + static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -3630,8 +3636,7 @@ static void cfq_exit_queue(struct elevator_queue *e) cfq_shutdown_timer_wq(cfqd); /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ - synchronize_rcu(); - kfree(cfqd); + call_rcu(&cfqd->rcu, cfq_cfqd_free); } static void *cfq_init_queue(struct request_queue *q) @@ -3706,6 +3711,7 @@ static void *cfq_init_queue(struct request_queue *q) cfqd->cfq_group_isolation = 0; cfqd->hw_tag = -1; cfqd->last_end_sync_rq = jiffies; + INIT_RCU_HEAD(&cfqd->rcu); return cfqd; } From accee7854b378a8ab5995d8f5dc5d8abc3b3d23a Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 7 Dec 2009 19:29:39 +1100 Subject: [PATCH 112/113] block: include linux/err.h to use ERR_PTR Signed-off-by: Stephen Rothwell Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 3ad497f4eed..1fa2654db0a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "blk-cgroup.h" static DEFINE_SPINLOCK(blkio_list_lock); From 878eaddd05d251cefa9632c2b8046833c5eead66 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 7 Dec 2009 19:37:15 +0100 Subject: [PATCH 113/113] cfq-iosched: Do not access cfqq after freeing it Fix a crash during boot reported by Jeff Moyer. Fix the issue of accessing cfqq after freeing it. Reported-by: Jeff Moyer Signed-off-by: Vivek Goyal Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 3815f9789b6..cfb0b2f5f63 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2368,7 +2368,7 @@ static int cfq_dispatch_requests(struct request_queue *q, int force) static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg; + struct cfq_group *cfqg, *orig_cfqg; BUG_ON(atomic_read(&cfqq->ref) <= 0); @@ -2379,6 +2379,7 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); cfqg = cfqq->cfqg; + orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); @@ -2388,8 +2389,8 @@ static void cfq_put_queue(struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); cfq_put_cfqg(cfqg); - if (cfqq->orig_cfqg) - cfq_put_cfqg(cfqq->orig_cfqg); + if (orig_cfqg) + cfq_put_cfqg(orig_cfqg); } /*