aha/block/cfq-iosched.c

3974 lines
99 KiB
C
Raw Permalink Normal View History

/*
* CFQ, or complete fairness queueing, disk scheduler.
*
* Based on ideas from a previously unfinished io
* scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
*
* Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
*/
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
/*
* tunables
*/
/* max queue in one round of service */
static const int cfq_quantum = 4;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;
/*
* offset from end of service tree
*/
#define CFQ_IDLE_DELAY (HZ / 5)
/*
* below this threshold, we consider thinktime immediate
*/
#define CFQ_MIN_TT (2)
/*
* Allow merged cfqqs to perform this amount of seeky I/O before
* deciding to break the queues up again.
*/
#define CFQQ_COOP_TOUT (HZ)
#define CFQ_SLICE_SCALE (5)
#define CFQ_HW_QUEUE_MIN (5)
#define CFQ_SERVICE_SHIFT 12
#define RQ_CIC(rq) \
((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
#define CFQ_PRIO_LISTS IOPRIO_BE_NR
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define sample_valid(samples) ((samples) > 80)
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
/*
* Most of our rbtree usage is for sorting with min extraction, so
* if we cache the leftmost node we don't have to walk down the tree
* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
* move this into the elevator for the rq sorting as well.
*/
struct cfq_rb_root {
struct rb_root rb;
struct rb_node *left;
unsigned count;
u64 min_vdisktime;
struct rb_node *active;
unsigned total_weight;
};
#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
/*
* Per process-grouping structure
*/
struct cfq_queue {
/* reference count */
atomic_t ref;
/* various state flags, see below */
unsigned int flags;
/* parent cfq_data */
struct cfq_data *cfqd;
/* service_tree member */
struct rb_node rb_node;
/* service_tree key */
unsigned long rb_key;
/* prio tree member */
struct rb_node p_node;
/* prio tree root we belong to, if any */
struct rb_root *p_root;
/* sorted list of pending requests */
struct rb_root sort_list;
/* if fifo isn't expired, next request to serve */
struct request *next_rq;
/* requests queued in sort_list */
int queued[2];
/* currently allocated requests */
int allocated[2];
/* fifo list of requests in sort_list */
struct list_head fifo;
/* time when queue got scheduled in to dispatch first request. */
unsigned long dispatch_start;
unsigned int allocated_slice;
/* time when first request from queue completed and slice started. */
unsigned long slice_start;
unsigned long slice_end;
long slice_resid;
unsigned int slice_dispatch;
/* pending metadata requests */
int meta_pending;
/* number of requests that are on the dispatch list or inside driver */
int dispatched;
/* io prio of this group */
unsigned short ioprio, org_ioprio;
unsigned short ioprio_class, org_ioprio_class;
unsigned int seek_samples;
u64 seek_total;
sector_t seek_mean;
sector_t last_request_pos;
unsigned long seeky_start;
pid_t pid;
struct cfq_rb_root *service_tree;
struct cfq_queue *new_cfqq;
struct cfq_group *cfqg;
struct cfq_group *orig_cfqg;
/* Sectors dispatched in current dispatch round */
unsigned long nr_sectors;
};
/*
* First index in the service_trees.
* IDLE is handled separately, so it has negative index
*/
enum wl_prio_t {
BE_WORKLOAD = 0,
RT_WORKLOAD = 1,
IDLE_WORKLOAD = 2,
};
/*
* Second index in the service_trees.
*/
enum wl_type_t {
ASYNC_WORKLOAD = 0,
SYNC_NOIDLE_WORKLOAD = 1,
SYNC_WORKLOAD = 2
};
/* This is per cgroup per device grouping structure */
struct cfq_group {
/* group service_tree member */
struct rb_node rb_node;
/* group service_tree key */
u64 vdisktime;
unsigned int weight;
bool on_st;
/* number of cfqq currently on this group */
int nr_cfqq;
/* Per group busy queus average. Useful for workload slice calc. */
unsigned int busy_queues_avg[2];
/*
* rr lists of queues with requests, onle rr for each priority class.
* Counts are embedded in the cfq_rb_root
*/
struct cfq_rb_root service_trees[2][3];
struct cfq_rb_root service_tree_idle;
unsigned long saved_workload_slice;
enum wl_type_t saved_workload;
enum wl_prio_t saved_serving_prio;
struct blkio_group blkg;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
struct hlist_node cfqd_node;
atomic_t ref;
#endif
};
/*
* Per block device queue structure
*/
struct cfq_data {
struct request_queue *queue;
/* Root service tree for cfq_groups */
struct cfq_rb_root grp_service_tree;
struct cfq_group root_group;
/* Number of active cfq groups on group service tree */
int nr_groups;
/*
* The priority currently being served
*/
enum wl_prio_t serving_prio;
enum wl_type_t serving_type;
unsigned long workload_expires;
struct cfq_group *serving_group;
bool noidle_tree_requires_idle;
/*
* Each priority tree is sorted by next_request position. These
* trees are used when determining if two or more queues are
* interleaving requests (see cfq_close_cooperator).
*/
struct rb_root prio_trees[CFQ_PRIO_LISTS];
unsigned int busy_queues;
int rq_in_driver[2];
int sync_flight;
/*
* queue-depth detection
*/
int rq_queued;
int hw_tag;
/*
* hw_tag can be
* -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
* 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
* 0 => no NCQ
*/
int hw_tag_est_depth;
unsigned int hw_tag_samples;
/*
* idle window management
*/
struct timer_list idle_slice_timer;
struct work_struct unplug_work;
struct cfq_queue *active_queue;
struct cfq_io_context *active_cic;
/*
* async queue for each priority case
*/
struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
struct cfq_queue *async_idle_cfqq;
sector_t last_position;
/*
* tunables, see top of file
*/
unsigned int cfq_quantum;
unsigned int cfq_fifo_expire[2];
unsigned int cfq_back_penalty;
unsigned int cfq_back_max;
unsigned int cfq_slice[2];
unsigned int cfq_slice_async_rq;
unsigned int cfq_slice_idle;
unsigned int cfq_latency;
unsigned int cfq_group_isolation;
struct list_head cic_list;
/*
* Fallback dummy cfqq for extreme OOM conditions
*/
struct cfq_queue oom_cfqq;
unsigned long last_delayed_sync;
/* List of cfq groups being managed on this device*/
struct hlist_head cfqg_list;
struct rcu_head rcu;
};
static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
enum wl_prio_t prio,
enum wl_type_t type,
struct cfq_data *cfqd)
{
if (!cfqg)
return NULL;
if (prio == IDLE_WORKLOAD)
return &cfqg->service_tree_idle;
return &cfqg->service_trees[prio][type];
}
enum cfqq_state_flags {
CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
CFQ_CFQQ_FLAG_sync, /* synchronous queue */
CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
};
#define CFQ_CFQQ_FNS(name) \
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
{ \
(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
} \
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
{ \
(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
} \
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
{ \
return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
}
CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
#ifdef CONFIG_DEBUG_CFQ_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
blkg_path(&(cfqq)->cfqg->blkg), ##args);
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
blkg_path(&(cfqg)->blkg), ##args); \
#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
#endif
#define cfq_log(cfqd, fmt, args...) \
blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
for (i = 0; i <= IDLE_WORKLOAD; i++) \
for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
: &cfqg->service_tree_idle; \
(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
(i == IDLE_WORKLOAD && j == 0); \
j++, st = i < IDLE_WORKLOAD ? \
&cfqg->service_trees[i][j]: NULL) \
static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
{
if (cfq_class_idle(cfqq))
return IDLE_WORKLOAD;
if (cfq_class_rt(cfqq))
return RT_WORKLOAD;
return BE_WORKLOAD;
}
static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
if (!cfq_cfqq_sync(cfqq))
return ASYNC_WORKLOAD;
if (!cfq_cfqq_idle_window(cfqq))
return SYNC_NOIDLE_WORKLOAD;
return SYNC_WORKLOAD;
}
static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
struct cfq_data *cfqd,
struct cfq_group *cfqg)
{
if (wl == IDLE_WORKLOAD)
return cfqg->service_tree_idle.count;
return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
}
static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
struct cfq_group *cfqg)
{
return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
struct io_context *, gfp_t);
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
struct io_context *);
static inline int rq_in_driver(struct cfq_data *cfqd)
{
return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
}
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
bool is_sync)
{
return cic->cfqq[is_sync];
}
static inline void cic_set_cfqq(struct cfq_io_context *cic,
struct cfq_queue *cfqq, bool is_sync)
{
cic->cfqq[is_sync] = cfqq;
}
/*
* We regard a request as SYNC, if it's either a read or has the SYNC bit
* set (in which case it could also be direct WRITE).
*/
static inline bool cfq_bio_sync(struct bio *bio)
{
return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
}
/*
* scheduler run of queue, if there are requests pending and no one in the
* driver that will restart queueing
*/
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
if (cfqd->busy_queues) {
cfq_log(cfqd, "schedule dispatch");
kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
}
}
static int cfq_queue_empty(struct request_queue *q)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
return !cfqd->rq_queued;
}
/*
* Scale schedule slice based on io priority. Use the sync time slice only
* if a queue is marked sync and has sync io queued. A sync queue with async
* io only, should not get full sync slice length.
*/
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
unsigned short prio)
{
const int base_slice = cfqd->cfq_slice[sync];
WARN_ON(prio >= IOPRIO_BE_NR);
return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}
static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
{
u64 d = delta << CFQ_SERVICE_SHIFT;
d = d * BLKIO_WEIGHT_DEFAULT;
do_div(d, cfqg->weight);
return d;
}
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
s64 delta = (s64)(vdisktime - min_vdisktime);
if (delta > 0)
min_vdisktime = vdisktime;
return min_vdisktime;
}
static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
s64 delta = (s64)(vdisktime - min_vdisktime);
if (delta < 0)
min_vdisktime = vdisktime;
return min_vdisktime;
}
static void update_min_vdisktime(struct cfq_rb_root *st)
{
u64 vdisktime = st->min_vdisktime;
struct cfq_group *cfqg;
if (st->active) {
cfqg = rb_entry_cfqg(st->active);
vdisktime = cfqg->vdisktime;
}
if (st->left) {
cfqg = rb_entry_cfqg(st->left);
vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
}
st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
}
/*
* get averaged number of queues of RT/BE priority.
* average is updated, with a formula that gives more weight to higher numbers,
* to quickly follows sudden increases and decrease slowly
*/
static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
struct cfq_group *cfqg, bool rt)
{
unsigned min_q, max_q;
unsigned mult = cfq_hist_divisor - 1;
unsigned round = cfq_hist_divisor / 2;
unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
min_q = min(cfqg->busy_queues_avg[rt], busy);
max_q = max(cfqg->busy_queues_avg[rt], busy);
cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
cfq_hist_divisor;