/*
 *  CFQ, or complete fairness queueing, disk scheduler.
 *
 *  Based on ideas from a previously unfinished io
 *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"

/*
 * tunables
 */
/* max queue in one round of service */
static const int cfq_quantum = 4;
static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
/* maximum backwards seek, in KiB */
static const int cfq_back_max = 16 * 1024;
/* penalty of a backwards seek */
static const int cfq_back_penalty = 2;
static const int cfq_slice_sync = HZ / 10;
static int cfq_slice_async = HZ / 25;
static const int cfq_slice_async_rq = 2;
static int cfq_slice_idle = HZ / 125;
static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
static const int cfq_hist_divisor = 4;

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY		(HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT		(2)

/*
 * Allow merged cfqqs to perform this amount of seeky I/O before
 * deciding to break the queues up again.
 */
#define CFQQ_COOP_TOUT		(HZ)

#define CFQ_SLICE_SCALE		(5)
#define CFQ_HW_QUEUE_MIN	(5)
#define CFQ_SERVICE_SHIFT	12

#define RQ_CIC(rq)		\
	((struct cfq_io_context *) (rq)->elevator_private)
#define RQ_CFQQ(rq)	(struct cfq_queue *) ((rq)->elevator_private2)

static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;

static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);

#define CFQ_PRIO_LISTS		IOPRIO_BE_NR
#define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

#define sample_valid(samples)	((samples) > 80)
#define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)

/*
 * Most of our rbtree usage is for sorting with min extraction, so
 * if we cache the leftmost node we don't have to walk down the tree
 * to find it. Idea borrowed from Ingo Molnar's CFS scheduler. We should
 * move this into the elevator for the rq sorting as well.
 */
struct cfq_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned count;
	u64 min_vdisktime;
	struct rb_node *active;
	unsigned total_weight;
};
#define CFQ_RB_ROOT	(struct cfq_rb_root) { RB_ROOT, NULL, 0, 0, }
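
/*
 * For illustration only: the cached ->left pointer is what makes "find the
 * entry with the smallest key" cheap.  A lookup helper in this style only
 * walks the rbtree when the cache is cold (this is a hypothetical sketch,
 * not a helper defined at this point in the file):
 *
 *	static struct rb_node *cfq_rb_leftmost(struct cfq_rb_root *root)
 *	{
 *		if (!root->left)
 *			root->left = rb_first(&root->rb);
 *		return root->left;
 *	}
 */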

/*
 * Per process-grouping structure
 */
struct cfq_queue {
	/* reference count */
	atomic_t ref;
	/* various state flags, see below */
	unsigned int flags;
	/* parent cfq_data */
	struct cfq_data *cfqd;
	/* service_tree member */
	struct rb_node rb_node;
	/* service_tree key */
	unsigned long rb_key;
	/* prio tree member */
	struct rb_node p_node;
	/* prio tree root we belong to, if any */
	struct rb_root *p_root;
	/* sorted list of pending requests */
	struct rb_root sort_list;
	/* if fifo isn't expired, next request to serve */
	struct request *next_rq;
	/* requests queued in sort_list */
	int queued[2];
	/* currently allocated requests */
	int allocated[2];
	/* fifo list of requests in sort_list */
	struct list_head fifo;

	/* time when queue got scheduled in to dispatch first request. */
	unsigned long dispatch_start;
	unsigned int allocated_slice;
	/* time when first request from queue completed and slice started. */
	unsigned long slice_start;
	unsigned long slice_end;
	long slice_resid;
	unsigned int slice_dispatch;

	/* pending metadata requests */
	int meta_pending;
	/* number of requests that are on the dispatch list or inside driver */
	int dispatched;

	/* io prio of this group */
	unsigned short ioprio, org_ioprio;
	unsigned short ioprio_class, org_ioprio_class;

	unsigned int seek_samples;
	u64 seek_total;
	sector_t seek_mean;
	sector_t last_request_pos;
	unsigned long seeky_start;

	pid_t pid;

	struct cfq_rb_root *service_tree;
	struct cfq_queue *new_cfqq;
	struct cfq_group *cfqg;
	struct cfq_group *orig_cfqg;
	/* Sectors dispatched in current dispatch round */
	unsigned long nr_sectors;
};

/*
 * First index in the service_trees.
 * IDLE is handled separately on its own tree, so it does not index into
 * the two-dimensional service_trees array.
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
};

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};
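
/*
 * Together, wl_prio_t and wl_type_t index the per-group service trees: for
 * example, a best-effort sync queue that we idle on is kept on
 * service_trees[BE_WORKLOAD][SYNC_WORKLOAD], while IDLE class queues live
 * on a single separate tree regardless of workload type.
 */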

/* This is per cgroup per device grouping structure */
struct cfq_group {
	/* group service_tree member */
	struct rb_node rb_node;

	/* group service_tree key */
	u64 vdisktime;
	unsigned int weight;
	bool on_st;

	/* number of cfqq currently on this group */
	int nr_cfqq;

	/* Per group busy queues average. Useful for workload slice calc. */
	unsigned int busy_queues_avg[2];
	/*
	 * rr lists of queues with requests, one rr for each priority class.
	 * Counts are embedded in the cfq_rb_root
	 */
	struct cfq_rb_root service_trees[2][3];
	struct cfq_rb_root service_tree_idle;

	unsigned long saved_workload_slice;
	enum wl_type_t saved_workload;
	enum wl_prio_t saved_serving_prio;
	struct blkio_group blkg;
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	struct hlist_node cfqd_node;
	atomic_t ref;
#endif
};

/*
 * Per block device queue structure
 */
struct cfq_data {
	struct request_queue *queue;
	/* Root service tree for cfq_groups */
	struct cfq_rb_root grp_service_tree;
	struct cfq_group root_group;
	/* Number of active cfq groups on group service tree */
	int nr_groups;

	/*
	 * The priority currently being served
	 */
	enum wl_prio_t serving_prio;
	enum wl_type_t serving_type;
	unsigned long workload_expires;
	struct cfq_group *serving_group;
	bool noidle_tree_requires_idle;

	/*
	 * Each priority tree is sorted by next_request position. These
	 * trees are used when determining if two or more queues are
	 * interleaving requests (see cfq_close_cooperator).
	 */
	struct rb_root prio_trees[CFQ_PRIO_LISTS];

	unsigned int busy_queues;

	int rq_in_driver[2];
	int sync_flight;

	/*
	 * queue-depth detection
	 */
	int rq_queued;
	int hw_tag;
	/*
	 * hw_tag can be
	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
	 *  0 => no NCQ
	 */
	int hw_tag_est_depth;
	unsigned int hw_tag_samples;

	/*
	 * idle window management
	 */
	struct timer_list idle_slice_timer;
	struct work_struct unplug_work;

	struct cfq_queue *active_queue;
	struct cfq_io_context *active_cic;

	/*
	 * async queue for each priority case
	 */
	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
	struct cfq_queue *async_idle_cfqq;

	sector_t last_position;

	/*
	 * tunables, see top of file
	 */
	unsigned int cfq_quantum;
	unsigned int cfq_fifo_expire[2];
	unsigned int cfq_back_penalty;
	unsigned int cfq_back_max;
	unsigned int cfq_slice[2];
	unsigned int cfq_slice_async_rq;
	unsigned int cfq_slice_idle;
	unsigned int cfq_latency;
	unsigned int cfq_group_isolation;

	struct list_head cic_list;

	/*
	 * Fallback dummy cfqq for extreme OOM conditions
	 */
	struct cfq_queue oom_cfqq;

	unsigned long last_delayed_sync;

	/* List of cfq groups being managed on this device */
	struct hlist_head cfqg_list;
	struct rcu_head rcu;
};

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);

static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
					    enum wl_prio_t prio,
					    enum wl_type_t type,
					    struct cfq_data *cfqd)
{
	if (!cfqg)
		return NULL;

	if (prio == IDLE_WORKLOAD)
		return &cfqg->service_tree_idle;

	return &cfqg->service_trees[prio][type];
}
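
/*
 * Typical lookup, assuming the caller has already decided which group,
 * priority class and workload type should be served next (a sketch of how
 * the selection code consumes this helper, not a definition made here):
 *
 *	st = service_tree_for(cfqd->serving_group, cfqd->serving_prio,
 *				cfqd->serving_type, cfqd);
 */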

enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

#define CFQ_CFQQ_FNS(name)						\
static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
{									\
	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
{									\
	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
}									\
static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
{									\
	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
}

CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
CFQ_CFQQ_FNS(prio_changed);
CFQ_CFQQ_FNS(slice_new);
CFQ_CFQQ_FNS(sync);
CFQ_CFQQ_FNS(coop);
CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS
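
/*
 * For one flag, e.g. CFQ_CFQQ_FNS(idle_window), the macro above expands to
 * three trivial helpers:
 *
 *	cfq_mark_cfqq_idle_window(cfqq);	sets the flag bit
 *	cfq_clear_cfqq_idle_window(cfqq);	clears it
 *	cfq_cfqq_idle_window(cfqq);		tests it (returns 0 or 1)
 */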

#ifdef CONFIG_DEBUG_CFQ_IOSCHED
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
			blkg_path(&(cfqq)->cfqg->blkg), ##args);

#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
				blkg_path(&(cfqg)->blkg), ##args);	\

#else
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0);
#endif
#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
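
/*
 * Usage sketch; the messages end up in the blktrace stream for the device
 * (the second format string is illustrative, not necessarily one that
 * appears verbatim in this file):
 *
 *	cfq_log(cfqd, "schedule dispatch");
 *	cfq_log_cfqq(cfqd, cfqq, "dispatched %d", cfqq->dispatched);
 */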

/* Traverses through cfq group service trees */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
			: &cfqg->service_tree_idle; \
			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
			(i == IDLE_WORKLOAD && j == 0); \
			j++, st = i < IDLE_WORKLOAD ? \
			&cfqg->service_trees[i][j]: NULL) \
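
/*
 * Usage sketch: iterating every service tree of a group, e.g. to reset
 * them all when (re)initialising a cfq_group (assumed pattern, mirroring
 * how the group setup code uses this macro):
 *
 *	int i, j;
 *	struct cfq_rb_root *st;
 *
 *	for_each_cfqg_st(cfqg, i, j, st)
 *		*st = CFQ_RB_ROOT;
 */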

static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
{
	if (cfq_class_idle(cfqq))
		return IDLE_WORKLOAD;
	if (cfq_class_rt(cfqq))
		return RT_WORKLOAD;
	return BE_WORKLOAD;
}


static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_sync(cfqq))
		return ASYNC_WORKLOAD;
	if (!cfq_cfqq_idle_window(cfqq))
		return SYNC_NOIDLE_WORKLOAD;
	return SYNC_WORKLOAD;
}

static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
					struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	if (wl == IDLE_WORKLOAD)
		return cfqg->service_tree_idle.count;

	return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
		+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
		+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;
}

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg)
{
	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
}

static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
				       struct io_context *, gfp_t);
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
						struct io_context *);

static inline int rq_in_driver(struct cfq_data *cfqd)
{
	return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
}

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
					    bool is_sync)
{
	return cic->cfqq[is_sync];
}

static inline void cic_set_cfqq(struct cfq_io_context *cic,
				struct cfq_queue *cfqq, bool is_sync)
{
	cic->cfqq[is_sync] = cfqq;
}

/*
 * We regard a request as SYNC, if it's either a read or has the SYNC bit
 * set (in which case it could also be direct WRITE).
 */
static inline bool cfq_bio_sync(struct bio *bio)
{
	return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
}

/*
 * scheduler run of queue, if there are requests pending and no one in the
 * driver that will restart queueing
 */
static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
{
	if (cfqd->busy_queues) {
		cfq_log(cfqd, "schedule dispatch");
		kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
	}
}

static int cfq_queue_empty(struct request_queue *q)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	return !cfqd->rq_queued;
}

/*
 * Scale schedule slice based on io priority. Use the sync time slice only
 * if a queue is marked sync and has sync io queued. A sync queue with async
 * io only, should not get full sync slice length.
 */
static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
				 unsigned short prio)
{
	const int base_slice = cfqd->cfq_slice[sync];

	WARN_ON(prio >= IOPRIO_BE_NR);

	return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
}
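
/*
 * Worked example (assuming HZ == 1000, so the default sync base slice
 * cfq_slice[1] == HZ / 10 == 100 jiffies):
 *
 *	prio 0 (highest): 100 + 100/5 * (4 - 0) = 180 jiffies
 *	prio 4 (default): 100 + 100/5 * (4 - 4) = 100 jiffies
 *	prio 7 (lowest):  100 + 100/5 * (4 - 7) =  40 jiffies
 *
 * i.e. each ioprio level is worth one fifth of the base slice.
 */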

static inline int
cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
}

static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
{
	u64 d = delta << CFQ_SERVICE_SHIFT;

	d = d * BLKIO_WEIGHT_DEFAULT;
	do_div(d, cfqg->weight);
	return d;
}
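
/*
 * This converts a slice of real service time into weighted "vdisktime":
 * a group at BLKIO_WEIGHT_DEFAULT is charged delta << CFQ_SERVICE_SHIFT,
 * while a group with twice the default weight is charged half as much for
 * the same amount of service, so higher-weight groups advance more slowly
 * in the group service tree and therefore get served more often.
 */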

static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta > 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	s64 delta = (s64)(vdisktime - min_vdisktime);
	if (delta < 0)
		min_vdisktime = vdisktime;

	return min_vdisktime;
}

static void update_min_vdisktime(struct cfq_rb_root *st)
{
	u64 vdisktime = st->min_vdisktime;
	struct cfq_group *cfqg;

	if (st->active) {
		cfqg = rb_entry_cfqg(st->active);
		vdisktime = cfqg->vdisktime;
	}

	if (st->left) {
		cfqg = rb_entry_cfqg(st->left);
		vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
	}

	st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
}
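
/*
 * Note that min_vdisktime()/max_vdisktime() compare through a signed
 * difference rather than with '<' / '>' directly, so they keep working
 * when the u64 vdisktime eventually wraps (the same trick CFS uses for
 * vruntime).  update_min_vdisktime() thus advances st->min_vdisktime
 * monotonically towards the smallest key still on the tree.
 */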

/*
 * Get the averaged number of queues of RT/BE priority. The average is
 * updated with a formula that gives more weight to higher numbers, so that
 * it quickly follows sudden increases and decays slowly.
 */
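
/*
 * Worked example with cfq_hist_divisor == 4 (so mult == 3, round == 2),
 * an average of 1 and a sudden burst to 9 busy queues:
 *
 *	avg = (3 * max(1, 9) + min(1, 9) + 2) / 4 = (27 + 1 + 2) / 4 = 7
 *
 * so the estimate jumps up almost immediately; once the burst is over and
 * only 1 queue is busy again, it decays by roughly one quarter per update:
 *
 *	avg = (3 * max(7, 1) + min(7, 1) + 2) / 4 = (21 + 1 + 2) / 4 = 6
 */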

static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
					struct cfq_group *cfqg, bool rt)
{
	unsigned min_q, max_q;
	unsigned mult  = cfq_hist_divisor - 1;
	unsigned round = cfq_hist_divisor / 2;
	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

	min_q = min(cfqg->busy_queues_avg[rt], busy);
	max_q = max(cfqg->busy_queues_avg[rt], busy);
	cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
		cfq_hist_divisor;