mirror of
https://github.com/adulau/aha.git
synced 2024-12-29 12:16:20 +00:00
[PATCH] Zone reclaim: Reclaim logic
Some bits for zone reclaim exist in 2.6.15 but they are not usable. This patch fixes them up, removes unused code and makes zone reclaim usable. Zone reclaim allows the reclaiming of pages from a zone if the number of free pages falls below the watermarks, even if other zones still have enough pages available. Zone reclaim is of particular importance for NUMA machines. It can be more beneficial to reclaim a page than to take the performance penalties that come with allocating a page on a remote zone. Zone reclaim is enabled if the maximum distance to another node is higher than RECLAIM_DISTANCE, which may be defined by an arch. By default RECLAIM_DISTANCE is 20, which is the distance to another node in the same component (enclosure or motherboard) on IA64. The meaning of the NUMA distance information seems to vary by arch. If zone reclaim is not successful then no further reclaim attempts will occur for a certain time period (ZONE_RECLAIM_INTERVAL). This patch was discussed before. See: http://marc.theaimsgroup.com/?l=linux-kernel&m=113519961504207&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113408418232531&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113389027420032&w=2 http://marc.theaimsgroup.com/?l=linux-kernel&m=113380938612205&w=2 Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
parent
f1fd1067ec
commit
9eeff2395e
5 changed files with 108 additions and 8 deletions
|
@ -149,14 +149,16 @@ struct zone {
|
||||||
unsigned long pages_scanned; /* since last reclaim */
|
unsigned long pages_scanned; /* since last reclaim */
|
||||||
int all_unreclaimable; /* All pages pinned */
|
int all_unreclaimable; /* All pages pinned */
|
||||||
|
|
||||||
/*
|
|
||||||
* Does the allocator try to reclaim pages from the zone as soon
|
|
||||||
* as it fails a watermark_ok() in __alloc_pages?
|
|
||||||
*/
|
|
||||||
int reclaim_pages;
|
|
||||||
/* A count of how many reclaimers are scanning this zone */
|
/* A count of how many reclaimers are scanning this zone */
|
||||||
atomic_t reclaim_in_progress;
|
atomic_t reclaim_in_progress;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* timestamp (in jiffies) of the last zone reclaim that did not
|
||||||
|
* result in freeing of pages. This is used to avoid repeated scans
|
||||||
|
* if all memory in the zone is in use.
|
||||||
|
*/
|
||||||
|
unsigned long last_unsuccessful_zone_reclaim;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* prev_priority holds the scanning priority for this zone. It is
|
* prev_priority holds the scanning priority for this zone. It is
|
||||||
* defined as the scanning priority at which we achieved our reclaim
|
* defined as the scanning priority at which we achieved our reclaim
|
||||||
|
|
|
@ -176,6 +176,17 @@ extern int try_to_free_pages(struct zone **, gfp_t);
|
||||||
extern int shrink_all_memory(int);
|
extern int shrink_all_memory(int);
|
||||||
extern int vm_swappiness;
|
extern int vm_swappiness;
|
||||||
|
|
||||||
|
/* Zone reclaim interface: real on NUMA builds, compiled away otherwise. */
#ifdef CONFIG_NUMA
|
||||||
|
/* Non-zero when the page allocator should try zone_reclaim(). */
extern int zone_reclaim_mode;
|
||||||
|
/* Arguments: zone to reclaim from, allocation gfp mask, allocation order. */
extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
|
||||||
|
#else
|
||||||
|
/* Constant 0 so that checks of zone_reclaim_mode fold away at compile time. */
#define zone_reclaim_mode 0
|
||||||
|
/* Stub for !CONFIG_NUMA: performs no reclaim and always returns 0. */
static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
|
||||||
|
{
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef CONFIG_MIGRATION
|
#ifdef CONFIG_MIGRATION
|
||||||
extern int isolate_lru_page(struct page *p);
|
extern int isolate_lru_page(struct page *p);
|
||||||
extern int putback_lru_pages(struct list_head *l);
|
extern int putback_lru_pages(struct list_head *l);
|
||||||
|
|
|
@ -56,6 +56,14 @@
|
||||||
#define REMOTE_DISTANCE 20
|
#define REMOTE_DISTANCE 20
|
||||||
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
|
#define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE)
|
||||||
#endif
|
#endif
|
||||||
|
#ifndef RECLAIM_DISTANCE
|
||||||
|
/*
|
||||||
|
* If the distance between nodes in a system is larger than RECLAIM_DISTANCE
|
||||||
|
* (in whatever arch-specific measurement units node_distance() returns)
|
||||||
|
* then zone reclaim is switched on at boot. This definition is only a fallback: it applies when RECLAIM_DISTANCE has not already been defined.
|
||||||
|
*/
|
||||||
|
#define RECLAIM_DISTANCE 20
|
||||||
|
#endif
|
||||||
#ifndef PENALTY_FOR_NODE_WITH_CPUS
|
#ifndef PENALTY_FOR_NODE_WITH_CPUS
|
||||||
#define PENALTY_FOR_NODE_WITH_CPUS (1)
|
#define PENALTY_FOR_NODE_WITH_CPUS (1)
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -878,7 +878,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
|
||||||
mark = (*z)->pages_high;
|
mark = (*z)->pages_high;
|
||||||
if (!zone_watermark_ok(*z, order, mark,
|
if (!zone_watermark_ok(*z, order, mark,
|
||||||
classzone_idx, alloc_flags))
|
classzone_idx, alloc_flags))
|
||||||
continue;
|
if (!zone_reclaim_mode ||
|
||||||
|
!zone_reclaim(*z, gfp_mask, order))
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
|
page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
|
||||||
|
@ -1595,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
|
||||||
prev_node = local_node;
|
prev_node = local_node;
|
||||||
nodes_clear(used_mask);
|
nodes_clear(used_mask);
|
||||||
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
|
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
|
||||||
|
int distance = node_distance(local_node, node);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If another node is sufficiently far away then it is better
|
||||||
|
* to reclaim pages in a zone before going off node.
|
||||||
|
*/
|
||||||
|
if (distance > RECLAIM_DISTANCE)
|
||||||
|
zone_reclaim_mode = 1;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We don't want to pressure a particular node.
|
* We don't want to pressure a particular node.
|
||||||
* So adding penalty to the first node in same
|
* So adding penalty to the first node in same
|
||||||
* distance group to make it round-robin.
|
* distance group to make it round-robin.
|
||||||
*/
|
*/
|
||||||
if (node_distance(local_node, node) !=
|
|
||||||
node_distance(local_node, prev_node))
|
if (distance != node_distance(local_node, prev_node))
|
||||||
node_load[node] += load;
|
node_load[node] += load;
|
||||||
prev_node = node;
|
prev_node = node;
|
||||||
load--;
|
load--;
|
||||||
|
|
68
mm/vmscan.c
68
mm/vmscan.c
|
@ -1572,3 +1572,71 @@ static int __init kswapd_init(void)
|
||||||
}
|
}
|
||||||
|
|
||||||
module_init(kswapd_init)
|
module_init(kswapd_init)
|
||||||
|
|
||||||
|
#ifdef CONFIG_NUMA
|
||||||
|
/*
|
||||||
|
* Zone reclaim mode
|
||||||
|
*
|
||||||
|
* If non-zero, the page allocator calls zone_reclaim() when the number of
|
||||||
|
* free pages in a zone falls below the watermarks. build_zonelists() sets this to 1 when a node is farther away than RECLAIM_DISTANCE.
|
||||||
|
*
|
||||||
|
* In the future we may add flags to the mode. However, the page allocator
|
||||||
|
* should only have to check that zone_reclaim_mode != 0 before calling
|
||||||
|
* zone_reclaim().
|
||||||
|
*/
|
||||||
|
int zone_reclaim_mode __read_mostly;
|
||||||
|
|
||||||
|
/*
 * Minimum time (in jiffies) between zone reclaim scans after a scan that
 * freed nothing.
 *
 * The expansion is parenthesized so that the macro keeps its value when it
 * is used inside a larger expression (e.g. as a divisor or multiplicand).
 */
#define ZONE_RECLAIM_INTERVAL (HZ/2)
|
||||||
|
/*
|
||||||
|
* Try to free up some pages from this zone through reclaim.
|
||||||
|
*/
|
||||||
|
int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
|
||||||
|
{
|
||||||
|
int nr_pages = 1 << order;
|
||||||
|
struct task_struct *p = current;
|
||||||
|
struct reclaim_state reclaim_state;
|
||||||
|
struct scan_control sc = {
|
||||||
|
.gfp_mask = gfp_mask,
|
||||||
|
.may_writepage = 0,
|
||||||
|
.may_swap = 0,
|
||||||
|
.nr_mapped = read_page_state(nr_mapped),
|
||||||
|
.nr_scanned = 0,
|
||||||
|
.nr_reclaimed = 0,
|
||||||
|
.priority = 0
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!(gfp_mask & __GFP_WAIT) ||
|
||||||
|
zone->zone_pgdat->node_id != numa_node_id() ||
|
||||||
|
zone->all_unreclaimable ||
|
||||||
|
atomic_read(&zone->reclaim_in_progress) > 0)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (time_before(jiffies,
|
||||||
|
zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
disable_swap_token();
|
||||||
|
|
||||||
|
if (nr_pages > SWAP_CLUSTER_MAX)
|
||||||
|
sc.swap_cluster_max = nr_pages;
|
||||||
|
else
|
||||||
|
sc.swap_cluster_max = SWAP_CLUSTER_MAX;
|
||||||
|
|
||||||
|
cond_resched();
|
||||||
|
p->flags |= PF_MEMALLOC;
|
||||||
|
reclaim_state.reclaimed_slab = 0;
|
||||||
|
p->reclaim_state = &reclaim_state;
|
||||||
|
shrink_zone(zone, &sc);
|
||||||
|
p->reclaim_state = NULL;
|
||||||
|
current->flags &= ~PF_MEMALLOC;
|
||||||
|
|
||||||
|
if (sc.nr_reclaimed == 0)
|
||||||
|
zone->last_unsuccessful_zone_reclaim = jiffies;
|
||||||
|
|
||||||
|
return sc.nr_reclaimed > nr_pages;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue