Skip to content

Commit 9d3be21

Browse files
Michal Hocko authored and torvalds committed
mm, page_alloc: simplify zonelist initialization
build_zonelists gradually builds zonelists from the nearest to the most distant node. As we do not know how many populated zones we will have in each node, we rely on the _zoneref to terminate the initialized part of the zonelist by a NULL zone. While this is functionally correct, it is quite suboptimal because we cannot allow updaters to race with zonelist users, because they could see an empty zonelist and fail the allocation or hit the OOM killer in the worst case. We can do much better, though. We can store the node ordering into an already existing node_order array and then give this array to build_zonelists_in_node_order and do the whole initialization at once. Zonelist consumers still might see a halfway initialized state, but that should be much more tolerable because the list will not be empty and they would either see some zone twice or skip over some zone(s) in the worst case, which shouldn't lead to immediate failures. While at it, let's simplify build_zonelists_node, which is rather confusing now. It gets an index into the zoneref array and returns the updated index for the next iteration. Let's rename the function to build_zonerefs_node to better reflect its purpose and give it the zoneref array to update. The function doesn't need the index anymore. It just returns the number of added zones so that the caller can advance the zoneref array start for the next update. This patch alone doesn't introduce any functional change yet, though; it is merely preparatory work for later changes. Link: http://lkml.kernel.org/r/[email protected] Signed-off-by: Michal Hocko <[email protected]> Acked-by: Vlastimil Babka <[email protected]> Cc: Johannes Weiner <[email protected]> Cc: Joonsoo Kim <[email protected]> Cc: Mel Gorman <[email protected]> Cc: Shaohua Li <[email protected]> Cc: Toshi Kani <[email protected]> Signed-off-by: Andrew Morton <[email protected]> Signed-off-by: Linus Torvalds <[email protected]>
1 parent 34ad129 commit 9d3be21

File tree

1 file changed

+41
-40
lines changed

1 file changed

+41
-40
lines changed

mm/page_alloc.c

Lines changed: 41 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -4839,18 +4839,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
48394839
*
48404840
* Add all populated zones of a node to the zonelist.
48414841
*/
4842-
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
4843-
int nr_zones)
4842+
static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
48444843
{
48454844
struct zone *zone;
48464845
enum zone_type zone_type = MAX_NR_ZONES;
4846+
int nr_zones = 0;
48474847

48484848
do {
48494849
zone_type--;
48504850
zone = pgdat->node_zones + zone_type;
48514851
if (managed_zone(zone)) {
4852-
zoneref_set_zone(zone,
4853-
&zonelist->_zonerefs[nr_zones++]);
4852+
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
48544853
check_highest_zone(zone_type);
48554854
}
48564855
} while (zone_type);
@@ -4977,31 +4976,39 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
49774976
* This results in maximum locality--normal zone overflows into local
49784977
* DMA zone, if any--but risks exhausting DMA zone.
49794978
*/
4980-
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
4979+
static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
4980+
unsigned nr_nodes)
49814981
{
4982-
int j;
4983-
struct zonelist *zonelist;
4982+
struct zoneref *zonerefs;
4983+
int i;
4984+
4985+
zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
4986+
4987+
for (i = 0; i < nr_nodes; i++) {
4988+
int nr_zones;
49844989

4985-
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
4986-
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
4987-
;
4988-
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
4989-
zonelist->_zonerefs[j].zone = NULL;
4990-
zonelist->_zonerefs[j].zone_idx = 0;
4990+
pg_data_t *node = NODE_DATA(node_order[i]);
4991+
4992+
nr_zones = build_zonerefs_node(node, zonerefs);
4993+
zonerefs += nr_zones;
4994+
}
4995+
zonerefs->zone = NULL;
4996+
zonerefs->zone_idx = 0;
49914997
}
49924998

49934999
/*
49945000
* Build gfp_thisnode zonelists
49955001
*/
49965002
static void build_thisnode_zonelists(pg_data_t *pgdat)
49975003
{
4998-
int j;
4999-
struct zonelist *zonelist;
5004+
struct zoneref *zonerefs;
5005+
int nr_zones;
50005006

5001-
zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
5002-
j = build_zonelists_node(pgdat, zonelist, 0);
5003-
zonelist->_zonerefs[j].zone = NULL;
5004-
zonelist->_zonerefs[j].zone_idx = 0;
5007+
zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5008+
nr_zones = build_zonerefs_node(pgdat, zonerefs);
5009+
zonerefs += nr_zones;
5010+
zonerefs->zone = NULL;
5011+
zonerefs->zone_idx = 0;
50055012
}
50065013

50075014
/*
@@ -5010,21 +5017,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
50105017
* exhausted, but results in overflowing to remote node while memory
50115018
* may still exist in local DMA zone.
50125019
*/
5013-
static int node_order[MAX_NUMNODES];
50145020

50155021
static void build_zonelists(pg_data_t *pgdat)
50165022
{
5017-
int i, node, load;
5023+
static int node_order[MAX_NUMNODES];
5024+
int node, load, nr_nodes = 0;
50185025
nodemask_t used_mask;
50195026
int local_node, prev_node;
5020-
struct zonelist *zonelist;
5021-
5022-
/* initialize zonelists */
5023-
for (i = 0; i < MAX_ZONELISTS; i++) {
5024-
zonelist = pgdat->node_zonelists + i;
5025-
zonelist->_zonerefs[0].zone = NULL;
5026-
zonelist->_zonerefs[0].zone_idx = 0;
5027-
}
50285027

50295028
/* NUMA-aware ordering of nodes */
50305029
local_node = pgdat->node_id;
@@ -5033,8 +5032,6 @@ static void build_zonelists(pg_data_t *pgdat)
50335032
nodes_clear(used_mask);
50345033

50355034
memset(node_order, 0, sizeof(node_order));
5036-
i = 0;
5037-
50385035
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
50395036
/*
50405037
* We don't want to pressure a particular node.
@@ -5045,11 +5042,12 @@ static void build_zonelists(pg_data_t *pgdat)
50455042
node_distance(local_node, prev_node))
50465043
node_load[node] = load;
50475044

5045+
node_order[nr_nodes++] = node;
50485046
prev_node = node;
50495047
load--;
5050-
build_zonelists_in_node_order(pgdat, node);
50515048
}
50525049

5050+
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
50535051
build_thisnode_zonelists(pgdat);
50545052
}
50555053

@@ -5078,13 +5076,14 @@ static void setup_min_slab_ratio(void);
50785076
static void build_zonelists(pg_data_t *pgdat)
50795077
{
50805078
int node, local_node;
5081-
enum zone_type j;
5082-
struct zonelist *zonelist;
5079+
struct zoneref *zonerefs;
5080+
int nr_zones;
50835081

50845082
local_node = pgdat->node_id;
50855083

5086-
zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
5087-
j = build_zonelists_node(pgdat, zonelist, 0);
5084+
zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5085+
nr_zones = build_zonerefs_node(pgdat, zonerefs);
5086+
zonerefs += nr_zones;
50885087

50895088
/*
50905089
* Now we build the zonelist so that it contains the zones
@@ -5097,16 +5096,18 @@ static void build_zonelists(pg_data_t *pgdat)
50975096
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
50985097
if (!node_online(node))
50995098
continue;
5100-
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
5099+
nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5100+
zonerefs += nr_zones;
51015101
}
51025102
for (node = 0; node < local_node; node++) {
51035103
if (!node_online(node))
51045104
continue;
5105-
j = build_zonelists_node(NODE_DATA(node), zonelist, j);
5105+
nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5106+
zonerefs += nr_zones;
51065107
}
51075108

5108-
zonelist->_zonerefs[j].zone = NULL;
5109-
zonelist->_zonerefs[j].zone_idx = 0;
5109+
zonerefs->zone = NULL;
5110+
zonerefs->zone_idx = 0;
51105111
}
51115112

51125113
#endif /* CONFIG_NUMA */

0 commit comments

Comments
 (0)