Commit 3e59fcb

Michal Hocko authored and Linus Torvalds committed
hugetlb: add support for preferred node to alloc_huge_page_nodemask
alloc_huge_page_nodemask tries to allocate from any NUMA node in the allowed node mask, starting from the lower NUMA nodes. This might lead to filling up those low NUMA nodes while others are not used. We can reduce this risk by introducing the concept of a preferred node, similar to what we have in the regular page allocator: we start allocating from the preferred nid and then iterate over all allowed nodes in zonelist order until we have tried them all.

This mimics the page allocator logic, except that it operates on per-node mempools. dequeue_huge_page_vma already does this, so distill the zonelist logic into a more generic dequeue_huge_page_nodemask and use it in alloc_huge_page_nodemask. This also gives us proper per-NUMA-distance fallback for alloc_huge_page_node, which can use dequeue_huge_page_nodemask now, and we can get rid of the dequeue_huge_page_node helper, which no longer has any users.

Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Michal Hocko <[email protected]>
Acked-by: Vlastimil Babka <[email protected]>
Reviewed-by: Mike Kravetz <[email protected]>
Tested-by: Mike Kravetz <[email protected]>
Cc: Naoya Horiguchi <[email protected]>
Cc: Mel Gorman <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
1 parent aaf14e4 commit 3e59fcb

3 files changed: 48 additions, 47 deletions

include/linux/hugetlb.h

Lines changed: 3 additions & 2 deletions

@@ -349,7 +349,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 struct page *alloc_huge_page_node(struct hstate *h, int nid);
 struct page *alloc_huge_page_noerr(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
-struct page *alloc_huge_page_nodemask(struct hstate *h, nodemask_t *nmask);
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+				nodemask_t *nmask);
 int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 			pgoff_t idx);

@@ -525,7 +526,7 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
 struct hstate {};
 #define alloc_huge_page(v, a, r) NULL
 #define alloc_huge_page_node(h, nid) NULL
-#define alloc_huge_page_nodemask(h, nmask) NULL
+#define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL
 #define alloc_huge_page_noerr(v, a, r) NULL
 #define alloc_bootmem_huge_page(h) NULL
 #define hstate_file(f) NULL

include/linux/migrate.h

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ static inline struct page *new_page_nodemask(struct page *page,
 
 	if (PageHuge(page))
 		return alloc_huge_page_nodemask(page_hstate(compound_head(page)),
-				nodemask);
+				preferred_nid, nodemask);
 
 	if (PageHighMem(page) || (zone_idx(page_zone(page)) == ZONE_MOVABLE))
 		gfp_mask |= __GFP_HIGHMEM;
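
For orientation, a minimal sketch of how a caller of the new interface might look, modeled on the new_page_nodemask() hunk above. The wrapper name alloc_migration_dest_hugepage() and its standalone form are illustrative assumptions and are not part of this patch.

/*
 * Illustrative sketch only (not from this commit): allocate a destination
 * huge page for migration, preferring the node the source page sits on so
 * that fallback walks the remaining allowed nodes in zonelist (NUMA
 * distance) order, as described in the changelog.
 */
static struct page *alloc_migration_dest_hugepage(struct page *src,
						  nodemask_t *allowed)
{
	struct hstate *h = page_hstate(compound_head(src));
	int preferred_nid = page_to_nid(src);

	return alloc_huge_page_nodemask(h, preferred_nid, allowed);
}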

mm/hugetlb.c

Lines changed: 44 additions & 44 deletions

@@ -887,19 +887,39 @@ static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 	return page;
 }
 
-static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
+static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
+		nodemask_t *nmask)
 {
-	struct page *page;
-	int node;
+	unsigned int cpuset_mems_cookie;
+	struct zonelist *zonelist;
+	struct zone *zone;
+	struct zoneref *z;
+	int node = -1;
 
-	if (nid != NUMA_NO_NODE)
-		return dequeue_huge_page_node_exact(h, nid);
+	zonelist = node_zonelist(nid, gfp_mask);
+
+retry_cpuset:
+	cpuset_mems_cookie = read_mems_allowed_begin();
+	for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nmask) {
+		struct page *page;
+
+		if (!cpuset_zone_allowed(zone, gfp_mask))
+			continue;
+		/*
+		 * no need to ask again on the same node. Pool is node rather than
+		 * zone aware
+		 */
+		if (zone_to_nid(zone) == node)
+			continue;
+		node = zone_to_nid(zone);
 
-	for_each_online_node(node) {
 		page = dequeue_huge_page_node_exact(h, node);
 		if (page)
 			return page;
 	}
+	if (unlikely(read_mems_allowed_retry(cpuset_mems_cookie)))
+		goto retry_cpuset;
+
 	return NULL;
 }
 
@@ -917,15 +937,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 		unsigned long address, int avoid_reserve,
 		long chg)
 {
-	struct page *page = NULL;
+	struct page *page;
 	struct mempolicy *mpol;
-	nodemask_t *nodemask;
 	gfp_t gfp_mask;
+	nodemask_t *nodemask;
 	int nid;
-	struct zonelist *zonelist;
-	struct zone *zone;
-	struct zoneref *z;
-	unsigned int cpuset_mems_cookie;
 
 	/*
 	 * A child process with MAP_PRIVATE mappings created by their parent
@@ -940,32 +956,15 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
 	if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
 		goto err;
 
-retry_cpuset:
-	cpuset_mems_cookie = read_mems_allowed_begin();
 	gfp_mask = htlb_alloc_mask(h);
 	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	zonelist = node_zonelist(nid, gfp_mask);
-
-	for_each_zone_zonelist_nodemask(zone, z, zonelist,
-						MAX_NR_ZONES - 1, nodemask) {
-		if (cpuset_zone_allowed(zone, gfp_mask)) {
-			page = dequeue_huge_page_node(h, zone_to_nid(zone));
-			if (page) {
-				if (avoid_reserve)
-					break;
-				if (!vma_has_reserves(vma, chg))
-					break;
-
-				SetPagePrivate(page);
-				h->resv_huge_pages--;
-				break;
-			}
-		}
+	page = dequeue_huge_page_nodemask(h, gfp_mask, nid, nodemask);
+	if (page && !avoid_reserve && vma_has_reserves(vma, chg)) {
+		SetPagePrivate(page);
+		h->resv_huge_pages--;
 	}
 
 	mpol_cond_put(mpol);
-	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
-		goto retry_cpuset;
 	return page;
 
 err:
@@ -1633,7 +1632,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
 
 	spin_lock(&hugetlb_lock);
 	if (h->free_huge_pages - h->resv_huge_pages > 0)
-		page = dequeue_huge_page_node(h, nid);
+		page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
 	spin_unlock(&hugetlb_lock);
 
 	if (!page)
@@ -1642,26 +1641,27 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
 	return page;
 }
 
-struct page *alloc_huge_page_nodemask(struct hstate *h, nodemask_t *nmask)
+
+struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
+		nodemask_t *nmask)
 {
 	gfp_t gfp_mask = htlb_alloc_mask(h);
-	struct page *page = NULL;
-	int node;
 
 	spin_lock(&hugetlb_lock);
 	if (h->free_huge_pages - h->resv_huge_pages > 0) {
-		for_each_node_mask(node, *nmask) {
-			page = dequeue_huge_page_node_exact(h, node);
-			if (page)
-				break;
+		struct page *page;
+
+		page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
+		if (page) {
+			spin_unlock(&hugetlb_lock);
+			return page;
		}
 	}
 	spin_unlock(&hugetlb_lock);
-	if (page)
-		return page;
 
 	/* No reservations, try to overcommit */
-	return __alloc_buddy_huge_page(h, gfp_mask, NUMA_NO_NODE, nmask);
+
+	return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
 }
 
 /*
