
Commit b714ccb

lorenzo-stoakes authored and akpm00 committed
mm/mremap: complete refactor of move_vma()
We invoke ksm_madvise() with an intentionally dummy flags field, so there is no need to pass one around.

Additionally, the code tries to be 'clever' with account_start and account_end, using these both to check that vma->vm_start != 0 and to decide whether to account the newly split portion of the VMA post-move, either before or after it.

We need to do this because we intentionally removed VM_ACCOUNT on the VMA prior to unmapping, so we don't erroneously unaccount memory (we have already calculated the correct amount to account and accounted it; any subsequent subtraction would be incorrect).

This patch significantly expands the comment (from 2002!) about 'concealing' the flag to make it abundantly clear what's going on, and adds and expands a number of other comments as well.

We can remove account_start and account_end by instead tracking whether we account (i.e. vma->vm_flags has the VM_ACCOUNT flag set and this is not an MREMAP_DONTUNMAP operation) and figuring out separately when to reinstate the VM_ACCOUNT flag on the prior/subsequent VMAs.

We additionally break the function into logical pieces and attack the very confusing error handling logic (where, for instance, new_addr was set to err).

After this change the code is considerably more readable and easier to manipulate.

Link: https://lkml.kernel.org/r/e7eaa307e444ba2b04d94fd985c907c8e896f893.1741639347.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <[email protected]>
Reviewed-by: Vlastimil Babka <[email protected]>
Cc: Harry Yoo <[email protected]>
Cc: Liam R. Howlett <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
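To make the 'concealing' trick concrete, here is a minimal userspace sketch of the accounting problem it solves. It is illustrative only, not kernel code: vma_sim, VM_ACCOUNT_SIM, unmap_sim and committed are hypothetical stand-ins, and it assumes (as the new vrm_charge() comments suggest for the non-MREMAP_DONTUNMAP case) that an accountable move charges only the size delta up front, with the source range's existing charge carried over to the destination VMA.

/*
 * Illustrative userspace sketch only -- not kernel code. All names here
 * (vma_sim, VM_ACCOUNT_SIM, unmap_sim, committed) are hypothetical.
 */
#include <stdio.h>

#define VM_ACCOUNT_SIM 0x1UL	/* stand-in for VM_ACCOUNT */

struct vma_sim {
	unsigned long npages;
	unsigned long flags;
};

static long committed;	/* stand-in for the global committed-memory counter */

/* The unmap path unaccounts pages only from VM_ACCOUNT-flagged VMAs. */
static void unmap_sim(struct vma_sim *vma)
{
	if (vma->flags & VM_ACCOUNT_SIM)
		committed -= vma->npages;
}

/* Same-size accountable move: the delta charged up front is zero. */
static long simulate_move(int conceal)
{
	struct vma_sim src = { .npages = 5, .flags = VM_ACCOUNT_SIM };
	unsigned long new_len = 5;

	committed = 5;				/* charged at mmap() time */
	committed += new_len - src.npages;	/* charge only the delta */

	if (conceal)	/* the trick: conceal VM_ACCOUNT before unmapping */
		src.flags &= ~VM_ACCOUNT_SIM;

	unmap_sim(&src);
	return committed;	/* the destination still owns 5 charged pages */
}

int main(void)
{
	printf("flag left set:  %ld pages committed (charge lost)\n",
	       simulate_move(0));
	printf("flag concealed: %ld pages committed (charge preserved)\n",
	       simulate_move(1));
	return 0;
}

Reinstating the flag on surviving split-off pieces of the source VMA (the vma_prev()/vma_next() step in unmap_source_vma() below) is omitted here for brevity.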
1 parent d5c8aec · commit b714ccb


mm/mremap.c

Lines changed: 204 additions & 89 deletions
@@ -884,13 +884,13 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
  * Perform checks before attempting to write a VMA prior to it being
  * moved.
  */
-static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
-				   unsigned long *vm_flags_ptr)
+static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
 {
 	unsigned long err = 0;
 	struct vm_area_struct *vma = vrm->vma;
 	unsigned long old_addr = vrm->addr;
 	unsigned long old_len = vrm->old_len;
+	unsigned long dummy = vma->vm_flags;
 
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
@@ -916,85 +916,236 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
 	 * so KSM can come around to merge on vma and new_vma afterwards.
 	 */
 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
-			  MADV_UNMERGEABLE, vm_flags_ptr);
+			  MADV_UNMERGEABLE, &dummy);
 	if (err)
 		return err;
 
 	return 0;
 }
 
-static unsigned long move_vma(struct vma_remap_struct *vrm)
+/*
+ * Unmap source VMA for VMA move, turning it from a copy to a move, being
+ * careful to ensure we do not underflow memory account while doing so if an
+ * accountable move.
+ *
+ * This is best effort, if we fail to unmap then we simply try to correct
+ * accounting and exit.
+ */
+static void unmap_source_vma(struct vma_remap_struct *vrm)
 {
 	struct mm_struct *mm = current->mm;
+	unsigned long addr = vrm->addr;
+	unsigned long len = vrm->old_len;
 	struct vm_area_struct *vma = vrm->vma;
-	struct vm_area_struct *new_vma;
-	unsigned long vm_flags = vma->vm_flags;
-	unsigned long old_addr = vrm->addr, new_addr = vrm->new_addr;
-	unsigned long old_len = vrm->old_len, new_len = vrm->new_len;
-	unsigned long new_pgoff;
-	unsigned long moved_len;
-	unsigned long account_start = false;
-	unsigned long account_end = false;
-	unsigned long hiwater_vm;
+	VMA_ITERATOR(vmi, mm, addr);
 	int err;
-	bool need_rmap_locks;
-	struct vma_iterator vmi;
+	unsigned long vm_start;
+	unsigned long vm_end;
+	/*
+	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
+	 * function implies that we unmap the original VMA, which seems
	 * contradictory.
+	 *
+	 * However, this occurs when this operation was attempted and an error
+	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
+	 * we actually _do_ want it to be unaccounted.
+	 */
+	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
+		!(vrm->flags & MREMAP_DONTUNMAP);
 
-	err = prep_move_vma(vrm, &vm_flags);
-	if (err)
-		return err;
+	/*
+	 * So we perform a trick here to prevent incorrect accounting. Any merge
+	 * or new VMA allocation performed in copy_vma() does not adjust
+	 * accounting, it is expected that callers handle this.
+	 *
+	 * And indeed we already have, accounting appropriately in the case of
+	 * both in vrm_charge().
+	 *
+	 * However, when we unmap the existing VMA (to effect the move), this
+	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
+	 * removed pages.
+	 *
+	 * To avoid this we temporarily clear this flag, reinstating on any
+	 * portions of the original VMA that remain.
+	 */
+	if (accountable_move) {
+		vm_flags_clear(vma, VM_ACCOUNT);
+		/* We are about to split vma, so store the start/end. */
+		vm_start = vma->vm_start;
+		vm_end = vma->vm_end;
+	}
 
-	/* If accounted, charge the number of bytes the operation will use. */
-	if (!vrm_charge(vrm))
-		return -ENOMEM;
+	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
+	vrm->vma = NULL; /* Invalidated. */
+	if (err) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_acct_memory(len >> PAGE_SHIFT);
+		return;
+	}
 
-	vma_start_write(vma);
-	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vrm->vma, new_addr, new_len, new_pgoff,
+	/*
+	 * If we mremap() from a VMA like this:
+	 *
+	 *    addr  end
+	 *     |     |
+	 *     v     v
+	 * |-------------|
+	 * |             |
+	 * |-------------|
+	 *
+	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
+	 * we'll end up with:
+	 *
+	 *    addr  end
+	 *     |     |
+	 *     v     v
+	 * |---|     |---|
+	 * | A |     | B |
+	 * |---|     |---|
+	 *
+	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
+	 * a subsequent or lone vma_next() will give us B.
+	 *
+	 * do_vmi_munmap() will have restored the VMI back to addr.
+	 */
+	if (accountable_move) {
+		unsigned long end = addr + len;
+
+		if (vm_start < addr) {
+			struct vm_area_struct *prev = vma_prev(&vmi);
+
+			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+
+		if (vm_end > end) {
+			struct vm_area_struct *next = vma_next(&vmi);
+
+			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+	}
+}
+
+/*
+ * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
+ * process. Additionally handle an error occurring on moving of page tables,
+ * where we reset vrm state to cause unmapping of the new VMA.
+ *
+ * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
+ * error code.
+ */
+static int copy_vma_and_data(struct vma_remap_struct *vrm,
+			     struct vm_area_struct **new_vma_ptr)
+{
+	unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
+	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
+	unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
+	unsigned long moved_len;
+	bool need_rmap_locks;
+	struct vm_area_struct *vma;
+	struct vm_area_struct *new_vma;
+	int err = 0;
+
+	new_vma = copy_vma(&vrm->vma, vrm->new_addr, vrm->new_len, new_pgoff,
 			   &need_rmap_locks);
-	/* This may have been updated. */
-	vma = vrm->vma;
 	if (!new_vma) {
 		vrm_uncharge(vrm);
+		*new_vma_ptr = NULL;
 		return -ENOMEM;
 	}
+	vma = vrm->vma;
 
-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
-				     need_rmap_locks, false);
-	if (moved_len < old_len) {
+	moved_len = move_page_tables(vma, vrm->addr, new_vma,
+				     vrm->new_addr, vrm->old_len,
+				     need_rmap_locks, /* for_stack= */false);
+	if (moved_len < vrm->old_len)
 		err = -ENOMEM;
-	} else if (vma->vm_ops && vma->vm_ops->mremap) {
+	else if (vma->vm_ops && vma->vm_ops->mremap)
 		err = vma->vm_ops->mremap(new_vma);
-	}
 
 	if (unlikely(err)) {
 		/*
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
 		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
-				 true, false);
-		vma = new_vma;
-		old_len = new_len;
-		old_addr = new_addr;
-		new_addr = err;
+		move_page_tables(new_vma, vrm->new_addr, vma, vrm->addr,
+				 moved_len, /* need_rmap_locks = */true,
+				 /* for_stack= */false);
+		vrm->vma = new_vma;
+		vrm->old_len = vrm->new_len;
+		vrm->addr = vrm->new_addr;
 	} else {
 		mremap_userfaultfd_prep(new_vma, vrm->uf);
 	}
 
-	if (is_vm_hugetlb_page(vma)) {
+	if (is_vm_hugetlb_page(vma))
 		clear_vma_resv_huge_pages(vma);
-	}
 
-	/* Conceal VM_ACCOUNT so old reservation is not undone */
-	if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP)) {
-		vm_flags_clear(vma, VM_ACCOUNT);
-		if (vma->vm_start < old_addr)
-			account_start = true;
-		if (vma->vm_end > old_addr + old_len)
-			account_end = true;
-	}
+	/* Tell pfnmap has moved from this vma */
+	if (unlikely(vma->vm_flags & VM_PFNMAP))
+		untrack_pfn_clear(vma);
+
+	*new_vma_ptr = new_vma;
+	return err;
+}
+
+/*
+ * Perform final tasks for MREMAP_DONTUNMAP operation, clearing mlock() and
+ * account flags on remaining VMA by convention (it cannot be mlock()'d any
+ * longer, as pages in range are no longer mapped), and removing anon_vma_chain
+ * links from it (if the entire VMA was copied over).
+ */
+static void dontunmap_complete(struct vma_remap_struct *vrm,
+			       struct vm_area_struct *new_vma)
+{
+	unsigned long start = vrm->addr;
+	unsigned long end = vrm->addr + vrm->old_len;
+	unsigned long old_start = vrm->vma->vm_start;
+	unsigned long old_end = vrm->vma->vm_end;
+
+	/*
+	 * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
+	 * vma.
+	 */
+	vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);
+
+	/*
+	 * anon_vma links of the old vma is no longer needed after its page
+	 * table has been moved.
+	 */
+	if (new_vma != vrm->vma && start == old_start && end == old_end)
+		unlink_anon_vmas(vrm->vma);
+
+	/* Because we won't unmap we don't need to touch locked_vm. */
+}
+
+static unsigned long move_vma(struct vma_remap_struct *vrm)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *new_vma;
+	unsigned long hiwater_vm;
+	int err;
+
+	err = prep_move_vma(vrm);
+	if (err)
+		return err;
+
+	/* If accounted, charge the number of bytes the operation will use. */
+	if (!vrm_charge(vrm))
+		return -ENOMEM;
+
+	/* We don't want racing faults. */
+	vma_start_write(vrm->vma);
+
+	/* Perform copy step. */
+	err = copy_vma_and_data(vrm, &new_vma);
+	/*
+	 * If we established the copied-to VMA, we attempt to recover from the
+	 * error by setting the destination VMA to the source VMA and unmapping
+	 * it below.
+	 */
+	if (err && !new_vma)
+		return err;
 
 	/*
 	 * If we failed to move page tables we still do total_vm increment
@@ -1007,51 +1158,15 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
 	 */
 	hiwater_vm = mm->hiwater_vm;
 
-	/* Tell pfnmap has moved from this vma */
-	if (unlikely(vma->vm_flags & VM_PFNMAP))
-		untrack_pfn_clear(vma);
-
-	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) {
-		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
-		vm_flags_clear(vma, VM_LOCKED_MASK);
-
-		/*
-		 * anon_vma links of the old vma is no longer needed after its page
-		 * table has been moved.
-		 */
-		if (new_vma != vma && vma->vm_start == old_addr &&
-		    vma->vm_end == (old_addr + old_len))
-			unlink_anon_vmas(vma);
-
-		/* Because we won't unmap we don't need to touch locked_vm */
-		vrm_stat_account(vrm, new_len);
-		return new_addr;
-	}
-
-	vrm_stat_account(vrm, new_len);
-
-	vma_iter_init(&vmi, mm, old_addr);
-	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, vrm->uf_unmap, false) < 0) {
-		/* OOM: unable to split vma, just get accounts right */
-		if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP))
-			vm_acct_memory(old_len >> PAGE_SHIFT);
-		account_start = account_end = false;
-	}
+	vrm_stat_account(vrm, vrm->new_len);
+	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
+		dontunmap_complete(vrm, new_vma);
+	else
+		unmap_source_vma(vrm);
 
 	mm->hiwater_vm = hiwater_vm;
 
-	/* Restore VM_ACCOUNT if one or two pieces of vma left */
-	if (account_start) {
-		vma = vma_prev(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	if (account_end) {
-		vma = vma_next(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	return new_addr;
+	return err ? (unsigned long)err : vrm->new_addr;
 }
 
 /*
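For completeness, the move path patched above is what services mremap(2) when a mapping must relocate. Below is a small userspace exercise of it, assuming only the documented mremap(2) interface: MREMAP_MAYMOVE, plus MREMAP_DONTUNMAP (Linux 5.7+, guarded with #ifdef since older headers may lack it), which is the case that ends in dontunmap_complete() above.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4 * 4096;
	char *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(old, "hello");

	/* Grow and allow relocation: if the kernel must move the mapping,
	 * move_vma() is what carries the pages and accounting across. */
	char *new = mremap(old, len, len * 2, MREMAP_MAYMOVE);
	if (new == MAP_FAILED) {
		perror("mremap");
		return 1;
	}
	printf("%p -> %p: %s\n", (void *)old, (void *)new, new);

#ifdef MREMAP_DONTUNMAP
	/* Move the pages but leave the (now empty) source mapping in
	 * place; sizes must match and the mapping must be private
	 * anonymous. This exercises the dontunmap_complete() path. */
	char *kept = mremap(new, len * 2, len * 2,
			    MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (kept == MAP_FAILED)
		perror("mremap(MREMAP_DONTUNMAP)");
	else
		printf("pages moved to %p, source still mapped\n",
		       (void *)kept);
#endif
	return 0;
}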
