@@ -884,13 +884,13 @@ static void vrm_stat_account(struct vma_remap_struct *vrm,
  * Perform checks before attempting to write a VMA prior to it being
  * moved.
  */
-static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
-				   unsigned long *vm_flags_ptr)
+static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
 {
 	unsigned long err = 0;
 	struct vm_area_struct *vma = vrm->vma;
 	unsigned long old_addr = vrm->addr;
 	unsigned long old_len = vrm->old_len;
+	unsigned long dummy = vma->vm_flags;

 	/*
	 * We'd prefer to avoid failure later on in do_munmap:
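(Aside on the new `dummy` above: ksm_madvise() reports updated flags through its out-parameter — MADV_UNMERGEABLE clears VM_MERGEABLE in the word it is given — but prep_move_vma()'s caller no longer consumes that result, so a throwaway local satisfies the interface. A minimal userspace model of that contract follows; `unmerge` and the bit value are made up for illustration.)

#include <stdio.h>

#define VM_MERGEABLE (1UL << 7)	/* hypothetical bit value, illustration only */

/* Stand-in for ksm_madvise(..., MADV_UNMERGEABLE, &flags). */
static void unmerge(unsigned long *vm_flags)
{
	*vm_flags &= ~VM_MERGEABLE;
}

int main(void)
{
	unsigned long dummy = VM_MERGEABLE;

	unmerge(&dummy);	/* result deliberately discarded, as in prep_move_vma() */
	printf("flags after unmerge: %#lx\n", dummy);
	return 0;
}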
@@ -916,85 +916,236 @@ static unsigned long prep_move_vma(struct vma_remap_struct *vrm,
	 * so KSM can come around to merge on vma and new_vma afterwards.
	 */
 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
-			  MADV_UNMERGEABLE, vm_flags_ptr);
+			  MADV_UNMERGEABLE, &dummy);
 	if (err)
 		return err;

 	return 0;
 }

-static unsigned long move_vma(struct vma_remap_struct *vrm)
+/*
+ * Unmap the source VMA for a VMA move, turning it from a copy into a move,
+ * being careful to ensure we do not underflow the memory account while doing
+ * so if this is an accountable move.
+ *
+ * This is best effort; if we fail to unmap then we simply try to correct
+ * accounting and exit.
+ */
+static void unmap_source_vma(struct vma_remap_struct *vrm)
 {
 	struct mm_struct *mm = current->mm;
+	unsigned long addr = vrm->addr;
+	unsigned long len = vrm->old_len;
 	struct vm_area_struct *vma = vrm->vma;
-	struct vm_area_struct *new_vma;
-	unsigned long vm_flags = vma->vm_flags;
-	unsigned long old_addr = vrm->addr, new_addr = vrm->new_addr;
-	unsigned long old_len = vrm->old_len, new_len = vrm->new_len;
-	unsigned long new_pgoff;
-	unsigned long moved_len;
-	unsigned long account_start = false;
-	unsigned long account_end = false;
-	unsigned long hiwater_vm;
+	VMA_ITERATOR(vmi, mm, addr);
 	int err;
-	bool need_rmap_locks;
-	struct vma_iterator vmi;
+	unsigned long vm_start;
+	unsigned long vm_end;
+	/*
+	 * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
+	 * function implies that we unmap the original VMA, which seems
+	 * contradictory.
+	 *
+	 * However, this occurs when this operation was attempted and an error
+	 * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
+	 * we actually _do_ want it to be unaccounted.
+	 */
+	bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
+		!(vrm->flags & MREMAP_DONTUNMAP);

-	err = prep_move_vma(vrm, &vm_flags);
-	if (err)
-		return err;
+	/*
+	 * So we perform a trick here to prevent incorrect accounting. Any merge
+	 * or new VMA allocation performed in copy_vma() does not adjust
+	 * accounting; it is expected that callers handle this.
+	 *
+	 * And indeed we already have done so, accounting appropriately for both
+	 * cases in vrm_charge().
+	 *
+	 * However, when we unmap the existing VMA (to effect the move), this
+	 * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
+	 * removed pages.
+	 *
+	 * To avoid this we temporarily clear this flag, reinstating it on any
+	 * portions of the original VMA that remain.
+	 */
+	if (accountable_move) {
+		vm_flags_clear(vma, VM_ACCOUNT);
+		/* We are about to split vma, so store the start/end. */
+		vm_start = vma->vm_start;
+		vm_end = vma->vm_end;
+	}

-	/* If accounted, charge the number of bytes the operation will use. */
-	if (!vrm_charge(vrm))
-		return -ENOMEM;
+	err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */ false);
+	vrm->vma = NULL; /* Invalidated. */
+	if (err) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_acct_memory(len >> PAGE_SHIFT);
+		return;
+	}

-	vma_start_write(vma);
-	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
-	new_vma = copy_vma(&vrm->vma, new_addr, new_len, new_pgoff,
+	/*
+	 * If we mremap() from a VMA like this:
+	 *
+	 *    addr  end
+	 *     |     |
+	 *     v     v
+	 * |-------------|
+	 * |             |
+	 * |-------------|
+	 *
+	 * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
+	 * we'll end up with:
+	 *
+	 *    addr  end
+	 *     |     |
+	 *     v     v
+	 * |---|     |---|
+	 * | A |     | B |
+	 * |---|     |---|
+	 *
+	 * The VMI is still pointing at addr, so vma_prev() will give us A, and
+	 * a subsequent or lone vma_next() will give us B.
+	 *
+	 * do_vmi_munmap() will have restored the VMI back to addr.
+	 */
+	if (accountable_move) {
+		unsigned long end = addr + len;
+
+		if (vm_start < addr) {
+			struct vm_area_struct *prev = vma_prev(&vmi);
+
+			vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+
+		if (vm_end > end) {
+			struct vm_area_struct *next = vma_next(&vmi);
+
+			vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
+		}
+	}
+}
+
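The restoration step above is easy to check in isolation. Below is a small standalone sketch (plain userspace C, made-up addresses, not kernel code) of the decision unmap_source_vma() makes after the split in the diagram: a remnant below the unmapped span corresponds to vma_prev() and gets VM_ACCOUNT back, a remnant above it to vma_next().

#include <stdio.h>

/* Models which remnants of the split VMA get VM_ACCOUNT reinstated. */
static void restore_account(unsigned long vm_start, unsigned long vm_end,
			    unsigned long addr, unsigned long len)
{
	unsigned long end = addr + len;

	if (vm_start < addr)	/* remnant A -> vma_prev() */
		printf("prev [%#lx, %#lx) gets VM_ACCOUNT\n", vm_start, addr);
	if (vm_end > end)	/* remnant B -> vma_next() */
		printf("next [%#lx, %#lx) gets VM_ACCOUNT\n", end, vm_end);
}

int main(void)
{
	/* Unmap the middle of a larger VMA, as in the diagram. */
	restore_account(0x1000, 0x9000, 0x3000, 0x2000);
	return 0;
}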
+/*
+ * Copy vrm->vma over to vrm->new_addr, possibly adjusting size as part of the
+ * process. Additionally handle an error arising while moving page tables, in
+ * which case we reset vrm state to cause the new VMA to be unmapped.
+ *
+ * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
+ * error code.
+ */
+static int copy_vma_and_data(struct vma_remap_struct *vrm,
+			     struct vm_area_struct **new_vma_ptr)
+{
+	unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
+	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
+	unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
+	unsigned long moved_len;
+	bool need_rmap_locks;
+	struct vm_area_struct *vma;
+	struct vm_area_struct *new_vma;
+	int err = 0;
+
+	new_vma = copy_vma(&vrm->vma, vrm->new_addr, vrm->new_len, new_pgoff,
 			   &need_rmap_locks);
-	/* This may have been updated. */
-	vma = vrm->vma;
 	if (!new_vma) {
 		vrm_uncharge(vrm);
+		*new_vma_ptr = NULL;
 		return -ENOMEM;
 	}
+	vma = vrm->vma;

-	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
-				     need_rmap_locks, false);
-	if (moved_len < old_len) {
+	moved_len = move_page_tables(vma, vrm->addr, new_vma,
+				     vrm->new_addr, vrm->old_len,
+				     need_rmap_locks, /* for_stack= */ false);
+	if (moved_len < vrm->old_len)
 		err = -ENOMEM;
-	} else if (vma->vm_ops && vma->vm_ops->mremap) {
+	else if (vma->vm_ops && vma->vm_ops->mremap)
 		err = vma->vm_ops->mremap(new_vma);
-	}

 	if (unlikely(err)) {
 		/*
		 * On error, move entries back from new area to old,
		 * which will succeed since page tables still there,
		 * and then proceed to unmap new area instead of old.
		 */
-		move_page_tables(new_vma, new_addr, vma, old_addr, moved_len,
-				 true, false);
-		vma = new_vma;
-		old_len = new_len;
-		old_addr = new_addr;
-		new_addr = err;
+		move_page_tables(new_vma, vrm->new_addr, vma, vrm->addr,
+				 moved_len, /* need_rmap_locks= */ true,
+				 /* for_stack= */ false);
+		vrm->vma = new_vma;
+		vrm->old_len = vrm->new_len;
+		vrm->addr = vrm->new_addr;
 	} else {
 		mremap_userfaultfd_prep(new_vma, vrm->uf);
 	}

-	if (is_vm_hugetlb_page(vma)) {
+	if (is_vm_hugetlb_page(vma))
 		clear_vma_resv_huge_pages(vma);
-	}

-	/* Conceal VM_ACCOUNT so old reservation is not undone */
-	if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP)) {
-		vm_flags_clear(vma, VM_ACCOUNT);
-		if (vma->vm_start < old_addr)
-			account_start = true;
-		if (vma->vm_end > old_addr + old_len)
-			account_end = true;
-	}
+	/* Tell pfn tracking that the pfnmap has moved from this vma. */
+	if (unlikely(vma->vm_flags & VM_PFNMAP))
+		untrack_pfn_clear(vma);
+
+	*new_vma_ptr = new_vma;
+	return err;
+}
+
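The page-offset arithmetic at the top of copy_vma_and_data() can be sanity-checked standalone. A sketch with made-up values, assuming 4 KiB pages (PAGE_SHIFT == 12): starting the move four pages into the VMA advances the new VMA's pgoff by four.

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages */

int main(void)
{
	unsigned long vm_start = 0x7f0000000000UL;	/* made up */
	unsigned long vm_pgoff = 0x10;			/* made up */
	unsigned long addr = vm_start + 0x4000;		/* 4 pages in */

	unsigned long internal_offset = addr - vm_start;
	unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
	unsigned long new_pgoff = vm_pgoff + internal_pgoff;

	printf("new_pgoff = %#lx\n", new_pgoff);	/* 0x10 + 4 = 0x14 */
	return 0;
}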
+/*
+ * Perform final tasks for the MREMAP_DONTUNMAP operation, clearing mlock()
+ * and account flags on the remaining VMA by convention (it cannot be
+ * mlock()'d any longer, as pages in the range are no longer mapped), and
+ * removing anon_vma_chain links from it (if the entire VMA was copied over).
+ */
+static void dontunmap_complete(struct vma_remap_struct *vrm,
+			       struct vm_area_struct *new_vma)
+{
+	unsigned long start = vrm->addr;
+	unsigned long end = vrm->addr + vrm->old_len;
+	unsigned long old_start = vrm->vma->vm_start;
+	unsigned long old_end = vrm->vma->vm_end;
+
+	/*
+	 * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old
+	 * vma.
+	 */
+	vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT);
+
+	/*
+	 * anon_vma links of the old vma are no longer needed after its page
+	 * table has been moved.
+	 */
+	if (new_vma != vrm->vma && start == old_start && end == old_end)
+		unlink_anon_vmas(vrm->vma);
+
+	/* Because we won't unmap, we don't need to touch locked_vm. */
+}
+
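For context, this is the userspace-visible behaviour that dontunmap_complete() finishes off. A minimal sketch (error handling trimmed; requires Linux 5.7+ and a private anonymous mapping): after mremap() with MREMAP_DONTUNMAP the old range stays mapped, but its pages have moved, so it reads back as zeroes.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2 * 4096;
	char *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(old, 'x', len);

	/* Size must be unchanged; MREMAP_MAYMOVE is mandatory here. */
	char *new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);

	/* Old mapping survives but faults in fresh zero pages. */
	printf("old[0] = %d, new[0] = '%c'\n", old[0], new[0]);
	return 0;
}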
+static unsigned long move_vma(struct vma_remap_struct *vrm)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *new_vma;
+	unsigned long hiwater_vm;
+	int err;
+
+	err = prep_move_vma(vrm);
+	if (err)
+		return err;
+
+	/* If accounted, charge the number of bytes the operation will use. */
+	if (!vrm_charge(vrm))
+		return -ENOMEM;
+
+	/* We don't want racing faults. */
+	vma_start_write(vrm->vma);
+
+	/* Perform copy step. */
+	err = copy_vma_and_data(vrm, &new_vma);
+	/*
+	 * If we established the copied-to VMA, we attempt to recover from the
+	 * error by setting the destination VMA to the source VMA and unmapping
+	 * it below; if we failed before even establishing it, simply bail.
+	 */
+	if (err && !new_vma)
+		return err;

 	/*
	 * If we failed to move page tables we still do total_vm increment
@@ -1007,51 +1158,15 @@ static unsigned long move_vma(struct vma_remap_struct *vrm)
	 */
 	hiwater_vm = mm->hiwater_vm;

-	/* Tell pfnmap has moved from this vma */
-	if (unlikely(vma->vm_flags & VM_PFNMAP))
-		untrack_pfn_clear(vma);
-
-	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) {
-		/* We always clear VM_LOCKED[ONFAULT] on the old vma */
-		vm_flags_clear(vma, VM_LOCKED_MASK);
-
-		/*
-		 * anon_vma links of the old vma is no longer needed after its page
-		 * table has been moved.
-		 */
-		if (new_vma != vma && vma->vm_start == old_addr &&
-		    vma->vm_end == (old_addr + old_len))
-			unlink_anon_vmas(vma);
-
-		/* Because we won't unmap we don't need to touch locked_vm */
-		vrm_stat_account(vrm, new_len);
-		return new_addr;
-	}
-
-	vrm_stat_account(vrm, new_len);
-
-	vma_iter_init(&vmi, mm, old_addr);
-	if (do_vmi_munmap(&vmi, mm, old_addr, old_len, vrm->uf_unmap, false) < 0) {
-		/* OOM: unable to split vma, just get accounts right */
-		if (vm_flags & VM_ACCOUNT && !(vrm->flags & MREMAP_DONTUNMAP))
-			vm_acct_memory(old_len >> PAGE_SHIFT);
-		account_start = account_end = false;
-	}
+	vrm_stat_account(vrm, vrm->new_len);
+	if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
+		dontunmap_complete(vrm, new_vma);
+	else
+		unmap_source_vma(vrm);

 	mm->hiwater_vm = hiwater_vm;

-	/* Restore VM_ACCOUNT if one or two pieces of vma left */
-	if (account_start) {
-		vma = vma_prev(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	if (account_end) {
-		vma = vma_next(&vmi);
-		vm_flags_set(vma, VM_ACCOUNT);
-	}
-
-	return new_addr;
+	return err ? (unsigned long)err : vrm->new_addr;
 }

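Finally, the call that reaches move_vma() in the first place: a userspace mremap() that cannot grow in place and is allowed to relocate. A minimal sketch; on success the returned address may differ from the original, which is exactly the new_addr move_vma() hands back.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 4096, new_len = 64 * 4096;
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* May relocate; the old address is invalid afterwards if it does. */
	void *q = mremap(p, old_len, new_len, MREMAP_MAYMOVE);

	if (q == MAP_FAILED)
		perror("mremap");
	else
		printf("grown at %p (moved: %s)\n", q, q == p ? "no" : "yes");
	return 0;
}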
/*