Skip to content

Commit 6b3a707

Browse files
committed
Merge branch 'page-refs' (page ref overflow)
Merge page ref overflow branch.

Jann Horn reported that he can overflow the page ref count with sufficient memory (and a filesystem that is intentionally extremely slow).

Admittedly it's not exactly easy. To have more than four billion references to a page requires a minimum of 32GB of kernel memory just for the pointers to the pages, much less any metadata to keep track of those pointers. Jann needed a total of 140GB of memory and a specially crafted filesystem that leaves all reads pending (in order to not ever free the page references and just keep adding more).

Still, we have a fairly straightforward way to limit the two obvious user-controllable sources of page references: direct-IO like page references gotten through get_user_pages(), and the splice pipe page duplication.

So let's just do that.

* branch page-refs:
  - fs: prevent page refcount overflow in pipe_buf_get
  - mm: prevent get_user_pages() from overflowing page refcount
  - mm: add 'try_get_page()' helper function
  - mm: make page ref count overflow check tighter and more explicit
2 parents 4443f8e + 15fab63 commit 6b3a707

File tree

8 files changed

+92
-28
lines changed

8 files changed

+92
-28
lines changed

fs/fuse/dev.c

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
20562056
rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
20572057

20582058
ret = -EINVAL;
2059-
if (rem < len) {
2060-
pipe_unlock(pipe);
2061-
goto out;
2062-
}
2059+
if (rem < len)
2060+
goto out_free;
20632061

20642062
rem = len;
20652063
while (rem) {
@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
20772075
pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
20782076
pipe->nrbufs--;
20792077
} else {
2080-
pipe_buf_get(pipe, ibuf);
2078+
if (!pipe_buf_get(pipe, ibuf))
2079+
goto out_free;
2080+
20812081
*obuf = *ibuf;
20822082
obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
20832083
obuf->len = rem;
@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
21002100
ret = fuse_dev_do_write(fud, &cs, len);
21012101

21022102
pipe_lock(pipe);
2103+
out_free:
21032104
for (idx = 0; idx < nbuf; idx++)
21042105
pipe_buf_release(pipe, &bufs[idx]);
21052106
pipe_unlock(pipe);
21062107

2107-
out:
21082108
kvfree(bufs);
21092109
return ret;
21102110
}

fs/pipe.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
188188
* in the tee() system call, when we duplicate the buffers in one
189189
* pipe into another.
190190
*/
191-
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
191+
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
192192
{
193-
get_page(buf->page);
193+
return try_get_page(buf->page);
194194
}
195195
EXPORT_SYMBOL(generic_pipe_buf_get);
196196

fs/splice.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1593,7 +1593,11 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
15931593
* Get a reference to this pipe buffer,
15941594
* so we can copy the contents over.
15951595
*/
1596-
pipe_buf_get(ipipe, ibuf);
1596+
if (!pipe_buf_get(ipipe, ibuf)) {
1597+
if (ret == 0)
1598+
ret = -EFAULT;
1599+
break;
1600+
}
15971601
*obuf = *ibuf;
15981602

15991603
/*
@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
16671671
* Get a reference to this pipe buffer,
16681672
* so we can copy the contents over.
16691673
*/
1670-
pipe_buf_get(ipipe, ibuf);
1674+
if (!pipe_buf_get(ipipe, ibuf)) {
1675+
if (ret == 0)
1676+
ret = -EFAULT;
1677+
break;
1678+
}
16711679

16721680
obuf = opipe->bufs + nbuf;
16731681
*obuf = *ibuf;

include/linux/mm.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -966,15 +966,28 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
966966
}
967967
#endif /* CONFIG_DEV_PAGEMAP_OPS */
968968

969+
/* 127: arbitrary random number, small enough to assemble well */
970+
#define page_ref_zero_or_close_to_overflow(page) \
971+
((unsigned int) page_ref_count(page) + 127u <= 127u)
972+
969973
static inline void get_page(struct page *page)
970974
{
971975
page = compound_head(page);
972976
/*
973977
* Getting a normal page or the head of a compound page
974978
* requires to already have an elevated page->_refcount.
975979
*/
976-
VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
980+
VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
981+
page_ref_inc(page);
982+
}
983+
984+
static inline __must_check bool try_get_page(struct page *page)
985+
{
986+
page = compound_head(page);
987+
if (WARN_ON_ONCE(page_ref_count(page) <= 0))
988+
return false;
977989
page_ref_inc(page);
990+
return true;
978991
}
979992

980993
static inline void put_page(struct page *page)

include/linux/pipe_fs_i.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -101,18 +101,20 @@ struct pipe_buf_operations {
101101
/*
102102
* Get a reference to the pipe buffer.
103103
*/
104-
void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
104+
bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
105105
};
106106

107107
/**
108108
* pipe_buf_get - get a reference to a pipe_buffer
109109
* @pipe: the pipe that the buffer belongs to
110110
* @buf: the buffer to get a reference to
111+
*
112+
* Return: %true if the reference was successfully obtained.
111113
*/
112-
static inline void pipe_buf_get(struct pipe_inode_info *pipe,
114+
static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
113115
struct pipe_buffer *buf)
114116
{
115-
buf->ops->get(pipe, buf);
117+
return buf->ops->get(pipe, buf);
116118
}
117119

118120
/**
@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void);
171173
void free_pipe_info(struct pipe_inode_info *);
172174

173175
/* Generic pipe buffer ops functions */
174-
void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
176+
bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
175177
int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
176178
int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
177179
void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);

kernel/trace/trace.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
70417041
buf->private = 0;
70427042
}
70437043

7044-
static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
7044+
static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
70457045
struct pipe_buffer *buf)
70467046
{
70477047
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
70487048

7049+
if (ref->ref > INT_MAX/2)
7050+
return false;
7051+
70497052
ref->ref++;
7053+
return true;
70507054
}
70517055

70527056
/* Pipe buffer operations for a buffer. */

mm/gup.c

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,12 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
160160
goto retry;
161161
}
162162

163-
if (flags & FOLL_GET)
164-
get_page(page);
163+
if (flags & FOLL_GET) {
164+
if (unlikely(!try_get_page(page))) {
165+
page = ERR_PTR(-ENOMEM);
166+
goto out;
167+
}
168+
}
165169
if (flags & FOLL_TOUCH) {
166170
if ((flags & FOLL_WRITE) &&
167171
!pte_dirty(pte) && !PageDirty(page))
@@ -298,7 +302,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
298302
if (pmd_trans_unstable(pmd))
299303
ret = -EBUSY;
300304
} else {
301-
get_page(page);
305+
if (unlikely(!try_get_page(page))) {
306+
spin_unlock(ptl);
307+
return ERR_PTR(-ENOMEM);
308+
}
302309
spin_unlock(ptl);
303310
lock_page(page);
304311
ret = split_huge_page(page);
@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
500507
if (is_device_public_page(*page))
501508
goto unmap;
502509
}
503-
get_page(*page);
510+
if (unlikely(!try_get_page(*page))) {
511+
ret = -ENOMEM;
512+
goto unmap;
513+
}
504514
out:
505515
ret = 0;
506516
unmap:
@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
15451555
}
15461556
}
15471557

1558+
/*
1559+
* Return the compound head page with ref appropriately incremented,
1560+
* or NULL if that failed.
1561+
*/
1562+
static inline struct page *try_get_compound_head(struct page *page, int refs)
1563+
{
1564+
struct page *head = compound_head(page);
1565+
if (WARN_ON_ONCE(page_ref_count(head) < 0))
1566+
return NULL;
1567+
if (unlikely(!page_cache_add_speculative(head, refs)))
1568+
return NULL;
1569+
return head;
1570+
}
1571+
15481572
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
15491573
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
15501574
int write, struct page **pages, int *nr)
@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
15791603

15801604
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
15811605
page = pte_page(pte);
1582-
head = compound_head(page);
15831606

1584-
if (!page_cache_get_speculative(head))
1607+
head = try_get_compound_head(page, 1);
1608+
if (!head)
15851609
goto pte_unmap;
15861610

15871611
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
17201744
refs++;
17211745
} while (addr += PAGE_SIZE, addr != end);
17221746

1723-
head = compound_head(pmd_page(orig));
1724-
if (!page_cache_add_speculative(head, refs)) {
1747+
head = try_get_compound_head(pmd_page(orig), refs);
1748+
if (!head) {
17251749
*nr -= refs;
17261750
return 0;
17271751
}
@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
17581782
refs++;
17591783
} while (addr += PAGE_SIZE, addr != end);
17601784

1761-
head = compound_head(pud_page(orig));
1762-
if (!page_cache_add_speculative(head, refs)) {
1785+
head = try_get_compound_head(pud_page(orig), refs);
1786+
if (!head) {
17631787
*nr -= refs;
17641788
return 0;
17651789
}
@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
17951819
refs++;
17961820
} while (addr += PAGE_SIZE, addr != end);
17971821

1798-
head = compound_head(pgd_page(orig));
1799-
if (!page_cache_add_speculative(head, refs)) {
1822+
head = try_get_compound_head(pgd_page(orig), refs);
1823+
if (!head) {
18001824
*nr -= refs;
18011825
return 0;
18021826
}

mm/hugetlb.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
42994299

43004300
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
43014301
page = pte_page(huge_ptep_get(pte));
4302+
4303+
/*
4304+
* Instead of doing 'try_get_page()' below in the same_page
4305+
* loop, just check the count once here.
4306+
*/
4307+
if (unlikely(page_count(page) <= 0)) {
4308+
if (pages) {
4309+
spin_unlock(ptl);
4310+
remainder = 0;
4311+
err = -ENOMEM;
4312+
break;
4313+
}
4314+
}
43024315
same_page:
43034316
if (pages) {
43044317
pages[i] = mem_map_offset(page, pfn_offset);

0 commit comments

Comments
 (0)