Commit 7138970

djbw authored and Ingo Molnar committed
mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash
The x86 conversion to the generic GUP code included a small change which
causes crashes and data corruption in the pmem code - not good.

The root cause is that the /dev/pmem driver code implicitly relies on the
x86 get_user_pages() implementation doing a get_page() on the page
refcount, because get_page() does a get_zone_device_page() which properly
refcounts pmem's separate page struct arrays that are not present in the
regular page struct structures. (The pmem driver does this because it can
cover huge memory areas.)

But the x86 conversion to the generic GUP code changed the get_page() to
page_cache_get_speculative() which is faster but doesn't do the
get_zone_device_page() call the pmem code relies on.

One way to solve the regression would be to change the generic GUP code to
use get_page(), but that would slow things down a bit and punish other
generic-GUP using architectures for an x86-ism they did not care about.
(Arguably the pmem driver was probably not working reliably for them: but
nvdimm is an Intel feature, so non-x86 exposure is probably still limited.)

So restructure the pmem code's interface with the MM instead: get rid of
the get/put_zone_device_page() distinction, integrate
put_zone_device_page() into __put_page() and restructure the pmem
completion-wait and teardown machinery:

 - Kirill points out that the calls to {get,put}_dev_pagemap() can be
   removed from the mm fast path if we take a single get_dev_pagemap()
   reference to signify that the page is alive and use the final put of
   the page to drop that reference.

 - This does require some care to make sure that any waits for the
   percpu_ref to drop to zero occur *after* devm_memremap_pages_release(),
   since it now maintains its own elevated reference.

This speeds things up while also making the pmem refcounting more robust
going forward.

Suggested-by: Kirill Shutemov <[email protected]>
Tested-by: Kirill Shutemov <[email protected]>
Signed-off-by: Dan Williams <[email protected]>
Reviewed-by: Logan Gunthorpe <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Jérôme Glisse <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com
Signed-off-by: Ingo Molnar <[email protected]>
1 parent dbd68d8 commit 7138970
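
To make the new lifetime rule concrete before reading the diffs: every
ZONE_DEVICE page now takes one reference on its dev_pagemap when
devm_memremap_pages() sets the page up, and the final put of the page drops
that reference, so the GUP fast path never has to touch the pagemap. The
sketch below is a minimal userspace model of that rule using plain C11
atomics; struct pagemap, map_pages(), put_page_ref() and release_pages()
are illustrative stand-ins for the kernel structures, not kernel APIs, and
the percpu_ref kill/confirm machinery is deliberately omitted.

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

#define NPAGES 4

/* Illustrative stand-in for struct dev_pagemap and its percpu_ref. */
struct pagemap {
	atomic_long refs;
};

/* Illustrative stand-in for struct page; count mirrors the page refcount. */
struct page {
	struct pagemap *pgmap;
	atomic_long count;
};

/* devm_memremap_pages() analogue: each device page pins the pagemap once. */
static void map_pages(struct pagemap *pgmap, struct page pages[], int n)
{
	for (int i = 0; i < n; i++) {
		pages[i].pgmap = pgmap;
		atomic_init(&pages[i].count, 1);
		atomic_fetch_add(&pgmap->refs, 1);
	}
}

/* put_page()/__put_page() analogue: only the final put releases the pin. */
static void put_page_ref(struct page *page)
{
	if (atomic_fetch_sub(&page->count, 1) == 1)
		atomic_fetch_sub(&page->pgmap->refs, 1);
}

/*
 * devm_memremap_pages_release() analogue: drop the per-page pins; only
 * after this may the producer wait for the reference count to hit zero.
 */
static void release_pages(struct pagemap *pgmap, struct page pages[], int n)
{
	for (int i = 0; i < n; i++)
		put_page_ref(&pages[i]);
	assert(atomic_load(&pgmap->refs) == 0);
}

int main(void)
{
	struct pagemap pgmap;
	struct page pages[NPAGES];

	atomic_init(&pgmap.refs, 0);
	map_pages(&pgmap, pages, NPAGES);
	release_pages(&pgmap, pages, NPAGES);
	printf("pagemap refs after teardown: %ld\n",
	       (long)atomic_load(&pgmap.refs));
	return 0;
}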

5 files changed (+31, -30 lines)

drivers/dax/pmem.c

Lines changed: 1 addition & 1 deletion

@@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data)
 	struct dax_pmem *dax_pmem = to_dax_pmem(ref);
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
+	wait_for_completion(&dax_pmem->cmp);
 	percpu_ref_exit(ref);
 }
 
@@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data)
 
 	dev_dbg(dax_pmem->dev, "%s\n", __func__);
 	percpu_ref_kill(ref);
-	wait_for_completion(&dax_pmem->cmp);
 }
 
 static int dax_pmem_probe(struct device *dev)
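
Read together, the two hunks above move the wait for outstanding references
out of the kill step and into the exit step. Because devm actions run in
reverse registration order, teardown now proceeds kill ->
devm_memremap_pages_release() -> exit, so the wait_for_completion() only
happens after the release path has dropped the per-page references, exactly
as the commit message requires. A reconstructed view of the two callbacks
after this patch follows; the first line of each body (the void *data to
percpu_ref conversion) is not visible in the hunks and is filled in as an
assumption.

/* Post-patch state of the dax_pmem devm callbacks (reconstruction). */
static void dax_pmem_percpu_exit(void *data)
{
	struct percpu_ref *ref = data;			/* assumed, outside hunk */
	struct dax_pmem *dax_pmem = to_dax_pmem(ref);

	dev_dbg(dax_pmem->dev, "%s\n", __func__);
	wait_for_completion(&dax_pmem->cmp);		/* wait now happens here */
	percpu_ref_exit(ref);
}

static void dax_pmem_percpu_kill(void *data)
{
	struct percpu_ref *ref = data;			/* assumed, outside hunk */
	struct dax_pmem *dax_pmem = to_dax_pmem(ref);

	dev_dbg(dax_pmem->dev, "%s\n", __func__);
	percpu_ref_kill(ref);				/* only starts the shutdown */
}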

drivers/nvdimm/pmem.c

Lines changed: 11 additions & 2 deletions

@@ -25,6 +25,7 @@
 #include <linux/badblocks.h>
 #include <linux/memremap.h>
 #include <linux/vmalloc.h>
+#include <linux/blk-mq.h>
 #include <linux/pfn_t.h>
 #include <linux/slab.h>
 #include <linux/pmem.h>
@@ -231,6 +232,11 @@ static void pmem_release_queue(void *q)
 	blk_cleanup_queue(q);
 }
 
+static void pmem_freeze_queue(void *q)
+{
+	blk_mq_freeze_queue_start(q);
+}
+
 static void pmem_release_disk(void *disk)
 {
 	del_gendisk(disk);
@@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev,
 	if (!q)
 		return -ENOMEM;
 
+	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+		return -ENOMEM;
+
 	pmem->pfn_flags = PFN_DEV;
 	if (is_nd_pfn(dev)) {
 		addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter,
@@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev,
 			pmem->size, ARCH_MEMREMAP_PMEM);
 
 	/*
-	 * At release time the queue must be dead before
+	 * At release time the queue must be frozen before
 	 * devm_memremap_pages is unwound
 	 */
-	if (devm_add_action_or_reset(dev, pmem_release_queue, q))
+	if (devm_add_action_or_reset(dev, pmem_freeze_queue, q))
 		return -ENOMEM;
 
 	if (IS_ERR(addr))

include/linux/mm.h

Lines changed: 0 additions & 14 deletions

@@ -762,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page)
 }
 
 #ifdef CONFIG_ZONE_DEVICE
-void get_zone_device_page(struct page *page);
-void put_zone_device_page(struct page *page);
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return page_zonenum(page) == ZONE_DEVICE;
 }
 #else
-static inline void get_zone_device_page(struct page *page)
-{
-}
-static inline void put_zone_device_page(struct page *page)
-{
-}
 static inline bool is_zone_device_page(const struct page *page)
 {
 	return false;
@@ -790,9 +782,6 @@ static inline void get_page(struct page *page)
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		get_zone_device_page(page);
 }
 
 static inline void put_page(struct page *page)
@@ -801,9 +790,6 @@ static inline void put_page(struct page *page)
 
 	if (put_page_testzero(page))
 		__put_page(page);
-
-	if (unlikely(is_zone_device_page(page)))
-		put_zone_device_page(page);
 }
 
 #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)

kernel/memremap.c

Lines changed: 9 additions & 13 deletions

@@ -182,18 +182,6 @@ struct page_map {
 	struct vmem_altmap altmap;
 };
 
-void get_zone_device_page(struct page *page)
-{
-	percpu_ref_get(page->pgmap->ref);
-}
-EXPORT_SYMBOL(get_zone_device_page);
-
-void put_zone_device_page(struct page *page)
-{
-	put_dev_pagemap(page->pgmap);
-}
-EXPORT_SYMBOL(put_zone_device_page);
-
 static void pgmap_radix_release(struct resource *res)
 {
 	resource_size_t key, align_start, align_size, align_end;
@@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 	struct resource *res = &page_map->res;
 	resource_size_t align_start, align_size;
 	struct dev_pagemap *pgmap = &page_map->pgmap;
+	unsigned long pfn;
+
+	for_each_device_pfn(pfn, page_map)
+		put_page(pfn_to_page(pfn));
 
 	if (percpu_ref_tryget_live(pgmap->ref)) {
 		dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
@@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
  *
  * Notes:
  * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time
- *    (or devm release event).
+ *    (or devm release event). The expected order of events is that @ref has
+ *    been through percpu_ref_kill() before devm_memremap_pages_release(). The
+ *    wait for the completion of all references being dropped and
+ *    percpu_ref_exit() must occur after devm_memremap_pages_release().
  *
  * 2/ @res is expected to be a host memory range that could feasibly be
  *    treated as a "System RAM" range, i.e. not a device mmio range, but
@@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		 */
 		list_del(&page->lru);
 		page->pgmap = pgmap;
+		percpu_ref_get(ref);
 	}
 	devres_add(dev, page_map);
 	return __va(res->start);

mm/swap.c

Lines changed: 10 additions & 0 deletions

@@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page)
 
 void __put_page(struct page *page)
 {
+	if (is_zone_device_page(page)) {
+		put_dev_pagemap(page->pgmap);
+
+		/*
+		 * The page belongs to the device that created pgmap. Do
+		 * not return it to page allocator.
+		 */
+		return;
+	}
+
 	if (unlikely(PageCompound(page)))
 		__put_compound_page(page);
 	else
