Skip to content

Commit c04e989

Browse files
DemiMarie authored and jgross1 committed
xen: speed up grant-table reclaim
When a grant entry is still in use by the remote domain, Linux must put it on a deferred list. Normally, this list is very short, because the PV network and block protocols expect the backend to unmap the grant first. However, Qubes OS's GUI protocol is subject to the constraints of the X Window System, and as such winds up with the frontend unmapping the window first. As a result, the list can grow very large, resulting in a massive memory leak and eventual VM freeze.

To partially solve this problem, make the number of entries that the VM will attempt to free at each iteration tunable. The default is still 10, but it can be overridden via a module parameter.

This is Cc: stable because (when combined with appropriate userspace changes) it fixes a severe performance and stability problem for Qubes OS users.

Cc: [email protected]
Signed-off-by: Demi Marie Obenour <[email protected]>
Reviewed-by: Juergen Gross <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Juergen Gross <[email protected]>
1 parent 58f6259 commit c04e989

File tree

2 files changed

+40
-11
lines changed

2 files changed

+40
-11
lines changed

Documentation/ABI/testing/sysfs-module

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,14 @@ Description: Module taint flags:
6060
C staging driver module
6161
E unsigned module
6262
== =====================
63+
64+
What: /sys/module/grant_table/parameters/free_per_iteration
65+
Date: July 2023
66+
KernelVersion: 6.5 but backported to all supported stable branches
67+
Contact: Xen developer discussion <[email protected]>
68+
Description: Read and write number of grant entries to attempt to free per iteration.
69+
70+
Note: Future versions of Xen and Linux may provide a better
71+
interface for controlling the rate of deferred grant reclaim
72+
or may not need it at all.
73+
Users: Qubes OS (https://www.qubes-os.org)

drivers/xen/grant-table.c

Lines changed: 29 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -498,14 +498,21 @@ static LIST_HEAD(deferred_list);
498498
static void gnttab_handle_deferred(struct timer_list *);
499499
static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred);
500500

501+
static atomic64_t deferred_count;
502+
static atomic64_t leaked_count;
503+
static unsigned int free_per_iteration = 10;
504+
module_param(free_per_iteration, uint, 0600);
505+
501506
static void gnttab_handle_deferred(struct timer_list *unused)
502507
{
503-
unsigned int nr = 10;
508+
unsigned int nr = READ_ONCE(free_per_iteration);
509+
const bool ignore_limit = nr == 0;
504510
struct deferred_entry *first = NULL;
505511
unsigned long flags;
512+
size_t freed = 0;
506513

507514
spin_lock_irqsave(&gnttab_list_lock, flags);
508-
while (nr--) {
515+
while ((ignore_limit || nr--) && !list_empty(&deferred_list)) {
509516
struct deferred_entry *entry
510517
= list_first_entry(&deferred_list,
511518
struct deferred_entry, list);
@@ -515,10 +522,14 @@ static void gnttab_handle_deferred(struct timer_list *unused)
515522
list_del(&entry->list);
516523
spin_unlock_irqrestore(&gnttab_list_lock, flags);
517524
if (_gnttab_end_foreign_access_ref(entry->ref)) {
525+
uint64_t ret = atomic64_dec_return(&deferred_count);
526+
518527
put_free_entry(entry->ref);
519-
pr_debug("freeing g.e. %#x (pfn %#lx)\n",
520-
entry->ref, page_to_pfn(entry->page));
528+
pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n",
529+
entry->ref, page_to_pfn(entry->page),
530+
(unsigned long long)ret);
521531
put_page(entry->page);
532+
freed++;
522533
kfree(entry);
523534
entry = NULL;
524535
} else {
@@ -530,21 +541,22 @@ static void gnttab_handle_deferred(struct timer_list *unused)
530541
spin_lock_irqsave(&gnttab_list_lock, flags);
531542
if (entry)
532543
list_add_tail(&entry->list, &deferred_list);
533-
else if (list_empty(&deferred_list))
534-
break;
535544
}
536-
if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) {
545+
if (list_empty(&deferred_list))
546+
WARN_ON(atomic64_read(&deferred_count));
547+
else if (!timer_pending(&deferred_timer)) {
537548
deferred_timer.expires = jiffies + HZ;
538549
add_timer(&deferred_timer);
539550
}
540551
spin_unlock_irqrestore(&gnttab_list_lock, flags);
552+
pr_debug("Freed %zu references", freed);
541553
}
542554

543555
static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
544556
{
545557
struct deferred_entry *entry;
546558
gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
547-
const char *what = KERN_WARNING "leaking";
559+
uint64_t leaked, deferred;
548560

549561
entry = kmalloc(sizeof(*entry), gfp);
550562
if (!page) {
@@ -567,10 +579,16 @@ static void gnttab_add_deferred(grant_ref_t ref, struct page *page)
567579
add_timer(&deferred_timer);
568580
}
569581
spin_unlock_irqrestore(&gnttab_list_lock, flags);
570-
what = KERN_DEBUG "deferring";
582+
deferred = atomic64_inc_return(&deferred_count);
583+
leaked = atomic64_read(&leaked_count);
584+
pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
585+
ref, page ? page_to_pfn(page) : -1, deferred, leaked);
586+
} else {
587+
deferred = atomic64_read(&deferred_count);
588+
leaked = atomic64_inc_return(&leaked_count);
589+
pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n",
590+
ref, page ? page_to_pfn(page) : -1, deferred, leaked);
571591
}
572-
printk("%s g.e. %#x (pfn %#lx)\n",
573-
what, ref, page ? page_to_pfn(page) : -1);
574592
}
575593

576594
int gnttab_try_end_foreign_access(grant_ref_t ref)

0 commit comments

Comments
 (0)