@@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 
+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)	\
+	for (shadow_walk_init(&(_walker), _vcpu, _addr);		\
+	     shadow_walk_okay(&(_walker)) &&				\
+		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });	\
+	     __shadow_walk_next(&(_walker), spte))
+
 static struct kmem_cache *pte_list_desc_cache;
 static struct kmem_cache *mmu_page_header_cache;
 static struct percpu_counter kvm_total_used_mmu_pages;
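
The loop condition relies on a GCC statement expression: ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }) snapshots the current entry into spte and then yields 1, so it never terminates the loop on its own; only shadow_walk_okay() does, and __shadow_walk_next() advances using that snapshot instead of re-reading a possibly changing entry. A minimal standalone sketch of the idiom (hypothetical example with made-up data, not kernel code; builds with GCC/Clang, which support statement expressions):

    #include <stdio.h>

    int main(void)
    {
        int table[4] = { 11, 22, 33, 0 };
        int i, val;

        /* The statement expression captures table[i] and always yields 1,
         * so only the i < 4 test can stop the loop. */
        for (i = 0; i < 4 && ({ val = table[i]; 1; }); i++)
            printf("level %d -> %d\n", i, val);

        return 0;
    }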
@@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 {
 	return xchg(sptep, spte);
 }
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	return ACCESS_ONCE(*sptep);
+}
 #else
 union split_spte {
 	struct {
@@ -283,6 +294,18 @@ union split_spte {
 	u64 spte;
 };
 
+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+	if (is_shadow_present_pte(spte))
+		return;
+
+	/* Ensure the spte is completely set before we increase the count */
+	smp_wmb();
+	sp->clear_spte_count++;
+}
+
 static void __set_spte(u64 *sptep, u64 spte)
 {
 	union split_spte *ssptep, sspte;
@@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
 	smp_wmb();
 
 	ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
 }
 
 static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
@@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
 	/* xchg acts as a barrier before the setting of the high bits */
 	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
 	orig.spte_high = ssptep->spte_high = sspte.spte_high;
+	count_spte_clear(sptep, spte);
 
 	return orig.spte;
 }
 
+/*
+ * The idea of reading the spte locklessly on x86_32 guests comes from
+ * gup_get_pte (arch/x86/mm/gup.c).
+ * The difference is that we cannot catch the spte TLB flush if we leave
+ * guest mode, so we emulate it by increasing clear_spte_count when a
+ * spte is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+	struct kvm_mmu_page *sp = page_header(__pa(sptep));
+	union split_spte spte, *orig = (union split_spte *)sptep;
+	int count;
+
+retry:
+	count = sp->clear_spte_count;
+	smp_rmb();
+
+	spte.spte_low = orig->spte_low;
+	smp_rmb();
+
+	spte.spte_high = orig->spte_high;
+	smp_rmb();
+
+	if (unlikely(spte.spte_low != orig->spte_low ||
+	      count != sp->clear_spte_count))
+		goto retry;
+
+	return spte.spte;
+}
 #endif
 
 static bool spte_has_volatile_bits(u64 spte)
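
On 32-bit hosts the spte cannot be read atomically, so the reader takes a seqcount-style snapshot: read clear_spte_count, then both halves, then re-check the low half and the count, and retry if anything moved. A user-space sketch of the same scheme (hypothetical names split64, clear_split64 and read_split64_lockless, plain C11 atomics standing in for the kernel's smp_* barriers; not the kernel implementation):

    #include <stdatomic.h>
    #include <stdint.h>

    struct split64 {
        _Atomic uint32_t low;
        _Atomic uint32_t high;
        _Atomic int      clear_count;   /* bumped by the writer on clear */
    };

    /* Writer: mirrors __update_clear_spte_fast() + count_spte_clear() --
     * drop the low (present) half first, then the high half, then
     * advertise the clear so in-flight readers will retry. */
    void clear_split64(struct split64 *s)
    {
        atomic_store_explicit(&s->low, 0, memory_order_release);
        atomic_store_explicit(&s->high, 0, memory_order_release);
        atomic_fetch_add_explicit(&s->clear_count, 1, memory_order_release);
    }

    /* Reader: mirrors __get_spte_lockless() -- retry if the low half or
     * the clear counter changed while the two halves were being read. */
    uint64_t read_split64_lockless(struct split64 *s)
    {
        uint32_t lo, hi;
        int count;

    retry:
        count = atomic_load_explicit(&s->clear_count, memory_order_acquire);
        lo = atomic_load_explicit(&s->low, memory_order_acquire);
        hi = atomic_load_explicit(&s->high, memory_order_acquire);

        if (lo != atomic_load_explicit(&s->low, memory_order_relaxed) ||
            count != atomic_load_explicit(&s->clear_count, memory_order_relaxed))
            goto retry;

        return ((uint64_t)hi << 32) | lo;
    }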
@@ -435,6 +490,28 @@ static void mmu_spte_clear_no_track(u64 *sptep)
 	__update_clear_spte_fast(sptep, 0ull);
 }
 
+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+	return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+	rcu_read_lock();
+	atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+	/* Increase the counter before walking the shadow page table */
+	smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+	/* Decrease the counter after the shadow page table walk has finished */
+	smp_mb__before_atomic_dec();
+	atomic_dec(&vcpu->kvm->arch.reader_counter);
+	rcu_read_unlock();
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 {
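
These two helpers frame every lockless walk: the reader enters an RCU read-side critical section and bumps reader_counter with a full barrier before touching the page tables, while the zap path added below checks that counter after the TLB flush and, if readers may still be walking, defers freeing to an RCU callback. A hedged kernel-context sketch of the intended caller pattern (spte_mapped_lockless is a hypothetical name; it simply mirrors the kvm_mmu_get_spte_hierarchy conversion at the end of this patch):

    /* Kernel-context sketch only; compare kvm_mmu_get_spte_hierarchy below. */
    static bool spte_mapped_lockless(struct kvm_vcpu *vcpu, u64 addr)
    {
        struct kvm_shadow_walk_iterator iterator;
        u64 spte;
        bool mapped = false;

        walk_shadow_page_lockless_begin(vcpu);
        for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
            if (!is_shadow_present_pte(spte))
                break;
            if (is_last_spte(spte, iterator.level)) {
                mapped = true;
                break;
            }
        }
        walk_shadow_page_lockless_end(vcpu);

        return mapped;
    }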
@@ -1597,17 +1674,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 	return true;
 }
 
-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+			       u64 spte)
 {
-	if (is_last_spte(*iterator->sptep, iterator->level)) {
+	if (is_last_spte(spte, iterator->level)) {
 		iterator->level = 0;
 		return;
 	}
 
-	iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
 	--iterator->level;
 }
 
+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+	return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
 static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
 {
 	u64 spte;
@@ -1754,6 +1837,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
 	return ret;
 }
 
+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	list_for_each_entry(sp, invalid_list, link)
+		kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+	struct kvm_mmu_page *next, *sp;
+
+	sp = container_of(head, struct kvm_mmu_page, rcu);
+	while (sp) {
+		if (!list_empty(&sp->link))
+			next = list_first_entry(&sp->link,
+				      struct kvm_mmu_page, link);
+		else
+			next = NULL;
+		kvm_mmu_free_page(sp);
+		sp = next;
+	}
+}
+
 static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 				    struct list_head *invalid_list)
 {
@@ -1764,6 +1871,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 
 	kvm_flush_remote_tlbs(kvm);
 
+	if (atomic_read(&kvm->arch.reader_counter)) {
+		kvm_mmu_isolate_pages(invalid_list);
+		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+		list_del_init(invalid_list);
+		call_rcu(&sp->rcu, free_pages_rcu);
+		return;
+	}
+
 	do {
 		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
 		WARN_ON(!sp->role.invalid || sp->root_count);
@@ -3784,16 +3899,17 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
 {
 	struct kvm_shadow_walk_iterator iterator;
+	u64 spte;
 	int nr_sptes = 0;
 
-	spin_lock(&vcpu->kvm->mmu_lock);
-	for_each_shadow_entry(vcpu, addr, iterator) {
-		sptes[iterator.level-1] = *iterator.sptep;
+	walk_shadow_page_lockless_begin(vcpu);
+	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+		sptes[iterator.level-1] = spte;
 		nr_sptes++;
-		if (!is_shadow_present_pte(*iterator.sptep))
+		if (!is_shadow_present_pte(spte))
 			break;
 	}
-	spin_unlock(&vcpu->kvm->mmu_lock);
+	walk_shadow_page_lockless_end(vcpu);
 
 	return nr_sptes;
 }