Commit eedd5b1
RDMA/umem: Store ODP access mask information in PFN
As a preparation to remove dma_list, store access mask in PFN pointer and not in dma_addr_t.

Tested-by: Jens Axboe <[email protected]>
Reviewed-by: Jason Gunthorpe <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
1 parent 8cad471 commit eedd5b1
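The idea, in short: the per-page R/W permission that used to be encoded in the low bits of each dma_list entry (ODP_READ_ALLOWED_BIT / ODP_WRITE_ALLOWED_BIT) now lives in the HMM flag bits of the corresponding pfn_list entry. Below is a minimal sketch of how a consumer could read permissions after this change, assuming only the standard HMM_PFN_VALID and HMM_PFN_WRITE flags from <linux/hmm.h>; the helper odp_pfn_writable() is illustrative and not part of the patch:

#include <linux/hmm.h>
#include <linux/types.h>

/*
 * Illustrative helper (not in this commit): derive write permission for
 * one ODP entry from the hmm pfn value itself, rather than from the low
 * bits of the dma_addr_t as before this patch.
 */
static bool odp_pfn_writable(unsigned long pfn_entry)
{
	/* An entry that was never populated carries no meaningful flags. */
	if (!(pfn_entry & HMM_PFN_VALID))
		return false;

	/* Write permission is now an HMM flag, not a stolen DMA-address bit. */
	return !!(pfn_entry & HMM_PFN_WRITE);
}

Read permission no longer needs a dedicated bit: a valid entry is readable by definition, which is why the mlx5 invalidation path below now tests HMM_PFN_VALID where it used to test the read/write bits.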

File tree: 5 files changed (+70 / -99 lines)
drivers/infiniband/core/umem_odp.c

Lines changed: 43 additions & 60 deletions

@@ -296,30 +296,18 @@ EXPORT_SYMBOL(ib_umem_odp_release);
 static int ib_umem_odp_map_dma_single_page(
 		struct ib_umem_odp *umem_odp,
 		unsigned int dma_index,
-		struct page *page,
-		u64 access_mask)
+		struct page *page)
 {
 	struct ib_device *dev = umem_odp->umem.ibdev;
 	dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
 
-	if (*dma_addr) {
-		/*
-		 * If the page is already dma mapped it means it went through
-		 * a non-invalidating trasition, like read-only to writable.
-		 * Resync the flags.
-		 */
-		*dma_addr = (*dma_addr & ODP_DMA_ADDR_MASK) | access_mask;
-		return 0;
-	}
-
 	*dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
 				    DMA_BIDIRECTIONAL);
 	if (ib_dma_mapping_error(dev, *dma_addr)) {
 		*dma_addr = 0;
 		return -EFAULT;
 	}
 	umem_odp->npages++;
-	*dma_addr |= access_mask;
 	return 0;
 }
 
@@ -355,9 +343,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	struct hmm_range range = {};
 	unsigned long timeout;
 
-	if (access_mask == 0)
-		return -EINVAL;
-
 	if (user_virt < ib_umem_start(umem_odp) ||
 	    user_virt + bcnt > ib_umem_end(umem_odp))
 		return -EFAULT;
@@ -383,7 +368,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	if (fault) {
 		range.default_flags = HMM_PFN_REQ_FAULT;
 
-		if (access_mask & ODP_WRITE_ALLOWED_BIT)
+		if (access_mask & HMM_PFN_WRITE)
 			range.default_flags |= HMM_PFN_REQ_WRITE;
 	}
 
@@ -415,22 +400,17 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 	for (pfn_index = 0; pfn_index < num_pfns;
 	     pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {
 
-		if (fault) {
-			/*
-			 * Since we asked for hmm_range_fault() to populate
-			 * pages it shouldn't return an error entry on success.
-			 */
-			WARN_ON(range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
-			WARN_ON(!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
-		} else {
-			if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID)) {
-				WARN_ON(umem_odp->dma_list[dma_index]);
-				continue;
-			}
-			access_mask = ODP_READ_ALLOWED_BIT;
-			if (range.hmm_pfns[pfn_index] & HMM_PFN_WRITE)
-				access_mask |= ODP_WRITE_ALLOWED_BIT;
-		}
+		/*
+		 * Since we asked for hmm_range_fault() to populate
+		 * pages it shouldn't return an error entry on success.
+		 */
+		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
+		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
+		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
+			continue;
+
+		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
+			continue;
 
 		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
 		/* If a hugepage was detected and ODP wasn't set for, the umem
@@ -445,13 +425,14 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
 		}
 
 		ret = ib_umem_odp_map_dma_single_page(
-				umem_odp, dma_index, hmm_pfn_to_page(range.hmm_pfns[pfn_index]),
-				access_mask);
+				umem_odp, dma_index,
+				hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
 		if (ret < 0) {
 			ibdev_dbg(umem_odp->umem.ibdev,
 				  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
 			break;
 		}
+		range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
 	}
 	/* upon success lock should stay on hold for the callee */
 	if (!ret)
@@ -471,7 +452,6 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 				 u64 bound)
 {
-	dma_addr_t dma_addr;
 	dma_addr_t dma;
 	int idx;
 	u64 addr;
@@ -482,34 +462,37 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
 	virt = max_t(u64, virt, ib_umem_start(umem_odp));
 	bound = min_t(u64, bound, ib_umem_end(umem_odp));
 	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
+		unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
+					PAGE_SHIFT;
+		struct page *page =
+			hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
+
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 		dma = umem_odp->dma_list[idx];
 
-		/* The access flags guaranteed a valid DMA address in case was NULL */
-		if (dma) {
-			unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
-			struct page *page = hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
-			dma_addr = dma & ODP_DMA_ADDR_MASK;
-			ib_dma_unmap_page(dev, dma_addr,
-					  BIT(umem_odp->page_shift),
-					  DMA_BIDIRECTIONAL);
-			if (dma & ODP_WRITE_ALLOWED_BIT) {
-				struct page *head_page = compound_head(page);
-				/*
-				 * set_page_dirty prefers being called with
-				 * the page lock. However, MMU notifiers are
-				 * called sometimes with and sometimes without
-				 * the lock. We rely on the umem_mutex instead
-				 * to prevent other mmu notifiers from
-				 * continuing and allowing the page mapping to
-				 * be removed.
-				 */
-				set_page_dirty(head_page);
-			}
-			umem_odp->dma_list[idx] = 0;
-			umem_odp->npages--;
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
+			goto clear;
+		if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+			goto clear;
+
+		ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
+				  DMA_BIDIRECTIONAL);
+		if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+			struct page *head_page = compound_head(page);
+			/*
+			 * set_page_dirty prefers being called with
+			 * the page lock. However, MMU notifiers are
+			 * called sometimes with and sometimes without
+			 * the lock. We rely on the umem_mutex instead
+			 * to prevent other mmu notifiers from
+			 * continuing and allowing the page mapping to
+			 * be removed.
			 */
+			set_page_dirty(head_page);
 		}
+		umem_odp->npages--;
+clear:
+		umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
 	}
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 1 addition & 0 deletions

@@ -351,6 +351,7 @@ struct mlx5_ib_flow_db {
 #define MLX5_IB_UPD_XLT_PD	      BIT(4)
 #define MLX5_IB_UPD_XLT_ACCESS	      BIT(5)
 #define MLX5_IB_UPD_XLT_INDIRECT      BIT(6)
+#define MLX5_IB_UPD_XLT_DOWNGRADE     BIT(7)
 
 /* Private QP creation flags to be passed in ib_qp_init_attr.create_flags.
  *

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 19 additions & 18 deletions

@@ -34,6 +34,7 @@
 #include <linux/kernel.h>
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
+#include <linux/hmm.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -158,31 +159,30 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
 	}
 }
 
-static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
-{
-	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
-
-	if (umem_dma & ODP_READ_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_READ;
-	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
-		mtt_entry |= MLX5_IB_MTT_WRITE;
-
-	return mtt_entry;
-}
-
 static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *mr, int flags)
 {
 	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
+	bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
+	unsigned long pfn;
 	dma_addr_t pa;
 	size_t i;
 
 	if (flags & MLX5_IB_UPD_XLT_ZAP)
 		return;
 
 	for (i = 0; i < nentries; i++) {
+		pfn = odp->pfn_list[idx + i];
+		if (!(pfn & HMM_PFN_VALID))
+			/* ODP initialization */
+			continue;
+
 		pa = odp->dma_list[idx + i];
-		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+		pa |= MLX5_IB_MTT_READ;
+		if ((pfn & HMM_PFN_WRITE) && !downgrade)
+			pa |= MLX5_IB_MTT_WRITE;
+
+		pas[i] = cpu_to_be64(pa);
 	}
 }
 
@@ -303,8 +303,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
-		if (umem_odp->dma_list[idx] &
-		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+		if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
 			if (!in_block) {
 				blk_start_idx = idx;
 				in_block = 1;
@@ -687,20 +686,22 @@ static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
 {
 	int page_shift, ret, np;
 	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
-	u64 access_mask;
+	u64 access_mask = 0;
 	u64 start_idx;
 	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
 	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;
 
 	if (flags & MLX5_PF_FLAGS_ENABLE)
 		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;
 
+	if (flags & MLX5_PF_FLAGS_DOWNGRADE)
+		xlt_flags |= MLX5_IB_UPD_XLT_DOWNGRADE;
+
 	page_shift = odp->page_shift;
 	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
-	access_mask = ODP_READ_ALLOWED_BIT;
 
 	if (odp->umem.writable && !downgrade)
-		access_mask |= ODP_WRITE_ALLOWED_BIT;
+		access_mask |= HMM_PFN_WRITE;
 
 	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
 	if (np < 0)

drivers/infiniband/sw/rxe/rxe_odp.c

Lines changed: 6 additions & 8 deletions

@@ -27,7 +27,7 @@ static bool rxe_ib_invalidate_range(struct mmu_interval_notifier *mni,
 	start = max_t(u64, ib_umem_start(umem_odp), range->start);
 	end = min_t(u64, ib_umem_end(umem_odp), range->end);
 
-	/* update umem_odp->dma_list */
+	/* update umem_odp->map.pfn_list */
 	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
 	mutex_unlock(&umem_odp->umem_mutex);
@@ -45,12 +45,11 @@ static int rxe_odp_do_pagefault_and_lock(struct rxe_mr *mr, u64 user_va, int bcn
 {
 	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
 	bool fault = !(flags & RXE_PAGEFAULT_SNAPSHOT);
-	u64 access_mask;
+	u64 access_mask = 0;
 	int np;
 
-	access_mask = ODP_READ_ALLOWED_BIT;
 	if (umem_odp->umem.writable && !(flags & RXE_PAGEFAULT_RDONLY))
-		access_mask |= ODP_WRITE_ALLOWED_BIT;
+		access_mask |= HMM_PFN_WRITE;
 
 	/*
 	 * ib_umem_odp_map_dma_and_lock() locks umem_mutex on success.
@@ -138,7 +137,7 @@ static inline bool rxe_check_pagefault(struct ib_umem_odp *umem_odp,
 	while (addr < iova + length) {
 		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
 
-		if (!(umem_odp->dma_list[idx] & perm)) {
+		if (!(umem_odp->map.pfn_list[idx] & perm)) {
 			need_fault = true;
 			break;
 		}
@@ -162,15 +161,14 @@ static int rxe_odp_map_range_and_lock(struct rxe_mr *mr, u64 iova, int length, u
 {
 	struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
 	bool need_fault;
-	u64 perm;
+	u64 perm = 0;
 	int err;
 
 	if (unlikely(length < 1))
 		return -EINVAL;
 
-	perm = ODP_READ_ALLOWED_BIT;
 	if (!(flags & RXE_PAGEFAULT_RDONLY))
-		perm |= ODP_WRITE_ALLOWED_BIT;
+		perm |= HMM_PFN_WRITE;
 
 	mutex_lock(&umem_odp->umem_mutex);
 
include/rdma/ib_umem_odp.h

Lines changed: 1 addition & 13 deletions

@@ -8,6 +8,7 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
+#include <linux/hmm.h>
 
 struct ib_umem_odp {
 	struct ib_umem umem;
@@ -67,19 +68,6 @@ static inline size_t ib_umem_odp_num_pages(struct ib_umem_odp *umem_odp)
		umem_odp->page_shift;
 }
 
-/*
- * The lower 2 bits of the DMA address signal the R/W permissions for
- * the entry. To upgrade the permissions, provide the appropriate
- * bitmask to the map_dma_pages function.
- *
- * Be aware that upgrading a mapped address might result in change of
- * the DMA address for the page.
- */
-#define ODP_READ_ALLOWED_BIT  (1<<0ULL)
-#define ODP_WRITE_ALLOWED_BIT (1<<1ULL)
-
-#define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT))
-
 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
 
 struct ib_umem_odp *
