Commit 1efe8c0

RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage
Reuse newly added DMA API to cache IOVA and only link/unlink pages in fast path for UMEM ODP flow.

Tested-by: Jens Axboe <[email protected]>
Reviewed-by: Jason Gunthorpe <[email protected]>
Signed-off-by: Leon Romanovsky <[email protected]>
1 parent eedd5b1 commit 1efe8c0
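
At a glance, the patch replaces the per-MR pfn_list/dma_list arrays with a single struct hmm_dma_map and moves the actual DMA mapping out of the fault path into XLT population. The sketch below condenses that lifecycle from the hunks that follow; the odp_sketch_* helper names are illustrative only (not part of the patch), error handling and locking are trimmed, and only the hmm_dma_* calls and fields actually used in this commit are assumed.

/*
 * Illustrative sketch only -- these helpers are not part of the patch.
 * They condense how the new hmm-dma API is used across the hunks below.
 */
#include <linux/hmm-dma.h>
#include <linux/pci-p2pdma.h>
#include <rdma/ib_umem_odp.h>

/* Setup: one allocation replaces the separate pfn_list/dma_list arrays. */
static int odp_sketch_init(struct ib_umem_odp *umem_odp, struct ib_device *dev,
                           unsigned long start, unsigned long end)
{
        return hmm_dma_map_alloc(dev->dma_device, &umem_odp->map,
                                 (end - start) >> PAGE_SHIFT,
                                 1 << umem_odp->page_shift);
}

/* Fast path: link one faulted pfn and get its DMA address while filling the XLT. */
static dma_addr_t odp_sketch_link_one(struct ib_umem_odp *odp,
                                      struct ib_device *dev, size_t idx,
                                      struct pci_p2pdma_map_state *p2pdma_state)
{
        return hmm_dma_map_pfn(dev->dma_device, &odp->map, idx, p2pdma_state);
}

/* Invalidation: unlink the page at idx; callers skip cleanup when nothing was mapped. */
static bool odp_sketch_unlink_one(struct ib_umem_odp *odp,
                                  struct ib_device *dev, size_t idx)
{
        return hmm_dma_unmap_pfn(dev->dma_device, &odp->map, idx);
}

/* Teardown: release the cached IOVA/page-linkage state with the MR. */
static void odp_sketch_fini(struct ib_umem_odp *umem_odp, struct ib_device *dev)
{
        hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}

In the patch itself these steps correspond to ib_init_umem_odp(), populate_mtt(), ib_umem_odp_unmap_dma_pages(), and ib_umem_odp_release() (plus the error path of ib_init_umem_odp()), respectively.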

File tree

6 files changed, +73 -115 lines changed

drivers/infiniband/core/umem_odp.c

Lines changed: 22 additions & 82 deletions
@@ -41,6 +41,7 @@
 #include <linux/hugetlb.h>
 #include <linux/interval_tree.h>
 #include <linux/hmm.h>
+#include <linux/hmm-dma.h>
 #include <linux/pagemap.h>
 
 #include <rdma/ib_umem_odp.h>
@@ -50,6 +51,7 @@
 static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                                    const struct mmu_interval_notifier_ops *ops)
 {
+        struct ib_device *dev = umem_odp->umem.ibdev;
         int ret;
 
         umem_odp->umem.is_odp = 1;
@@ -59,7 +61,6 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                 size_t page_size = 1UL << umem_odp->page_shift;
                 unsigned long start;
                 unsigned long end;
-                size_t ndmas, npfns;
 
                 start = ALIGN_DOWN(umem_odp->umem.address, page_size);
                 if (check_add_overflow(umem_odp->umem.address,
@@ -70,36 +71,23 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
                 if (unlikely(end < page_size))
                         return -EOVERFLOW;
 
-                ndmas = (end - start) >> umem_odp->page_shift;
-                if (!ndmas)
-                        return -EINVAL;
-
-                npfns = (end - start) >> PAGE_SHIFT;
-                umem_odp->pfn_list = kvcalloc(
-                        npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL);
-                if (!umem_odp->pfn_list)
-                        return -ENOMEM;
-
-                umem_odp->dma_list = kvcalloc(
-                        ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL);
-                if (!umem_odp->dma_list) {
-                        ret = -ENOMEM;
-                        goto out_pfn_list;
-                }
+                ret = hmm_dma_map_alloc(dev->dma_device, &umem_odp->map,
+                                        (end - start) >> PAGE_SHIFT,
+                                        1 << umem_odp->page_shift);
+                if (ret)
+                        return ret;
 
                 ret = mmu_interval_notifier_insert(&umem_odp->notifier,
                                                    umem_odp->umem.owning_mm,
                                                    start, end - start, ops);
                 if (ret)
-                        goto out_dma_list;
+                        goto out_free_map;
         }
 
         return 0;
 
-out_dma_list:
-        kvfree(umem_odp->dma_list);
-out_pfn_list:
-        kvfree(umem_odp->pfn_list);
+out_free_map:
+        hmm_dma_map_free(dev->dma_device, &umem_odp->map);
         return ret;
 }
 
@@ -262,6 +250,8 @@ EXPORT_SYMBOL(ib_umem_odp_get);
 
 void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
 {
+        struct ib_device *dev = umem_odp->umem.ibdev;
+
         /*
          * Ensure that no more pages are mapped in the umem.
          *
@@ -274,48 +264,17 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
                                             ib_umem_end(umem_odp));
                 mutex_unlock(&umem_odp->umem_mutex);
                 mmu_interval_notifier_remove(&umem_odp->notifier);
-                kvfree(umem_odp->dma_list);
-                kvfree(umem_odp->pfn_list);
+                hmm_dma_map_free(dev->dma_device, &umem_odp->map);
         }
         put_pid(umem_odp->tgid);
         kfree(umem_odp);
 }
 EXPORT_SYMBOL(ib_umem_odp_release);
 
-/*
- * Map for DMA and insert a single page into the on-demand paging page tables.
- *
- * @umem: the umem to insert the page to.
- * @dma_index: index in the umem to add the dma to.
- * @page: the page struct to map and add.
- * @access_mask: access permissions needed for this page.
- *
- * The function returns -EFAULT if the DMA mapping operation fails.
- *
- */
-static int ib_umem_odp_map_dma_single_page(
-                struct ib_umem_odp *umem_odp,
-                unsigned int dma_index,
-                struct page *page)
-{
-        struct ib_device *dev = umem_odp->umem.ibdev;
-        dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index];
-
-        *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift,
-                                    DMA_BIDIRECTIONAL);
-        if (ib_dma_mapping_error(dev, *dma_addr)) {
-                *dma_addr = 0;
-                return -EFAULT;
-        }
-        umem_odp->npages++;
-        return 0;
-}
-
 /**
  * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
  *
  * Maps the range passed in the argument to DMA addresses.
- * The DMA addresses of the mapped pages is updated in umem_odp->dma_list.
  * Upon success the ODP MR will be locked to let caller complete its device
  * page table update.
  *
@@ -372,7 +331,7 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
                 range.default_flags |= HMM_PFN_REQ_WRITE;
         }
 
-        range.hmm_pfns = &(umem_odp->pfn_list[pfn_start_idx]);
+        range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
         timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
 
 retry:
@@ -423,16 +382,6 @@ int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
                                   __func__, hmm_order, page_shift);
                         break;
                 }
-
-                ret = ib_umem_odp_map_dma_single_page(
-                                umem_odp, dma_index,
-                                hmm_pfn_to_page(range.hmm_pfns[pfn_index]));
-                if (ret < 0) {
-                        ibdev_dbg(umem_odp->umem.ibdev,
-                                  "ib_umem_odp_map_dma_single_page failed with error %d\n", ret);
-                        break;
-                }
-                range.hmm_pfns[pfn_index] |= HMM_PFN_DMA_MAPPED;
         }
         /* upon success lock should stay on hold for the callee */
         if (!ret)
@@ -452,32 +401,23 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);
 void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                  u64 bound)
 {
-        dma_addr_t dma;
-        int idx;
-        u64 addr;
         struct ib_device *dev = umem_odp->umem.ibdev;
+        u64 addr;
 
         lockdep_assert_held(&umem_odp->umem_mutex);
 
         virt = max_t(u64, virt, ib_umem_start(umem_odp));
         bound = min_t(u64, bound, ib_umem_end(umem_odp));
         for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
-                unsigned long pfn_idx = (addr - ib_umem_start(umem_odp)) >>
-                                        PAGE_SHIFT;
-                struct page *page =
-                        hmm_pfn_to_page(umem_odp->pfn_list[pfn_idx]);
-
-                idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
-                dma = umem_odp->dma_list[idx];
+                u64 offset = addr - ib_umem_start(umem_odp);
+                size_t idx = offset >> umem_odp->page_shift;
+                unsigned long pfn = umem_odp->map.pfn_list[idx];
 
-                if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_VALID))
-                        goto clear;
-                if (!(umem_odp->pfn_list[pfn_idx] & HMM_PFN_DMA_MAPPED))
+                if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
                         goto clear;
 
-                ib_dma_unmap_page(dev, dma, BIT(umem_odp->page_shift),
-                                  DMA_BIDIRECTIONAL);
-                if (umem_odp->pfn_list[pfn_idx] & HMM_PFN_WRITE) {
+                if (pfn & HMM_PFN_WRITE) {
+                        struct page *page = hmm_pfn_to_page(pfn);
                         struct page *head_page = compound_head(page);
                         /*
                          * set_page_dirty prefers being called with
@@ -492,7 +432,7 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                 }
                 umem_odp->npages--;
 clear:
-                umem_odp->pfn_list[pfn_idx] &= ~HMM_PFN_FLAGS;
+                umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
         }
 }
 EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);

drivers/infiniband/hw/mlx5/mlx5_ib.h

Lines changed: 7 additions & 4 deletions
@@ -1474,8 +1474,8 @@ void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *ibdev);
 int __init mlx5_ib_odp_init(void);
 void mlx5_ib_odp_cleanup(void);
 int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev);
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                           struct mlx5_ib_mr *mr, int flags);
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                          struct mlx5_ib_mr *mr, int flags);
 
 int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
                                enum ib_uverbs_advise_mr_advice advice,
@@ -1496,8 +1496,11 @@ static inline int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
 {
         return 0;
 }
-static inline void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                                         struct mlx5_ib_mr *mr, int flags) {}
+static inline int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                                        struct mlx5_ib_mr *mr, int flags)
+{
+        return -EOPNOTSUPP;
+}
 
 static inline int
 mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,

drivers/infiniband/hw/mlx5/odp.c

Lines changed: 26 additions & 14 deletions
@@ -35,6 +35,8 @@
 #include <linux/dma-buf.h>
 #include <linux/dma-resv.h>
 #include <linux/hmm.h>
+#include <linux/hmm-dma.h>
+#include <linux/pci-p2pdma.h>
 
 #include "mlx5_ib.h"
 #include "cmd.h"
@@ -159,40 +161,50 @@ static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
         }
 }
 
-static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
-                         struct mlx5_ib_mr *mr, int flags)
+static int populate_mtt(__be64 *pas, size_t start, size_t nentries,
+                        struct mlx5_ib_mr *mr, int flags)
 {
         struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
         bool downgrade = flags & MLX5_IB_UPD_XLT_DOWNGRADE;
-        unsigned long pfn;
-        dma_addr_t pa;
+        struct pci_p2pdma_map_state p2pdma_state = {};
+        struct ib_device *dev = odp->umem.ibdev;
         size_t i;
 
         if (flags & MLX5_IB_UPD_XLT_ZAP)
-                return;
+                return 0;
 
         for (i = 0; i < nentries; i++) {
-                pfn = odp->pfn_list[idx + i];
+                unsigned long pfn = odp->map.pfn_list[start + i];
+                dma_addr_t dma_addr;
+
+                pfn = odp->map.pfn_list[start + i];
                 if (!(pfn & HMM_PFN_VALID))
                         /* ODP initialization */
                         continue;
 
-                pa = odp->dma_list[idx + i];
-                pa |= MLX5_IB_MTT_READ;
+                dma_addr = hmm_dma_map_pfn(dev->dma_device, &odp->map,
+                                           start + i, &p2pdma_state);
+                if (ib_dma_mapping_error(dev, dma_addr))
+                        return -EFAULT;
+
+                dma_addr |= MLX5_IB_MTT_READ;
                 if ((pfn & HMM_PFN_WRITE) && !downgrade)
-                        pa |= MLX5_IB_MTT_WRITE;
+                        dma_addr |= MLX5_IB_MTT_WRITE;
 
-                pas[i] = cpu_to_be64(pa);
+                pas[i] = cpu_to_be64(dma_addr);
+                odp->npages++;
         }
+        return 0;
 }
 
-void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
-                           struct mlx5_ib_mr *mr, int flags)
+int mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
+                          struct mlx5_ib_mr *mr, int flags)
 {
         if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
                 populate_klm(xlt, idx, nentries, mr, flags);
+                return 0;
         } else {
-                populate_mtt(xlt, idx, nentries, mr, flags);
+                return populate_mtt(xlt, idx, nentries, mr, flags);
         }
 }
 
@@ -303,7 +315,7 @@ static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
                  * estimate the cost of another UMR vs. the cost of bigger
                  * UMR.
                  */
-                if (umem_odp->pfn_list[idx] & HMM_PFN_VALID) {
+                if (umem_odp->map.pfn_list[idx] & HMM_PFN_VALID) {
                         if (!in_block) {
                                 blk_start_idx = idx;
                                 in_block = 1;

drivers/infiniband/hw/mlx5/umr.c

Lines changed: 11 additions & 1 deletion
@@ -840,7 +840,17 @@ int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages,
                 size_to_map = npages * desc_size;
                 dma_sync_single_for_cpu(ddev, sg.addr, sg.length,
                                         DMA_TO_DEVICE);
-                mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+                /*
+                 * npages is the maximum number of pages to map, but we
+                 * can't guarantee that all pages are actually mapped.
+                 *
+                 * For example, if page is p2p of type which is not supported
+                 * for mapping, the number of pages mapped will be less than
+                 * requested.
+                 */
+                err = mlx5_odp_populate_xlt(xlt, idx, npages, mr, flags);
+                if (err)
+                        return err;
                 dma_sync_single_for_device(ddev, sg.addr, sg.length,
                                            DMA_TO_DEVICE);
                 sg.length = ALIGN(size_to_map, MLX5_UMR_FLEX_ALIGNMENT);

drivers/infiniband/sw/rxe/rxe_odp.c

Lines changed: 4 additions & 4 deletions
@@ -205,7 +205,7 @@ static int __rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
         while (length > 0) {
                 u8 *src, *dest;
 
-                page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
+                page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
                 user_va = kmap_local_page(page);
                 if (!user_va)
                         return -EFAULT;
@@ -289,7 +289,7 @@ static enum resp_states rxe_odp_do_atomic_op(struct rxe_mr *mr, u64 iova,
 
         idx = rxe_odp_iova_to_index(umem_odp, iova);
         page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
-        page = hmm_pfn_to_page(umem_odp->pfn_list[idx]);
+        page = hmm_pfn_to_page(umem_odp->map.pfn_list[idx]);
         if (!page)
                 return RESPST_ERR_RKEY_VIOLATION;
 
@@ -355,7 +355,7 @@ int rxe_odp_flush_pmem_iova(struct rxe_mr *mr, u64 iova,
                 index = rxe_odp_iova_to_index(umem_odp, iova);
                 page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
 
-                page = hmm_pfn_to_page(umem_odp->pfn_list[index]);
+                page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
                 if (!page) {
                         mutex_unlock(&umem_odp->umem_mutex);
                         return -EFAULT;
@@ -401,7 +401,7 @@ enum resp_states rxe_odp_do_atomic_write(struct rxe_mr *mr, u64 iova, u64 value)
 
         page_offset = rxe_odp_iova_to_page_offset(umem_odp, iova);
         index = rxe_odp_iova_to_index(umem_odp, iova);
-        page = hmm_pfn_to_page(umem_odp->pfn_list[index]);
+        page = hmm_pfn_to_page(umem_odp->map.pfn_list[index]);
         if (!page) {
                 mutex_unlock(&umem_odp->umem_mutex);
                 return RESPST_ERR_RKEY_VIOLATION;

include/rdma/ib_umem_odp.h

Lines changed: 3 additions & 10 deletions
@@ -8,24 +8,17 @@
 
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
-#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
 
 struct ib_umem_odp {
         struct ib_umem umem;
         struct mmu_interval_notifier notifier;
         struct pid *tgid;
 
-        /* An array of the pfns included in the on-demand paging umem. */
-        unsigned long *pfn_list;
+        struct hmm_dma_map map;
 
         /*
-         * An array with DMA addresses mapped for pfns in pfn_list.
-         * The lower two bits designate access permissions.
-         * See ODP_READ_ALLOWED_BIT and ODP_WRITE_ALLOWED_BIT.
-         */
-        dma_addr_t *dma_list;
-        /*
-         * The umem_mutex protects the page_list and dma_list fields of an ODP
+         * The umem_mutex protects the page_list field of an ODP
          * umem, allowing only a single thread to map/unmap pages. The mutex
          * also protects access to the mmu notifier counters.
          */