Skip to content

Commit ad06307

Browse files
committed
Leon Romanovsky says: ==================== Use ODP MRs for kernel ULPs The following series extends the MR creation routines to allow creation of user MRs through kernel ULPs acting as a proxy. The immediate use case is to allow RDS to work over FS-DAX, which requires ODP (on-demand paging) MRs to be created; such MRs were not possible to create prior to this series. The first part of this patchset extends RDMA with a special verb, ib_reg_user_mr(). The common use case for this function is a userspace application that allocates memory for HCA access, while the responsibility for registering the memory with the HCA lies with a kernel ULP. This ULP acts as an agent for the userspace application. The second part provides advise MR functionality for ULPs. This is an integral part of ODP flows and is used to trigger page faults in advance, preparing memory before the working set is run. The third part is the actual user of those in-kernel APIs. ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 17e10a1 + b2dfc67 commit ad06307

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+561
-256
lines changed

drivers/infiniband/core/umem.c

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -181,15 +181,14 @@ EXPORT_SYMBOL(ib_umem_find_best_pgsz);
181181
/**
182182
* ib_umem_get - Pin and DMA map userspace memory.
183183
*
184-
* @udata: userspace context to pin memory for
184+
* @device: IB device to connect UMEM
185185
* @addr: userspace virtual address to start at
186186
* @size: length of region to pin
187187
* @access: IB_ACCESS_xxx flags for memory being pinned
188188
*/
189-
struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
189+
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
190190
size_t size, int access)
191191
{
192-
struct ib_ucontext *context;
193192
struct ib_umem *umem;
194193
struct page **page_list;
195194
unsigned long lock_limit;
@@ -201,14 +200,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
201200
struct scatterlist *sg;
202201
unsigned int gup_flags = FOLL_WRITE;
203202

204-
if (!udata)
205-
return ERR_PTR(-EIO);
206-
207-
context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
208-
->context;
209-
if (!context)
210-
return ERR_PTR(-EIO);
211-
212203
/*
213204
* If the combination of the addr and size requested for this memory
214205
* region causes an integer overflow, return error.
@@ -226,7 +217,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
226217
umem = kzalloc(sizeof(*umem), GFP_KERNEL);
227218
if (!umem)
228219
return ERR_PTR(-ENOMEM);
229-
umem->ibdev = context->device;
220+
umem->ibdev = device;
230221
umem->length = size;
231222
umem->address = addr;
232223
umem->writable = ib_access_writable(access);
@@ -281,18 +272,18 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
281272
npages -= ret;
282273

283274
sg = ib_umem_add_sg_table(sg, page_list, ret,
284-
dma_get_max_seg_size(context->device->dma_device),
275+
dma_get_max_seg_size(device->dma_device),
285276
&umem->sg_nents);
286277

287278
up_read(&mm->mmap_sem);
288279
}
289280

290281
sg_mark_end(sg);
291282

292-
umem->nmap = ib_dma_map_sg(context->device,
293-
umem->sg_head.sgl,
294-
umem->sg_nents,
295-
DMA_BIDIRECTIONAL);
283+
umem->nmap = ib_dma_map_sg(device,
284+
umem->sg_head.sgl,
285+
umem->sg_nents,
286+
DMA_BIDIRECTIONAL);
296287

297288
if (!umem->nmap) {
298289
ret = -ENOMEM;
@@ -303,7 +294,7 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
303294
goto out;
304295

305296
umem_release:
306-
__ib_umem_release(context->device, umem, 0);
297+
__ib_umem_release(device, umem, 0);
307298
vma:
308299
atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
309300
out:

drivers/infiniband/core/umem_odp.c

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -110,30 +110,24 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
110110
* They exist only to hold the per_mm reference to help the driver create
111111
* children umems.
112112
*
113-
* @udata: udata from the syscall being used to create the umem
113+
* @device: IB device to create UMEM
114114
* @access: ib_reg_mr access flags
115115
*/
116-
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_udata *udata,
116+
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
117117
int access)
118118
{
119-
struct ib_ucontext *context =
120-
container_of(udata, struct uverbs_attr_bundle, driver_udata)
121-
->context;
122119
struct ib_umem *umem;
123120
struct ib_umem_odp *umem_odp;
124121
int ret;
125122

126123
if (access & IB_ACCESS_HUGETLB)
127124
return ERR_PTR(-EINVAL);
128125

129-
if (!context)
130-
return ERR_PTR(-EIO);
131-
132126
umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
133127
if (!umem_odp)
134128
return ERR_PTR(-ENOMEM);
135129
umem = &umem_odp->umem;
136-
umem->ibdev = context->device;
130+
umem->ibdev = device;
137131
umem->writable = ib_access_writable(access);
138132
umem->owning_mm = current->mm;
139133
umem_odp->is_implicit_odp = 1;
@@ -201,7 +195,7 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
201195
/**
202196
* ib_umem_odp_get - Create a umem_odp for a userspace va
203197
*
204-
* @udata: userspace context to pin memory for
198+
* @device: IB device struct to get UMEM
205199
* @addr: userspace virtual address to start at
206200
* @size: length of region to pin
207201
* @access: IB_ACCESS_xxx flags for memory being pinned
@@ -210,31 +204,22 @@ EXPORT_SYMBOL(ib_umem_odp_alloc_child);
210204
* pinning, instead, stores the mm for future page fault handling in
211205
* conjunction with MMU notifiers.
212206
*/
213-
struct ib_umem_odp *ib_umem_odp_get(struct ib_udata *udata, unsigned long addr,
214-
size_t size, int access,
207+
struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
208+
unsigned long addr, size_t size, int access,
215209
const struct mmu_interval_notifier_ops *ops)
216210
{
217211
struct ib_umem_odp *umem_odp;
218-
struct ib_ucontext *context;
219212
struct mm_struct *mm;
220213
int ret;
221214

222-
if (!udata)
223-
return ERR_PTR(-EIO);
224-
225-
context = container_of(udata, struct uverbs_attr_bundle, driver_udata)
226-
->context;
227-
if (!context)
228-
return ERR_PTR(-EIO);
229-
230215
if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
231216
return ERR_PTR(-EINVAL);
232217

233218
umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
234219
if (!umem_odp)
235220
return ERR_PTR(-ENOMEM);
236221

237-
umem_odp->umem.ibdev = context->device;
222+
umem_odp->umem.ibdev = device;
238223
umem_odp->umem.length = size;
239224
umem_odp->umem.address = addr;
240225
umem_odp->umem.writable = ib_access_writable(access);

drivers/infiniband/core/verbs.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1990,6 +1990,47 @@ EXPORT_SYMBOL(ib_resize_cq);
19901990

19911991
/* Memory regions */
19921992

1993+
struct ib_mr *ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
1994+
u64 virt_addr, int access_flags)
1995+
{
1996+
struct ib_mr *mr;
1997+
1998+
if (access_flags & IB_ACCESS_ON_DEMAND) {
1999+
if (!(pd->device->attrs.device_cap_flags &
2000+
IB_DEVICE_ON_DEMAND_PAGING)) {
2001+
pr_debug("ODP support not available\n");
2002+
return ERR_PTR(-EINVAL);
2003+
}
2004+
}
2005+
2006+
mr = pd->device->ops.reg_user_mr(pd, start, length, virt_addr,
2007+
access_flags, NULL);
2008+
2009+
if (IS_ERR(mr))
2010+
return mr;
2011+
2012+
mr->device = pd->device;
2013+
mr->pd = pd;
2014+
mr->dm = NULL;
2015+
atomic_inc(&pd->usecnt);
2016+
mr->res.type = RDMA_RESTRACK_MR;
2017+
rdma_restrack_kadd(&mr->res);
2018+
2019+
return mr;
2020+
}
2021+
EXPORT_SYMBOL(ib_reg_user_mr);
2022+
2023+
int ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
2024+
u32 flags, struct ib_sge *sg_list, u32 num_sge)
2025+
{
2026+
if (!pd->device->ops.advise_mr)
2027+
return -EOPNOTSUPP;
2028+
2029+
return pd->device->ops.advise_mr(pd, advice, flags, sg_list, num_sge,
2030+
NULL);
2031+
}
2032+
EXPORT_SYMBOL(ib_advise_mr);
2033+
19932034
int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
19942035
{
19952036
struct ib_pd *pd = mr->pd;

drivers/infiniband/hw/bnxt_re/ib_verbs.c

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -837,7 +837,8 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
837837
bytes += (qplib_qp->sq.max_wqe * psn_sz);
838838
}
839839
bytes = PAGE_ALIGN(bytes);
840-
umem = ib_umem_get(udata, ureq.qpsva, bytes, IB_ACCESS_LOCAL_WRITE);
840+
umem = ib_umem_get(&rdev->ibdev, ureq.qpsva, bytes,
841+
IB_ACCESS_LOCAL_WRITE);
841842
if (IS_ERR(umem))
842843
return PTR_ERR(umem);
843844

@@ -850,7 +851,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
850851
if (!qp->qplib_qp.srq) {
851852
bytes = (qplib_qp->rq.max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
852853
bytes = PAGE_ALIGN(bytes);
853-
umem = ib_umem_get(udata, ureq.qprva, bytes,
854+
umem = ib_umem_get(&rdev->ibdev, ureq.qprva, bytes,
854855
IB_ACCESS_LOCAL_WRITE);
855856
if (IS_ERR(umem))
856857
goto rqfail;
@@ -1304,7 +1305,8 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
13041305

13051306
bytes = (qplib_srq->max_wqe * BNXT_QPLIB_MAX_RQE_ENTRY_SIZE);
13061307
bytes = PAGE_ALIGN(bytes);
1307-
umem = ib_umem_get(udata, ureq.srqva, bytes, IB_ACCESS_LOCAL_WRITE);
1308+
umem = ib_umem_get(&rdev->ibdev, ureq.srqva, bytes,
1309+
IB_ACCESS_LOCAL_WRITE);
13081310
if (IS_ERR(umem))
13091311
return PTR_ERR(umem);
13101312

@@ -2545,7 +2547,7 @@ int bnxt_re_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
25452547
goto fail;
25462548
}
25472549

2548-
cq->umem = ib_umem_get(udata, req.cq_va,
2550+
cq->umem = ib_umem_get(&rdev->ibdev, req.cq_va,
25492551
entries * sizeof(struct cq_base),
25502552
IB_ACCESS_LOCAL_WRITE);
25512553
if (IS_ERR(cq->umem)) {
@@ -3514,7 +3516,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
35143516
/* The fixed portion of the rkey is the same as the lkey */
35153517
mr->ib_mr.rkey = mr->qplib_mr.rkey;
35163518

3517-
umem = ib_umem_get(udata, start, length, mr_access_flags);
3519+
umem = ib_umem_get(&rdev->ibdev, start, length, mr_access_flags);
35183520
if (IS_ERR(umem)) {
35193521
dev_err(rdev_to_dev(rdev), "Failed to get umem");
35203522
rc = -EFAULT;

drivers/infiniband/hw/cxgb4/mem.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -543,7 +543,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
543543

544544
mhp->rhp = rhp;
545545

546-
mhp->umem = ib_umem_get(udata, start, length, acc);
546+
mhp->umem = ib_umem_get(pd->device, start, length, acc);
547547
if (IS_ERR(mhp->umem))
548548
goto err_free_skb;
549549

drivers/infiniband/hw/efa/efa_verbs.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1358,7 +1358,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
13581358
int inline_size;
13591359
int err;
13601360

1361-
if (udata->inlen &&
1361+
if (udata && udata->inlen &&
13621362
!ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
13631363
ibdev_dbg(&dev->ibdev,
13641364
"Incompatible ABI params, udata not cleared\n");
@@ -1384,7 +1384,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
13841384
goto err_out;
13851385
}
13861386

1387-
mr->umem = ib_umem_get(udata, start, length, access_flags);
1387+
mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
13881388
if (IS_ERR(mr->umem)) {
13891389
err = PTR_ERR(mr->umem);
13901390
ibdev_dbg(&dev->ibdev,

drivers/infiniband/hw/hns/hns_roce_cq.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,7 @@ static int get_cq_umem(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq,
163163
u32 npages;
164164
int ret;
165165

166-
*umem = ib_umem_get(udata, ucmd.buf_addr, buf->size,
166+
*umem = ib_umem_get(&hr_dev->ib_dev, ucmd.buf_addr, buf->size,
167167
IB_ACCESS_LOCAL_WRITE);
168168
if (IS_ERR(*umem))
169169
return PTR_ERR(*umem);

drivers/infiniband/hw/hns/hns_roce_db.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ int hns_roce_db_map_user(struct hns_roce_ucontext *context,
3131

3232
refcount_set(&page->refcount, 1);
3333
page->user_virt = page_addr;
34-
page->umem = ib_umem_get(udata, page_addr, PAGE_SIZE, 0);
34+
page->umem = ib_umem_get(context->ibucontext.device, page_addr,
35+
PAGE_SIZE, 0);
3536
if (IS_ERR(page->umem)) {
3637
ret = PTR_ERR(page->umem);
3738
kfree(page);

drivers/infiniband/hw/hns/hns_roce_mr.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1145,7 +1145,7 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
11451145
if (!mr)
11461146
return ERR_PTR(-ENOMEM);
11471147

1148-
mr->umem = ib_umem_get(udata, start, length, access_flags);
1148+
mr->umem = ib_umem_get(pd->device, start, length, access_flags);
11491149
if (IS_ERR(mr->umem)) {
11501150
ret = PTR_ERR(mr->umem);
11511151
goto err_free;
@@ -1230,7 +1230,7 @@ static int rereg_mr_trans(struct ib_mr *ibmr, int flags,
12301230
}
12311231
ib_umem_release(mr->umem);
12321232

1233-
mr->umem = ib_umem_get(udata, start, length, mr_access_flags);
1233+
mr->umem = ib_umem_get(ibmr->device, start, length, mr_access_flags);
12341234
if (IS_ERR(mr->umem)) {
12351235
ret = PTR_ERR(mr->umem);
12361236
mr->umem = NULL;

drivers/infiniband/hw/hns/hns_roce_qp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -744,7 +744,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
744744
goto err_alloc_rq_inline_buf;
745745
}
746746

747-
hr_qp->umem = ib_umem_get(udata, ucmd.buf_addr,
747+
hr_qp->umem = ib_umem_get(ib_pd->device, ucmd.buf_addr,
748748
hr_qp->buff_size, 0);
749749
if (IS_ERR(hr_qp->umem)) {
750750
dev_err(dev, "ib_umem_get error for create qp\n");

drivers/infiniband/hw/hns/hns_roce_srq.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,8 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
186186
if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
187187
return -EFAULT;
188188

189-
srq->umem = ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0);
189+
srq->umem =
190+
ib_umem_get(srq->ibsrq.device, ucmd.buf_addr, srq_buf_size, 0);
190191
if (IS_ERR(srq->umem))
191192
return PTR_ERR(srq->umem);
192193

@@ -205,7 +206,7 @@ static int create_user_srq(struct hns_roce_srq *srq, struct ib_udata *udata,
205206
goto err_user_srq_mtt;
206207

207208
/* config index queue BA */
208-
srq->idx_que.umem = ib_umem_get(udata, ucmd.que_addr,
209+
srq->idx_que.umem = ib_umem_get(srq->ibsrq.device, ucmd.que_addr,
209210
srq->idx_que.buf_size, 0);
210211
if (IS_ERR(srq->idx_que.umem)) {
211212
dev_err(hr_dev->dev, "ib_umem_get error for index queue\n");

drivers/infiniband/hw/i40iw/i40iw_verbs.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1756,12 +1756,15 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
17561756
int ret;
17571757
int pg_shift;
17581758

1759+
if (!udata)
1760+
return ERR_PTR(-EOPNOTSUPP);
1761+
17591762
if (iwdev->closing)
17601763
return ERR_PTR(-ENODEV);
17611764

17621765
if (length > I40IW_MAX_MR_SIZE)
17631766
return ERR_PTR(-EINVAL);
1764-
region = ib_umem_get(udata, start, length, acc);
1767+
region = ib_umem_get(pd->device, start, length, acc);
17651768
if (IS_ERR(region))
17661769
return (struct ib_mr *)region;
17671770

drivers/infiniband/hw/mlx4/cq.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_udata *udata,
144144
int shift;
145145
int n;
146146

147-
*umem = ib_umem_get(udata, buf_addr, cqe * cqe_size,
147+
*umem = ib_umem_get(&dev->ib_dev, buf_addr, cqe * cqe_size,
148148
IB_ACCESS_LOCAL_WRITE);
149149
if (IS_ERR(*umem))
150150
return PTR_ERR(*umem);

drivers/infiniband/hw/mlx4/doorbell.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,8 @@ int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
6464

6565
page->user_virt = (virt & PAGE_MASK);
6666
page->refcnt = 0;
67-
page->umem = ib_umem_get(udata, virt & PAGE_MASK, PAGE_SIZE, 0);
67+
page->umem = ib_umem_get(context->ibucontext.device, virt & PAGE_MASK,
68+
PAGE_SIZE, 0);
6869
if (IS_ERR(page->umem)) {
6970
err = PTR_ERR(page->umem);
7071
kfree(page);

0 commit comments

Comments
 (0)