Skip to content

Commit 7e7a436

Browse files
Mitko Haralanov authored and Doug Ledford committed
staging/hfi1: Add TID entry program function body
The previous patch in the series added the free/invalidate function bodies. Now, it's time for the programming side. This large function takes the user's buffer, breaks it up into manageable chunks, allocates enough RcvArray groups and programs the chunks into the RcvArray entries in the hardware. With this function, the TID caching functionality is implemented. However, it is still unused. The switch will come in a later patch in the series, which will remove the old functionality and switch the driver over to TID caching. Signed-off-by: Mitko Haralanov <[email protected]> Reviewed-by: Ira Weiny <[email protected]> Signed-off-by: Doug Ledford <[email protected]>
1 parent 455d7f1 commit 7e7a436

File tree

1 file changed

+259
-4
lines changed

1 file changed

+259
-4
lines changed

drivers/staging/rdma/hfi1/user_exp_rcv.c

Lines changed: 259 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,7 @@ struct tid_pageset {
9797

9898
static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
9999
struct rb_root *);
100-
static u32 find_phys_blocks(struct page **, unsigned,
101-
struct tid_pageset *) __maybe_unused;
100+
static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
102101
static int set_rcvarray_entry(struct file *, unsigned long, u32,
103102
struct tid_group *, struct page **, unsigned);
104103
static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
@@ -119,7 +118,7 @@ static inline void mmu_notifier_range_start(struct mmu_notifier *,
119118
unsigned long, unsigned long);
120119
static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
121120
struct tid_pageset *, unsigned, u16, struct page **,
122-
u32 *, unsigned *, unsigned *) __maybe_unused;
121+
u32 *, unsigned *, unsigned *);
123122
static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
124123
static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *);
125124

@@ -339,9 +338,265 @@ static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
339338
writeq(0, dd->rcvarray_wc + (index * 8));
340339
}
341340

341+
/*
342+
* RcvArray entry allocation for Expected Receives is done by the
343+
* following algorithm:
344+
*
345+
* The context keeps 3 lists of groups of RcvArray entries:
346+
* 1. List of empty groups - tid_group_list
347+
* This list is created during user context creation and
348+
* contains elements which describe sets (of 8) of empty
349+
* RcvArray entries.
350+
* 2. List of partially used groups - tid_used_list
351+
* This list contains sets of RcvArray entries which are
352+
* not completely used up. Another mapping request could
353+
use some or all of the remaining entries.
354+
* 3. List of full groups - tid_full_list
355+
* This is the list where sets that are completely used
356+
* up go.
357+
*
358+
* An attempt to optimize the usage of RcvArray entries is
359+
* made by finding all sets of physically contiguous pages in a
360+
* user's buffer.
361+
* These physically contiguous sets are further split into
362+
* sizes supported by the receive engine of the HFI. The
363+
* resulting sets of pages are stored in struct tid_pageset,
364+
* which describes the sets as:
365+
* * .count - number of pages in this set
366+
* * .idx - starting index into struct page ** array
367+
* of this set
368+
*
369+
* From this point on, the algorithm deals with the page sets
370+
* described above. The number of pagesets is divided by the
371+
* RcvArray group size to produce the number of full groups
372+
* needed.
373+
*
374+
* Groups from the 3 lists are manipulated using the following
375+
* rules:
376+
* 1. For each set of 8 pagesets, a complete group from
377+
* tid_group_list is taken, programmed, and moved to
378+
* the tid_full_list list.
379+
* 2. For all remaining pagesets:
380+
* 2.1 If the tid_used_list is empty and the tid_group_list
381+
is empty, stop processing pagesets and return only
382+
* what has been programmed up to this point.
383+
* 2.2 If the tid_used_list is empty and the tid_group_list
384+
* is not empty, move a group from tid_group_list to
385+
* tid_used_list.
386+
2.3 For each group in tid_used_list, program as much as
387+
* can fit into the group. If the group becomes fully
388+
* used, move it to tid_full_list.
389+
*/
342390
/*
 * hfi1_user_exp_rcv_setup - pin a user buffer and program it into RcvArray
 * @fp: open file for the user context; fp->private_data is the hfi1_filedata
 * @tinfo: in: user virtual address and length of the buffer to map;
 *         out: tidcnt, length actually mapped, and the tidlist copied to
 *         the user address in tinfo->tidlist
 *
 * Pins the user's pages, splits them into physically contiguous pagesets
 * (find_phys_blocks()), and programs them into RcvArray entries taken
 * from the context's group lists (see the algorithm description above
 * this function). Partial success is allowed: whatever was programmed is
 * reported back via tinfo, and any pages that were pinned but not mapped
 * are released before returning.
 *
 * Return: 0 on (possibly partial) success, negative errno on failure.
 */
int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_filedata *fd = fp->private_data;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
		 tididx = 0, mapped, mapped_pages = 0;
	unsigned long vaddr = tinfo->vaddr;
	struct page **pages = NULL;
	u32 *tidlist = NULL;
	struct tid_pageset *pagesets = NULL;

	/* Get the number of pages the user buffer spans */
	npages = num_user_pages(vaddr, tinfo->length);
	if (!npages)
		return -EINVAL;

	/* More pages than RcvArray entries can ever hold for this context */
	if (npages > uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Verify that access is OK for the user buffer */
	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
		       npages * PAGE_SIZE)) {
		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
			   (void *)vaddr, npages);
		return -EFAULT;
	}

	/*
	 * Worst case is one pageset per RcvArray entry, so size the
	 * pageset array by the context's expected-entry count.
	 */
	pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
			   GFP_KERNEL);
	if (!pagesets)
		return -ENOMEM;

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto bail;
	}

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
	if (pinned <= 0) {
		/* Note: pinned == 0 falls through here and returns 0 */
		ret = pinned;
		goto bail;
	}

	/* Find sets of physically contiguous pages */
	npagesets = find_phys_blocks(pages, pinned, pagesets);

	/*
	 * We don't need to access this under a lock since tid_used is per
	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
	 * and hfi1_user_exp_rcv_setup() at the same time.
	 */
	spin_lock(&fd->tid_lock);
	/* Clamp to the per-fd TID quota; excess pagesets are simply dropped */
	if (fd->tid_used + npagesets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = npagesets;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count)
		goto bail;

	/* Number of complete RcvArray groups the pagesets can fill */
	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto nomem;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_lock);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		/* ret > 0 is the number of pagesets programmed */
		ret = program_rcvarray(fp, vaddr, grp, pagesets,
				       pageidx, dd->rcv_entries.group_size,
				       pages, tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		pageidx += ret;
		mapped_pages += mapped;
	}

	/* Remaining pagesets go into partially used groups */
	while (pageidx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - pageidx,
					     grp->size - grp->used);

			ret = program_rcvarray(fp, vaddr, grp, pagesets,
					       pageidx, use, pages, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				ret = -EFAULT;
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				pageidx += ret;
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (pageidx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_lock);
nomem:
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);
	/* Report back whatever was programmed, even on a partial failure */
	if (tididx) {
		spin_lock(&fd->tid_lock);
		fd->tid_used += tididx;
		spin_unlock(&fd->tid_lock);
		tinfo->tidcnt = tididx;
		tinfo->length = mapped_pages * PAGE_SIZE;

		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
				 tidlist, sizeof(tidlist[0]) * tididx)) {
			/*
			 * On failure to copy to the user level, we need to undo
			 * everything done so far so we don't leak resources.
			 * NOTE(review): tinfo->tidlist is pointed at the local
			 * kernel array (&tidlist) so the clear path can walk
			 * the just-programmed TIDs — confirm against
			 * hfi1_user_exp_rcv_clear()'s expectations.
			 */
			tinfo->tidlist = (unsigned long)&tidlist;
			hfi1_user_exp_rcv_clear(fp, tinfo);
			tinfo->tidlist = 0;
			ret = -EFAULT;
			goto bail;
		}
	}

	/*
	 * If not everything was mapped (due to insufficient RcvArray entries,
	 * for example), unpin all unmapped pages so we can pin them next time.
	 */
	if (mapped_pages != pinned)
		hfi1_release_user_pages(&pages[mapped_pages],
					pinned - mapped_pages,
					false);
bail:
	kfree(pagesets);
	kfree(pages);
	kfree(tidlist);
	/* Positive ret here is a pageset count from the loops, not an error */
	return ret > 0 ? 0 : ret;
}
346601

347602
int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)

0 commit comments

Comments
 (0)