@@ -97,8 +97,7 @@ struct tid_pageset {
 static void unlock_exp_tids(struct hfi1_ctxtdata *, struct exp_tid_set *,
			     struct rb_root *);
-static u32 find_phys_blocks(struct page **, unsigned,
-			    struct tid_pageset *) __maybe_unused;
+static u32 find_phys_blocks(struct page **, unsigned, struct tid_pageset *);
 static int set_rcvarray_entry(struct file *, unsigned long, u32,
			       struct tid_group *, struct page **, unsigned);
 static inline int mmu_addr_cmp(struct mmu_rb_node *, unsigned long,
@@ -119,7 +118,7 @@ static inline void mmu_notifier_range_start(struct mmu_notifier *,
					     unsigned long, unsigned long);
 static int program_rcvarray(struct file *, unsigned long, struct tid_group *,
			     struct tid_pageset *, unsigned, u16, struct page **,
-			    u32 *, unsigned *, unsigned *) __maybe_unused;
+			    u32 *, unsigned *, unsigned *);
 static int unprogram_rcvarray(struct file *, u32, struct tid_group **);
 static void clear_tid_node(struct hfi1_filedata *, u16, struct mmu_rb_node *);
@@ -339,9 +338,265 @@ static inline void rcv_array_wc_fill(struct hfi1_devdata *dd, u32 index)
	writeq(0, dd->rcvarray_wc + (index * 8));
 }
 
+/*
+ * RcvArray entry allocation for Expected Receives is done by the
+ * following algorithm:
+ *
+ * The context keeps 3 lists of groups of RcvArray entries:
+ *   1. List of empty groups - tid_group_list
+ *      This list is created during user context creation and
+ *      contains elements which describe sets (of 8) of empty
+ *      RcvArray entries.
+ *   2. List of partially used groups - tid_used_list
+ *      This list contains sets of RcvArray entries which are
+ *      not completely used up. Another mapping request could
+ *      use some or all of the remaining entries.
+ *   3. List of full groups - tid_full_list
+ *      This is the list where sets that are completely used
+ *      up go.
+ *
+ * An attempt to optimize the usage of RcvArray entries is
+ * made by finding all sets of physically contiguous pages in a
+ * user's buffer.
+ * These physically contiguous sets are further split into
+ * sizes supported by the receive engine of the HFI. The
+ * resulting sets of pages are stored in struct tid_pageset,
+ * which describes the sets as:
+ *    * .count - number of pages in this set
+ *    * .idx - starting index into struct page ** array
+ *             of this set
+ *
+ * From this point on, the algorithm deals with the page sets
+ * described above. The number of pagesets is divided by the
+ * RcvArray group size to produce the number of full groups
+ * needed.
+ *
+ * Groups from the 3 lists are manipulated using the following
+ * rules:
+ *   1. For each set of 8 pagesets, a complete group from
+ *      tid_group_list is taken, programmed, and moved to
+ *      the tid_full_list list.
+ *   2. For all remaining pagesets:
+ *      2.1 If the tid_used_list is empty and the tid_group_list
+ *          is empty, stop processing pagesets and return only
+ *          what has been programmed up to this point.
+ *      2.2 If the tid_used_list is empty and the tid_group_list
+ *          is not empty, move a group from tid_group_list to
+ *          tid_used_list.
+ *      2.3 For each group in tid_used_list, program as much as
+ *          can fit into the group. If the group becomes fully
+ *          used, move it to tid_full_list.
+ */
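+/*
+ * Illustrative walk-through (hypothetical numbers, assuming the group
+ * size of 8 described above): a buffer that splits into 19 pagesets
+ * yields 19 / 8 = 2 complete groups, which are taken from
+ * tid_group_list, programmed, and moved to tid_full_list (rule 1).
+ * The remaining 19 % 8 = 3 pagesets are then packed into groups from
+ * tid_used_list (rules 2.1-2.3), and any group that fills up in the
+ * process is moved to tid_full_list.
+ */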
 int hfi1_user_exp_rcv_setup(struct file *fp, struct hfi1_tid_info *tinfo)
 {
-	return -EINVAL;
+	int ret = 0, need_group = 0, pinned;
+	struct hfi1_filedata *fd = fp->private_data;
+	struct hfi1_ctxtdata *uctxt = fd->uctxt;
+	struct hfi1_devdata *dd = uctxt->dd;
+	unsigned npages, ngroups, pageidx = 0, pageset_count, npagesets,
+		tididx = 0, mapped, mapped_pages = 0;
+	unsigned long vaddr = tinfo->vaddr;
+	struct page **pages = NULL;
+	u32 *tidlist = NULL;
+	struct tid_pageset *pagesets = NULL;
+
+	/* Get the number of pages the user buffer spans */
+	npages = num_user_pages(vaddr, tinfo->length);
+	if (!npages)
+		return -EINVAL;
+
+	if (npages > uctxt->expected_count) {
+		dd_dev_err(dd, "Expected buffer too big\n");
+		return -EINVAL;
+	}
+
+	/* Verify that access is OK for the user buffer */
+	if (!access_ok(VERIFY_WRITE, (void __user *)vaddr,
+		       npages * PAGE_SIZE)) {
+		dd_dev_err(dd, "Fail vaddr %p, %u pages, !access_ok\n",
+			   (void *)vaddr, npages);
+		return -EFAULT;
+	}
+
+	pagesets = kcalloc(uctxt->expected_count, sizeof(*pagesets),
+			   GFP_KERNEL);
+	if (!pagesets)
+		return -ENOMEM;
+
+	/* Allocate the array of struct page pointers needed for pinning */
+	pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
+	if (!pages) {
+		ret = -ENOMEM;
+		goto bail;
+	}
+
+	/*
+	 * Pin all the pages of the user buffer. If we can't pin all the
+	 * pages, accept the amount pinned so far and program only that.
+	 * User space knows how to deal with partially programmed buffers.
+	 */
+	pinned = hfi1_acquire_user_pages(vaddr, npages, true, pages);
+	if (pinned <= 0) {
+		ret = pinned;
+		goto bail;
+	}
+
+	/* Find sets of physically contiguous pages */
+	npagesets = find_phys_blocks(pages, pinned, pagesets);
+
+	/*
+	 * We don't need to access this under a lock since tid_used is per
+	 * process and the same process cannot be in hfi1_user_exp_rcv_clear()
+	 * and hfi1_user_exp_rcv_setup() at the same time.
+	 */
+	spin_lock(&fd->tid_lock);
+	if (fd->tid_used + npagesets > fd->tid_limit)
+		pageset_count = fd->tid_limit - fd->tid_used;
+	else
+		pageset_count = npagesets;
+	spin_unlock(&fd->tid_lock);
+
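+	/*
+	 * Hypothetical example of the clamp above: with tid_limit = 1024 and
+	 * tid_used = 1020, a request that produced npagesets = 10 would be
+	 * clamped to pageset_count = 4; the 6 leftover pagesets are simply
+	 * not programmed this time around.
+	 */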
+	if (!pageset_count)
+		goto bail;
+
+	ngroups = pageset_count / dd->rcv_entries.group_size;
+	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
+	if (!tidlist) {
+		ret = -ENOMEM;
+		goto nomem;
+	}
+
+	tididx = 0;
+
+	/*
+	 * From this point on, we are going to be using shared (between master
+	 * and subcontexts) context resources. We need to take the lock.
+	 */
+	mutex_lock(&uctxt->exp_lock);
+	/*
+	 * The first step is to program the RcvArray entries which are complete
+	 * groups.
+	 */
+	while (ngroups && uctxt->tid_group_list.count) {
+		struct tid_group *grp =
+			tid_group_pop(&uctxt->tid_group_list);
+
+		ret = program_rcvarray(fp, vaddr, grp, pagesets,
+				       pageidx, dd->rcv_entries.group_size,
+				       pages, tidlist, &tididx, &mapped);
+		/*
+		 * If there was a failure to program the RcvArray
+		 * entries for the entire group, reset the grp fields
+		 * and add the grp back to the free group list.
+		 */
+		if (ret <= 0) {
+			tid_group_add_tail(grp, &uctxt->tid_group_list);
+			hfi1_cdbg(TID,
+				  "Failed to program RcvArray group %d", ret);
+			goto unlock;
+		}
+
+		tid_group_add_tail(grp, &uctxt->tid_full_list);
+		ngroups--;
+		pageidx += ret;
+		mapped_pages += mapped;
+	}
+
+	while (pageidx < pageset_count) {
+		struct tid_group *grp, *ptr;
+		/*
+		 * If we don't have any partially used tid groups, check
+		 * if we have empty groups. If so, take one from there and
+		 * put in the partially used list.
+		 */
+		if (!uctxt->tid_used_list.count || need_group) {
+			if (!uctxt->tid_group_list.count)
+				goto unlock;
+
+			grp = tid_group_pop(&uctxt->tid_group_list);
+			tid_group_add_tail(grp, &uctxt->tid_used_list);
+			need_group = 0;
+		}
+		/*
+		 * There is an optimization opportunity here - instead of
+		 * fitting as many page sets as we can, check for a group
+		 * later on in the list that could fit all of them.
+		 */
+		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
+					 list) {
+			unsigned use = min_t(unsigned, pageset_count - pageidx,
+					     grp->size - grp->used);
+
+			ret = program_rcvarray(fp, vaddr, grp, pagesets,
+					       pageidx, use, pages, tidlist,
+					       &tididx, &mapped);
+			if (ret < 0) {
+				hfi1_cdbg(TID,
+					  "Failed to program RcvArray entries %d",
+					  ret);
+				ret = -EFAULT;
+				goto unlock;
+			} else if (ret > 0) {
+				if (grp->used == grp->size)
+					tid_group_move(grp,
+						       &uctxt->tid_used_list,
+						       &uctxt->tid_full_list);
+				pageidx += ret;
+				mapped_pages += mapped;
+				need_group = 0;
+				/* Check if we are done so we break out early */
+				if (pageidx >= pageset_count)
+					break;
+			} else if (WARN_ON(ret == 0)) {
+				/*
+				 * If ret is 0, we did not program any entries
+				 * into this group, which can only happen if
+				 * we've screwed up the accounting somewhere.
+				 * Warn and try to continue.
+				 */
+				need_group = 1;
+			}
+		}
+	}
+unlock:
+	mutex_unlock(&uctxt->exp_lock);
+nomem:
+	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
+		  mapped_pages, ret);
+	if (tididx) {
+		spin_lock(&fd->tid_lock);
+		fd->tid_used += tididx;
+		spin_unlock(&fd->tid_lock);
+		tinfo->tidcnt = tididx;
+		tinfo->length = mapped_pages * PAGE_SIZE;
+
+		if (copy_to_user((void __user *)(unsigned long)tinfo->tidlist,
+				 tidlist, sizeof(tidlist[0]) * tididx)) {
+			/*
+			 * On failure to copy to the user level, we need to undo
+			 * everything done so far so we don't leak resources.
+			 */
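+			/*
+			 * Presumably, stashing the kernel-side tidlist in
+			 * tinfo->tidlist below lets hfi1_user_exp_rcv_clear()
+			 * find the TIDs that were just programmed; the field
+			 * is zeroed afterwards so the temporary kernel address
+			 * is not handed back to user space.
+			 */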
+			tinfo->tidlist = (unsigned long)&tidlist;
+			hfi1_user_exp_rcv_clear(fp, tinfo);
+			tinfo->tidlist = 0;
+			ret = -EFAULT;
+			goto bail;
+		}
+	}
+
+	/*
+	 * If not everything was mapped (due to insufficient RcvArray entries,
+	 * for example), unpin all unmapped pages so we can pin them next time.
+	 */
+	if (mapped_pages != pinned)
+		hfi1_release_user_pages(&pages[mapped_pages],
+					pinned - mapped_pages,
+					false);
+bail:
+	kfree(pagesets);
+	kfree(pages);
+	kfree(tidlist);
+	return ret > 0 ? 0 : ret;
 }
 
 int hfi1_user_exp_rcv_clear(struct file *fp, struct hfi1_tid_info *tinfo)