
Commit 67e303e

VSR Burru authored and davem330 committed
liquidio: improve UDP TX performance
Improve UDP TX performance by:

* reducing the ring size from 2K to 512
* replacing the numerous streaming DMA allocations for info buffers and
  gather lists with one large consistent DMA allocation per ring

BQL is not effective here. We reduced the ring size because dma_map_single
periodically adds heavy overhead. With iommu=on, dma_map_single in the PF
Tx data path was taking a long time (~700 usec) for every ~250 packets.
Debugging the intel_iommu code showed that the PF driver was using too many
static IO virtual address mapping entries (for gather list entries and info
buffers): about 100K entries for two PFs, each using 8 rings. Also, finding
an empty entry (in the rbtree of the device domain's iova mapping in the
kernel) during the Tx path periodically becomes a bottleneck; the loop to
find an empty entry runs through over 40K iterations, which is too costly
and was the major overhead. Overhead is low when this loop quits quickly.
The allocation pattern the driver moves to is sketched below.

Netperf benchmark numbers before and after the patch:

PF UDP TX
+--------+--------+------------+------------+---------+
|        |        |   Before   |   After    |         |
| Number |        |   Patch    |   Patch    |         |
|   of   | Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |   (Gbps)   |   (Gbps)   | Change  |
+--------+--------+------------+------------+---------+
|        |   360  |    0.52    |    0.93    |  +78.9  |
|   1    |  1024  |    1.62    |    2.84    |  +75.3  |
|        |  1518  |    2.44    |    4.21    |  +72.5  |
+--------+--------+------------+------------+---------+
|        |   360  |    0.45    |    1.59    | +253.3  |
|   4    |  1024  |    1.34    |    5.48    | +308.9  |
|        |  1518  |    2.27    |    8.31    | +266.1  |
+--------+--------+------------+------------+---------+
|        |   360  |    0.40    |    1.61    | +302.5  |
|   8    |  1024  |    1.64    |    4.24    | +158.5  |
|        |  1518  |    2.87    |    6.52    | +127.2  |
+--------+--------+------------+------------+---------+

VF UDP TX
+--------+--------+------------+------------+---------+
|        |        |   Before   |   After    |         |
| Number |        |   Patch    |   Patch    |         |
|   of   | Packet | Throughput | Throughput | Percent |
| Flows  |  Size  |   (Gbps)   |   (Gbps)   | Change  |
+--------+--------+------------+------------+---------+
|        |   360  |    1.28    |    1.49    |  +16.4  |
|   1    |  1024  |    4.44    |    4.39    |   -1.1  |
|        |  1518  |    6.08    |    6.51    |   +7.1  |
+--------+--------+------------+------------+---------+
|        |   360  |    2.35    |    2.35    |    0.0  |
|   4    |  1024  |    6.41    |    8.07    |  +25.9  |
|        |  1518  |    9.56    |    9.54    |   -0.2  |
+--------+--------+------------+------------+---------+
|        |   360  |    3.41    |    3.65    |   +7.0  |
|   8    |  1024  |    9.35    |    9.34    |   -0.1  |
|        |  1518  |    9.56    |    9.57    |   +0.1  |
+--------+--------+------------+------------+---------+

Signed-off-by: VSR Burru <[email protected]>
Signed-off-by: Felix Manlunas <[email protected]>
Signed-off-by: Derek Chickles <[email protected]>
Signed-off-by: Raghu Vatsavayi <[email protected]>
Signed-off-by: David S. Miller <[email protected]>
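For orientation, here is a minimal sketch of the allocation pattern this patch switches to, written against the generic DMA API rather than the driver's lio_dma_alloc()/lio_dma_free() wrappers; the names (ring_glists, GLIST_ENTRY_SIZE, ring_glists_*) are illustrative only and are not taken from the driver. One consistent DMA block is allocated per TX ring and each gather-list slot is a fixed offset into it, so the hot path no longer needs a dma_map_single() per packet for gather lists:

/*
 * Illustrative sketch only, not the driver's actual code. liquidio stores
 * the per-ring bases in lio->glists_virt_base[] / lio->glists_dma_base[]
 * and derives the per-entry size from OCTNIC_MAX_SG.
 */
#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/types.h>

#define GLIST_ENTRY_SIZE	256	/* assumed size for the sketch */

struct ring_glists {
	void		*virt_base;	/* CPU address of the coherent block */
	dma_addr_t	dma_base;	/* device (bus) address of the block */
};

static int ring_glists_alloc(struct device *dev, struct ring_glists *r,
			     unsigned int ring_size)
{
	/* one consistent allocation for the whole ring */
	r->virt_base = dma_alloc_coherent(dev, GLIST_ENTRY_SIZE * ring_size,
					  &r->dma_base, GFP_KERNEL);
	return r->virt_base ? 0 : -ENOMEM;
}

static void ring_glists_free(struct device *dev, struct ring_glists *r,
			     unsigned int ring_size)
{
	dma_free_coherent(dev, GLIST_ENTRY_SIZE * ring_size,
			  r->virt_base, r->dma_base);
}

/* Entry j is just an offset into the block, so the TX path can reuse the
 * precomputed DMA address instead of mapping the gather list per packet. */
static void ring_glists_entry(struct ring_glists *r, unsigned int j,
			      void **virt, dma_addr_t *dma)
{
	*virt = (u8 *)r->virt_base + j * GLIST_ENTRY_SIZE;
	*dma = r->dma_base + j * GLIST_ENTRY_SIZE;
}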
1 parent 5be083c · commit 67e303e

File tree: 7 files changed, +144 -182 lines

drivers/net/ethernet/cavium/liquidio/lio_main.c

Lines changed: 55 additions & 55 deletions
@@ -152,7 +152,7 @@ struct octnic_gather {
 	 */
 	struct octeon_sg_entry *sg;
 
-	u64 sg_dma_ptr;
+	dma_addr_t sg_dma_ptr;
 };
 
 struct handshake {
@@ -734,30 +734,36 @@ static void delete_glists(struct lio *lio)
 	struct octnic_gather *g;
 	int i;
 
+	kfree(lio->glist_lock);
+	lio->glist_lock = NULL;
+
 	if (!lio->glist)
 		return;
 
 	for (i = 0; i < lio->linfo.num_txpciq; i++) {
 		do {
 			g = (struct octnic_gather *)
 				list_delete_head(&lio->glist[i]);
-			if (g) {
-				if (g->sg) {
-					dma_unmap_single(&lio->oct_dev->
-							 pci_dev->dev,
-							 g->sg_dma_ptr,
-							 g->sg_size,
-							 DMA_TO_DEVICE);
-					kfree((void *)((unsigned long)g->sg -
-						       g->adjust));
-				}
+			if (g)
 				kfree(g);
-			}
 		} while (g);
+
+		if (lio->glists_virt_base && lio->glists_virt_base[i]) {
+			lio_dma_free(lio->oct_dev,
+				     lio->glist_entry_size * lio->tx_qsize,
+				     lio->glists_virt_base[i],
+				     lio->glists_dma_base[i]);
+		}
 	}
 
-	kfree((void *)lio->glist);
-	kfree((void *)lio->glist_lock);
+	kfree(lio->glists_virt_base);
+	lio->glists_virt_base = NULL;
+
+	kfree(lio->glists_dma_base);
+	lio->glists_dma_base = NULL;
+
+	kfree(lio->glist);
+	lio->glist = NULL;
 }
 
 /**
@@ -772,13 +778,30 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
 	lio->glist_lock = kcalloc(num_iqs, sizeof(*lio->glist_lock),
 				  GFP_KERNEL);
 	if (!lio->glist_lock)
-		return 1;
+		return -ENOMEM;
 
 	lio->glist = kcalloc(num_iqs, sizeof(*lio->glist),
 			     GFP_KERNEL);
 	if (!lio->glist) {
-		kfree((void *)lio->glist_lock);
-		return 1;
+		kfree(lio->glist_lock);
+		lio->glist_lock = NULL;
+		return -ENOMEM;
+	}
+
+	lio->glist_entry_size =
+		ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);
+
+	/* allocate memory to store virtual and dma base address of
+	 * per glist consistent memory
+	 */
+	lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
+					GFP_KERNEL);
+	lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
+				       GFP_KERNEL);
+
+	if (!lio->glists_virt_base || !lio->glists_dma_base) {
+		delete_glists(lio);
+		return -ENOMEM;
 	}
 
 	for (i = 0; i < num_iqs; i++) {
@@ -788,6 +811,16 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
 
 		INIT_LIST_HEAD(&lio->glist[i]);
 
+		lio->glists_virt_base[i] =
+			lio_dma_alloc(oct,
+				      lio->glist_entry_size * lio->tx_qsize,
+				      &lio->glists_dma_base[i]);
+
+		if (!lio->glists_virt_base[i]) {
+			delete_glists(lio);
+			return -ENOMEM;
+		}
+
 		for (j = 0; j < lio->tx_qsize; j++) {
 			g = kzalloc_node(sizeof(*g), GFP_KERNEL,
 					 numa_node);
@@ -796,43 +829,18 @@ static int setup_glists(struct octeon_device *oct, struct lio *lio, int num_iqs)
 			if (!g)
 				break;
 
-			g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
-				      OCT_SG_ENTRY_SIZE);
+			g->sg = lio->glists_virt_base[i] +
+				(j * lio->glist_entry_size);
 
-			g->sg = kmalloc_node(g->sg_size + 8,
-					     GFP_KERNEL, numa_node);
-			if (!g->sg)
-				g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
-			if (!g->sg) {
-				kfree(g);
-				break;
-			}
-
-			/* The gather component should be aligned on 64-bit
-			 * boundary
-			 */
-			if (((unsigned long)g->sg) & 7) {
-				g->adjust = 8 - (((unsigned long)g->sg) & 7);
-				g->sg = (struct octeon_sg_entry *)
-					((unsigned long)g->sg + g->adjust);
-			}
-			g->sg_dma_ptr = dma_map_single(&oct->pci_dev->dev,
-						       g->sg, g->sg_size,
-						       DMA_TO_DEVICE);
-			if (dma_mapping_error(&oct->pci_dev->dev,
-					      g->sg_dma_ptr)) {
-				kfree((void *)((unsigned long)g->sg -
-					       g->adjust));
-				kfree(g);
-				break;
-			}
+			g->sg_dma_ptr = lio->glists_dma_base[i] +
+					(j * lio->glist_entry_size);
 
 			list_add_tail(&g->list, &lio->glist[i]);
 		}
 
 		if (j != lio->tx_qsize) {
 			delete_glists(lio);
-			return 1;
+			return -ENOMEM;
 		}
 	}
 
@@ -1885,9 +1893,6 @@ static void free_netsgbuf(void *buf)
 		i++;
 	}
 
-	dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev,
-				g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE);
-
 	iq = skb_iq(lio, skb);
 	spin_lock(&lio->glist_lock[iq]);
 	list_add_tail(&g->list, &lio->glist[iq]);
@@ -1933,9 +1938,6 @@ static void free_netsgbuf_with_resp(void *buf)
 		i++;
 	}
 
-	dma_sync_single_for_cpu(&lio->oct_dev->pci_dev->dev,
-				g->sg_dma_ptr, g->sg_size, DMA_TO_DEVICE);
-
 	iq = skb_iq(lio, skb);
 
 	spin_lock(&lio->glist_lock[iq]);
@@ -3273,8 +3275,6 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 		i++;
 	}
 
-	dma_sync_single_for_device(&oct->pci_dev->dev, g->sg_dma_ptr,
-				   g->sg_size, DMA_TO_DEVICE);
 	dptr = g->sg_dma_ptr;
 
 	if (OCTEON_CN23XX_PF(oct))
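The dma_sync_single_for_cpu()/dma_sync_single_for_device() calls around the gather lists can be dropped in the hunks above because the lists now live in consistent (coherent) DMA memory. A generic sketch of that streaming-vs-coherent contrast, not liquidio code (dev, buf and len are placeholder names):

#include <linux/dma-mapping.h>
#include <linux/gfp.h>

/* Streaming mapping: each dma_map_single() costs an IOVA allocation (the
 * overhead described in the commit message) and the buffer needs explicit
 * ownership hand-offs while it stays mapped. */
static void streaming_example(struct device *dev, void *buf, size_t len)
{
	dma_addr_t dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, dma))
		return;
	/* ... device reads buf ... */
	dma_sync_single_for_cpu(dev, dma, len, DMA_TO_DEVICE);
	/* ... CPU rewrites buf ... */
	dma_sync_single_for_device(dev, dma, len, DMA_TO_DEVICE);
	/* ... device reads buf again ... */
	dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);
}

/* Coherent mapping: allocated once; CPU stores and device reads stay
 * consistent under the DMA API, so no per-packet map/sync/unmap is needed. */
static void coherent_example(struct device *dev, size_t len)
{
	dma_addr_t dma;
	void *buf = dma_alloc_coherent(dev, len, &dma, GFP_KERNEL);

	if (!buf)
		return;
	/* ... CPU fills buf, device reads from dma, repeatedly ... */
	dma_free_coherent(dev, len, buf, dma);
}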

drivers/net/ethernet/cavium/liquidio/lio_vf_main.c

Lines changed: 55 additions & 49 deletions
@@ -108,6 +108,8 @@ struct octnic_gather {
 	 * received from the IP layer.
 	 */
 	struct octeon_sg_entry *sg;
+
+	dma_addr_t sg_dma_ptr;
 };
 
 struct octeon_device_priv {
@@ -490,24 +492,36 @@ static void delete_glists(struct lio *lio)
 	struct octnic_gather *g;
 	int i;
 
+	kfree(lio->glist_lock);
+	lio->glist_lock = NULL;
+
 	if (!lio->glist)
 		return;
 
 	for (i = 0; i < lio->linfo.num_txpciq; i++) {
 		do {
 			g = (struct octnic_gather *)
 				list_delete_head(&lio->glist[i]);
-			if (g) {
-				if (g->sg)
-					kfree((void *)((unsigned long)g->sg -
-						       g->adjust));
+			if (g)
 				kfree(g);
-			}
 		} while (g);
+
+		if (lio->glists_virt_base && lio->glists_virt_base[i]) {
+			lio_dma_free(lio->oct_dev,
+				     lio->glist_entry_size * lio->tx_qsize,
+				     lio->glists_virt_base[i],
+				     lio->glists_dma_base[i]);
+		}
 	}
 
+	kfree(lio->glists_virt_base);
+	lio->glists_virt_base = NULL;
+
+	kfree(lio->glists_dma_base);
+	lio->glists_dma_base = NULL;
+
 	kfree(lio->glist);
-	kfree(lio->glist_lock);
+	lio->glist = NULL;
 }
 
 /**
@@ -522,48 +536,64 @@ static int setup_glists(struct lio *lio, int num_iqs)
 	lio->glist_lock =
 		kzalloc(sizeof(*lio->glist_lock) * num_iqs, GFP_KERNEL);
 	if (!lio->glist_lock)
-		return 1;
+		return -ENOMEM;
 
 	lio->glist =
 		kzalloc(sizeof(*lio->glist) * num_iqs, GFP_KERNEL);
 	if (!lio->glist) {
 		kfree(lio->glist_lock);
-		return 1;
+		lio->glist_lock = NULL;
+		return -ENOMEM;
+	}
+
+	lio->glist_entry_size =
+		ROUNDUP8((ROUNDUP4(OCTNIC_MAX_SG) >> 2) * OCT_SG_ENTRY_SIZE);
+
+	/* allocate memory to store virtual and dma base address of
+	 * per glist consistent memory
+	 */
+	lio->glists_virt_base = kcalloc(num_iqs, sizeof(*lio->glists_virt_base),
+					GFP_KERNEL);
+	lio->glists_dma_base = kcalloc(num_iqs, sizeof(*lio->glists_dma_base),
+				       GFP_KERNEL);
+
+	if (!lio->glists_virt_base || !lio->glists_dma_base) {
+		delete_glists(lio);
+		return -ENOMEM;
 	}
 
 	for (i = 0; i < num_iqs; i++) {
 		spin_lock_init(&lio->glist_lock[i]);
 
 		INIT_LIST_HEAD(&lio->glist[i]);
 
+		lio->glists_virt_base[i] =
+			lio_dma_alloc(lio->oct_dev,
+				      lio->glist_entry_size * lio->tx_qsize,
+				      &lio->glists_dma_base[i]);
+
+		if (!lio->glists_virt_base[i]) {
+			delete_glists(lio);
+			return -ENOMEM;
+		}
+
 		for (j = 0; j < lio->tx_qsize; j++) {
 			g = kzalloc(sizeof(*g), GFP_KERNEL);
 			if (!g)
 				break;
 
-			g->sg_size = ((ROUNDUP4(OCTNIC_MAX_SG) >> 2) *
-				      OCT_SG_ENTRY_SIZE);
+			g->sg = lio->glists_virt_base[i] +
+				(j * lio->glist_entry_size);
 
-			g->sg = kmalloc(g->sg_size + 8, GFP_KERNEL);
-			if (!g->sg) {
-				kfree(g);
-				break;
-			}
+			g->sg_dma_ptr = lio->glists_dma_base[i] +
+					(j * lio->glist_entry_size);
 
-			/* The gather component should be aligned on 64-bit
-			 * boundary
-			 */
-			if (((unsigned long)g->sg) & 7) {
-				g->adjust = 8 - (((unsigned long)g->sg) & 7);
-				g->sg = (struct octeon_sg_entry *)
-					((unsigned long)g->sg + g->adjust);
-			}
 			list_add_tail(&g->list, &lio->glist[i]);
 		}
 
 		if (j != lio->tx_qsize) {
 			delete_glists(lio);
-			return 1;
+			return -ENOMEM;
 		}
 	}
 
@@ -1324,10 +1354,6 @@ static void free_netsgbuf(void *buf)
 		i++;
 	}
 
-	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
-			 finfo->dptr, g->sg_size,
-			 DMA_TO_DEVICE);
-
 	iq = skb_iq(lio, skb);
 
 	spin_lock(&lio->glist_lock[iq]);
@@ -1374,10 +1400,6 @@ static void free_netsgbuf_with_resp(void *buf)
 		i++;
 	}
 
-	dma_unmap_single(&lio->oct_dev->pci_dev->dev,
-			 finfo->dptr, g->sg_size,
-			 DMA_TO_DEVICE);
-
 	iq = skb_iq(lio, skb);
 
 	spin_lock(&lio->glist_lock[iq]);
@@ -2382,23 +2404,7 @@ static int liquidio_xmit(struct sk_buff *skb, struct net_device *netdev)
 		i++;
 	}
 
-	dptr = dma_map_single(&oct->pci_dev->dev,
-			      g->sg, g->sg_size,
-			      DMA_TO_DEVICE);
-	if (dma_mapping_error(&oct->pci_dev->dev, dptr)) {
-		dev_err(&oct->pci_dev->dev, "%s DMA mapping error 4\n",
-			__func__);
-		dma_unmap_single(&oct->pci_dev->dev, g->sg[0].ptr[0],
-				 skb->len - skb->data_len,
-				 DMA_TO_DEVICE);
-		for (j = 1; j <= frags; j++) {
-			frag = &skb_shinfo(skb)->frags[j - 1];
-			dma_unmap_page(&oct->pci_dev->dev,
-				       g->sg[j >> 2].ptr[j & 3],
-				       frag->size, DMA_TO_DEVICE);
-		}
-		return NETDEV_TX_BUSY;
-	}
+	dptr = g->sg_dma_ptr;
 
 	ndata.cmd.cmd3.dptr = dptr;
 	finfo->dptr = dptr;

drivers/net/ethernet/cavium/liquidio/octeon_config.h

Lines changed: 3 additions & 3 deletions
@@ -71,17 +71,17 @@
 #define CN23XX_MAX_RINGS_PER_VF		8
 
 #define CN23XX_MAX_INPUT_QUEUES		CN23XX_MAX_RINGS_PER_PF
-#define CN23XX_MAX_IQ_DESCRIPTORS	2048
+#define CN23XX_MAX_IQ_DESCRIPTORS	512
 #define CN23XX_DB_MIN			1
 #define CN23XX_DB_MAX			8
 #define CN23XX_DB_TIMEOUT		1
 
 #define CN23XX_MAX_OUTPUT_QUEUES	CN23XX_MAX_RINGS_PER_PF
-#define CN23XX_MAX_OQ_DESCRIPTORS	2048
+#define CN23XX_MAX_OQ_DESCRIPTORS	512
 #define CN23XX_OQ_BUF_SIZE		1536
 #define CN23XX_OQ_PKTSPER_INTR		128
 /*#define CAVIUM_ONLY_CN23XX_RX_PERF*/
-#define CN23XX_OQ_REFIL_THRESHOLD	128
+#define CN23XX_OQ_REFIL_THRESHOLD	16
 
 #define CN23XX_OQ_INTR_PKT		64
 #define CN23XX_OQ_INTR_TIME		100