Commit cd15b04
powerpc/powernv: Add iommu DMA bypass support for IODA2
This patch adds support for creating a direct iommu "bypass" window on IODA2 bridges (such as Power8), allowing iommu page translation to be bypassed entirely for 64-bit DMA capable devices and thus significantly improving DMA performance. Additionally, it adds a hook to struct iommu_table so that the IOMMU API / VFIO can disable the bypass when external ownership is requested, since in that case the device will be used by an environment such as userspace or a KVM guest which must not be allowed to bypass translations.

Signed-off-by: Benjamin Herrenschmidt <[email protected]>
1 parent ea961a8 commit cd15b04
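For context (not part of the commit): a minimal sketch of how a PCI driver would exercise the new path. The driver and probe function below are hypothetical; dma_set_mask() and DMA_BIT_MASK() are the real APIs involved. On an IODA2 PHB, a full 64-bit mask now selects the direct bypass window (dma_direct_ops plus the window offset), while a smaller mask keeps the device on the 32-bit TCE table.

#include <linux/pci.h>
#include <linux/dma-mapping.h>

/* Hypothetical probe routine; only the dma_set_mask() calls matter here. */
static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
    int rc;

    /* Request full 64-bit DMA: with this commit, pnv_pci_ioda_dma_set_mask()
     * switches the device to dma_direct_ops offset by tce_bypass_base. */
    rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));
    if (rc) {
        /* Otherwise fall back to 32-bit DMA translated through the TCE table */
        rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32));
        if (rc)
            return rc;
    }
    return 0;
}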

9 files changed: +137 additions, −4 deletions

arch/powerpc/include/asm/dma-mapping.h

Lines changed: 1 addition & 0 deletions
@@ -134,6 +134,7 @@ static inline int dma_supported(struct device *dev, u64 mask)
 }

 extern int dma_set_mask(struct device *dev, u64 dma_mask);
+extern int __dma_set_mask(struct device *dev, u64 dma_mask);

 #define dma_alloc_coherent(d,s,h,f)  dma_alloc_attrs(d,s,h,f,NULL)

arch/powerpc/include/asm/iommu.h

Lines changed: 1 addition & 0 deletions
@@ -77,6 +77,7 @@ struct iommu_table {
 #ifdef CONFIG_IOMMU_API
     struct iommu_group *it_group;
 #endif
+    void (*set_bypass)(struct iommu_table *tbl, bool enable);
 };

 /* Pure 2^n version of get_order */

arch/powerpc/kernel/dma.c

Lines changed: 7 additions & 3 deletions
@@ -191,19 +191,23 @@ EXPORT_SYMBOL(dma_direct_ops);

 #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16)

-int dma_set_mask(struct device *dev, u64 dma_mask)
+int __dma_set_mask(struct device *dev, u64 dma_mask)
 {
     struct dma_map_ops *dma_ops = get_dma_ops(dev);

-    if (ppc_md.dma_set_mask)
-        return ppc_md.dma_set_mask(dev, dma_mask);
     if ((dma_ops != NULL) && (dma_ops->set_dma_mask != NULL))
         return dma_ops->set_dma_mask(dev, dma_mask);
     if (!dev->dma_mask || !dma_supported(dev, dma_mask))
         return -EIO;
     *dev->dma_mask = dma_mask;
     return 0;
 }
+int dma_set_mask(struct device *dev, u64 dma_mask)
+{
+    if (ppc_md.dma_set_mask)
+        return ppc_md.dma_set_mask(dev, dma_mask);
+    return __dma_set_mask(dev, dma_mask);
+}
 EXPORT_SYMBOL(dma_set_mask);

 u64 dma_get_required_mask(struct device *dev)

arch/powerpc/kernel/iommu.c

Lines changed: 12 additions & 0 deletions
@@ -1088,6 +1088,14 @@ int iommu_take_ownership(struct iommu_table *tbl)
     memset(tbl->it_map, 0xff, sz);
     iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);

+    /*
+     * Disable iommu bypass, otherwise the user can DMA to all of
+     * our physical memory via the bypass window instead of just
+     * the pages that has been explicitly mapped into the iommu
+     */
+    if (tbl->set_bypass)
+        tbl->set_bypass(tbl, false);
+
     return 0;
 }
 EXPORT_SYMBOL_GPL(iommu_take_ownership);
@@ -1102,6 +1110,10 @@ void iommu_release_ownership(struct iommu_table *tbl)
     /* Restore bit#0 set by iommu_init_table() */
     if (tbl->it_offset == 0)
         set_bit(0, tbl->it_map);
+
+    /* The kernel owns the device now, we can restore the iommu bypass */
+    if (tbl->set_bypass)
+        tbl->set_bypass(tbl, true);
 }
 EXPORT_SYMBOL_GPL(iommu_release_ownership);
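An aside, not from the commit: the contract of the new hook in one small sketch. Whichever platform owns an iommu_table installs set_bypass, and the generic take/release-ownership paths above invoke it with false and true respectively. The helper names below are made up for illustration.

#include <linux/types.h>
#include <linux/printk.h>
#include <asm/iommu.h>

/* Hypothetical platform callback: map or unmap the direct window here. */
static void example_set_bypass(struct iommu_table *tbl, bool enable)
{
    pr_info("iommu bypass %s for table %p\n",
            enable ? "enabled" : "disabled", tbl);
}

/* Hypothetical setup helper run when the platform creates the table. */
static void example_wire_up_bypass(struct iommu_table *tbl)
{
    tbl->set_bypass = example_set_bypass;
}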

arch/powerpc/platforms/powernv/pci-ioda.c

Lines changed: 84 additions & 0 deletions
@@ -21,6 +21,7 @@
 #include <linux/irq.h>
 #include <linux/io.h>
 #include <linux/msi.h>
+#include <linux/memblock.h>

 #include <asm/sections.h>
 #include <asm/io.h>
@@ -460,9 +461,39 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
         return;

     pe = &phb->ioda.pe_array[pdn->pe_number];
+    WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
     set_iommu_table_base_and_group(&pdev->dev, &pe->tce32_table);
 }

+static int pnv_pci_ioda_dma_set_mask(struct pnv_phb *phb,
+                                     struct pci_dev *pdev, u64 dma_mask)
+{
+    struct pci_dn *pdn = pci_get_pdn(pdev);
+    struct pnv_ioda_pe *pe;
+    uint64_t top;
+    bool bypass = false;
+
+    if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
+        return -ENODEV;;
+
+    pe = &phb->ioda.pe_array[pdn->pe_number];
+    if (pe->tce_bypass_enabled) {
+        top = pe->tce_bypass_base + memblock_end_of_DRAM() - 1;
+        bypass = (dma_mask >= top);
+    }
+
+    if (bypass) {
+        dev_info(&pdev->dev, "Using 64-bit DMA iommu bypass\n");
+        set_dma_ops(&pdev->dev, &dma_direct_ops);
+        set_dma_offset(&pdev->dev, pe->tce_bypass_base);
+    } else {
+        dev_info(&pdev->dev, "Using 32-bit DMA via iommu\n");
+        set_dma_ops(&pdev->dev, &dma_iommu_ops);
+        set_iommu_table_base(&pdev->dev, &pe->tce32_table);
+    }
+    return 0;
+}
+
 static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
 {
     struct pci_dev *dev;
@@ -657,6 +688,56 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
         __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }

+static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
+{
+    struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
+                                          tce32_table);
+    uint16_t window_id = (pe->pe_number << 1 ) + 1;
+    int64_t rc;
+
+    pe_info(pe, "%sabling 64-bit DMA bypass\n", enable ? "En" : "Dis");
+    if (enable) {
+        phys_addr_t top = memblock_end_of_DRAM();
+
+        top = roundup_pow_of_two(top);
+        rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+                                             pe->pe_number,
+                                             window_id,
+                                             pe->tce_bypass_base,
+                                             top);
+    } else {
+        rc = opal_pci_map_pe_dma_window_real(pe->phb->opal_id,
+                                             pe->pe_number,
+                                             window_id,
+                                             pe->tce_bypass_base,
+                                             0);
+
+        /*
+         * We might want to reset the DMA ops of all devices on
+         * this PE. However in theory, that shouldn't be necessary
+         * as this is used for VFIO/KVM pass-through and the device
+         * hasn't yet been returned to its kernel driver
+         */
+    }
+    if (rc)
+        pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
+    else
+        pe->tce_bypass_enabled = enable;
+}
+
+static void pnv_pci_ioda2_setup_bypass_pe(struct pnv_phb *phb,
+                                          struct pnv_ioda_pe *pe)
+{
+    /* TVE #1 is selected by PCI address bit 59 */
+    pe->tce_bypass_base = 1ull << 59;
+
+    /* Install set_bypass callback for VFIO */
+    pe->tce32_table.set_bypass = pnv_pci_ioda2_set_bypass;
+
+    /* Enable bypass by default */
+    pnv_pci_ioda2_set_bypass(&pe->tce32_table, true);
+}
+
 static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
                                        struct pnv_ioda_pe *pe)
 {
@@ -727,6 +808,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
     else
         pnv_ioda_setup_bus_dma(pe, pe->pbus);

+    /* Also create a bypass window */
+    pnv_pci_ioda2_setup_bypass_pe(phb, pe);
     return;
 fail:
     if (pe->tce32_seg >= 0)
@@ -1286,6 +1369,7 @@ void __init pnv_pci_init_ioda_phb(struct device_node *np,

     /* Setup TCEs */
     phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
+    phb->dma_set_mask = pnv_pci_ioda_dma_set_mask;

     /* Setup shutdown function for kexec */
     phb->shutdown = pnv_pci_ioda_shutdown;
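A standalone sketch (plain C, not kernel code) of the eligibility test in pnv_pci_ioda_dma_set_mask(): bypass bus addresses are offset by the 2^59 window base, so only a device whose DMA mask covers the base plus all of RAM (in practice a full 64-bit mask) qualifies. The 512 GiB memory size below is an assumed example value standing in for memblock_end_of_DRAM().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* TVE#1 window starts at PCI address bit 59, as in the commit */
    uint64_t tce_bypass_base = 1ull << 59;
    /* Assumed amount of installed RAM; the kernel uses memblock_end_of_DRAM() */
    uint64_t end_of_dram = 512ull << 30;
    uint64_t top = tce_bypass_base + end_of_dram - 1;

    uint64_t masks[] = { (1ull << 32) - 1, (1ull << 48) - 1, ~0ull };

    for (unsigned i = 0; i < sizeof(masks) / sizeof(masks[0]); i++) {
        bool bypass = masks[i] >= top;
        printf("dma_mask=0x%016llx -> %s\n",
               (unsigned long long)masks[i],
               bypass ? "64-bit direct bypass" : "32-bit DMA via TCE table");
    }
    return 0;
}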

arch/powerpc/platforms/powernv/pci.c

Lines changed: 10 additions & 0 deletions
@@ -634,6 +634,16 @@ static void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
         pnv_pci_dma_fallback_setup(hose, pdev);
 }

+int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+    struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+    struct pnv_phb *phb = hose->private_data;
+
+    if (phb && phb->dma_set_mask)
+        return phb->dma_set_mask(phb, pdev, dma_mask);
+    return __dma_set_mask(&pdev->dev, dma_mask);
+}
+
 void pnv_pci_shutdown(void)
 {
     struct pci_controller *hose;

arch/powerpc/platforms/powernv/pci.h

Lines changed: 5 additions & 1 deletion
@@ -54,7 +54,9 @@ struct pnv_ioda_pe {
     struct iommu_table tce32_table;
     phys_addr_t tce_inval_reg_phys;

-    /* XXX TODO: Add support for additional 64-bit iommus */
+    /* 64-bit TCE bypass region */
+    bool tce_bypass_enabled;
+    uint64_t tce_bypass_base;

     /* MSIs. MVE index is identical for for 32 and 64 bit MSI
      * and -1 if not supported. (It's actually identical to the
@@ -113,6 +115,8 @@ struct pnv_phb {
                       unsigned int hwirq, unsigned int virq,
                       unsigned int is_64, struct msi_msg *msg);
     void (*dma_dev_setup)(struct pnv_phb *phb, struct pci_dev *pdev);
+    int (*dma_set_mask)(struct pnv_phb *phb, struct pci_dev *pdev,
+                        u64 dma_mask);
     void (*fixup_phb)(struct pci_controller *hose);
     u32 (*bdfn_to_pe)(struct pnv_phb *phb, struct pci_bus *bus, u32 devfn);
     void (*shutdown)(struct pnv_phb *phb);

arch/powerpc/platforms/powernv/powernv.h

Lines changed: 8 additions & 0 deletions
@@ -7,12 +7,20 @@ extern void pnv_smp_init(void);
 static inline void pnv_smp_init(void) { }
 #endif

+struct pci_dev;
+
 #ifdef CONFIG_PCI
 extern void pnv_pci_init(void);
 extern void pnv_pci_shutdown(void);
+extern int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask);
 #else
 static inline void pnv_pci_init(void) { }
 static inline void pnv_pci_shutdown(void) { }
+
+static inline int pnv_pci_dma_set_mask(struct pci_dev *pdev, u64 dma_mask)
+{
+    return -ENODEV;
+}
 #endif

 extern void pnv_lpc_init(void);

arch/powerpc/platforms/powernv/setup.c

Lines changed: 9 additions & 0 deletions
@@ -27,6 +27,7 @@
 #include <linux/interrupt.h>
 #include <linux/bug.h>
 #include <linux/cpuidle.h>
+#include <linux/pci.h>

 #include <asm/machdep.h>
 #include <asm/firmware.h>
@@ -141,6 +142,13 @@ static void pnv_progress(char *s, unsigned short hex)
 {
 }

+static int pnv_dma_set_mask(struct device *dev, u64 dma_mask)
+{
+    if (dev_is_pci(dev))
+        return pnv_pci_dma_set_mask(to_pci_dev(dev), dma_mask);
+    return __dma_set_mask(dev, dma_mask);
+}
+
 static void pnv_shutdown(void)
 {
     /* Let the PCI code clear up IODA tables */
@@ -238,6 +246,7 @@ define_machine(powernv) {
     .machine_shutdown   = pnv_shutdown,
     .power_save         = powernv_idle,
     .calibrate_decr     = generic_calibrate_decr,
+    .dma_set_mask       = pnv_dma_set_mask,
 #ifdef CONFIG_KEXEC
     .kexec_cpu_down     = pnv_kexec_cpu_down,
 #endif
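Putting the pieces together, the hunks above wire up the following dispatch chain (sketch; all names are from this commit):

/*
 * dma_set_mask(dev, mask)                           arch/powerpc/kernel/dma.c
 *   -> ppc_md.dma_set_mask == pnv_dma_set_mask      platforms/powernv/setup.c
 *      -> pnv_pci_dma_set_mask(pdev, mask)          platforms/powernv/pci.c
 *         -> phb->dma_set_mask == pnv_pci_ioda_dma_set_mask      pci-ioda.c
 *            -> mask covers bypass window: dma_direct_ops + tce_bypass_base
 *            -> otherwise:                 dma_iommu_ops + 32-bit TCE table
 *
 * Non-PCI devices, and PHBs that do not provide the hook, fall back to
 * __dma_set_mask(), which preserves the previous generic behaviour.
 */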
