Skip to content

Commit c5552fd

Browse files
amluto authored and axboe committed
nvme: Enable autonomous power state transitions
NVMe devices can advertise multiple power states. These states can be either "operational" (the device is fully functional but possibly slow) or "non-operational" (the device is asleep until woken up). Some devices can automatically enter a non-operational state when idle for a specified amount of time and then automatically wake back up when needed. The hardware configuration is a table. For each state, an entry in the table indicates the next deeper non-operational state, if any, to autonomously transition to and the idle time required before transitioning. This patch teaches the driver to program APST so that each successive non-operational state will be entered after an idle time equal to 50 times the total latency (entry plus exit) associated with that state. The maximum acceptable latency is controlled using dev_pm_qos (e.g. power/pm_qos_latency_tolerance_us in sysfs); non-operational states with total latency greater than this value will not be used. As a special case, setting the latency tolerance to 0 will disable APST entirely. On hardware without APST support, the sysfs file will not be exposed. The latency tolerance for newly-probed devices is set by the module parameter nvme_core.default_ps_max_latency_us. In theory, the device can expose a "default" APST table, but this doesn't seem to function correctly on my device (Samsung 950), nor does it seem particularly useful. There is also an optional mechanism by which a configuration can be "saved" so it will be automatically loaded on reset. This can be configured from userspace, but it doesn't seem useful to support in the driver. On my laptop, enabling APST seems to save nearly 1W. The hardware tables can be decoded in userspace with nvme-cli. 'nvme id-ctrl /dev/nvmeN' will show the power state table and 'nvme get-feature -f 0x0c -H /dev/nvme0' will show the current APST configuration. This feature is quirked off on a known-buggy Samsung device. 
Signed-off-by: Andy Lutomirski <[email protected]> Reviewed-by: Christoph Hellwig <[email protected]> Signed-off-by: Sagi Grimberg <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent bd4da3a commit c5552fd

File tree

3 files changed

+171
-0
lines changed

3 files changed

+171
-0
lines changed

drivers/nvme/host/core.c

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <linux/ptrace.h>
2727
#include <linux/nvme_ioctl.h>
2828
#include <linux/t10-pi.h>
29+
#include <linux/pm_qos.h>
2930
#include <scsi/sg.h>
3031
#include <asm/unaligned.h>
3132

@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
5657
static int nvme_char_major;
5758
module_param(nvme_char_major, int, 0);
5859

60+
/*
 * Default power-saving latency tolerance, in microseconds, requested for
 * each controller when nvme_init_ctrl() sets up its PM QoS latency
 * tolerance.  The value is clamped to S32_MAX at that point.  The parameter
 * is writable (mode 0644).
 */
static unsigned long default_ps_max_latency_us = 25000;
61+
module_param(default_ps_max_latency_us, ulong, 0644);
62+
MODULE_PARM_DESC(default_ps_max_latency_us,
63+
"max power saving latency for new devices; use PM QOS to change per device");
64+
5965
static LIST_HEAD(nvme_ctrl_list);
6066
static DEFINE_SPINLOCK(dev_list_lock);
6167

@@ -1252,6 +1258,122 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
12521258
blk_queue_write_cache(q, vwc, vwc);
12531259
}
12541260

1261+
static void nvme_configure_apst(struct nvme_ctrl *ctrl)
1262+
{
1263+
/*
1264+
* APST (Autonomous Power State Transition) lets us program a
1265+
* table of power state transitions that the controller will
1266+
* perform automatically. We configure it with a simple
1267+
* heuristic: we are willing to spend at most 2% of the time
1268+
* transitioning between power states. Therefore, when running
1269+
* in any given state, we will enter the next lower-power
1270+
* non-operational state after waiting 100 * (enlat + exlat)
1271+
* microseconds, as long as that state's total latency is under
1272+
* the requested maximum latency.
1273+
*
1274+
* We will not autonomously enter any non-operational state for
1275+
* which the total latency exceeds ps_max_latency_us. Users
1276+
* can set ps_max_latency_us to zero to turn off APST.
1277+
*/
1278+
1279+
unsigned apste;
1280+
struct nvme_feat_auto_pst *table;
1281+
int ret;
1282+
1283+
/*
1284+
* If APST isn't supported or if we haven't been initialized yet,
1285+
* then don't do anything.
1286+
*/
1287+
if (!ctrl->apsta)
1288+
return;
1289+
1290+
if (ctrl->npss > 31) {
1291+
dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
1292+
return;
1293+
}
1294+
1295+
table = kzalloc(sizeof(*table), GFP_KERNEL);
1296+
if (!table)
1297+
return;
1298+
1299+
if (ctrl->ps_max_latency_us == 0) {
1300+
/* Turn off APST. */
1301+
apste = 0;
1302+
} else {
1303+
__le64 target = cpu_to_le64(0);
1304+
int state;
1305+
1306+
/*
1307+
* Walk through all states from lowest- to highest-power.
1308+
* According to the spec, lower-numbered states use more
1309+
* power. NPSS, despite the name, is the index of the
1310+
* lowest-power state, not the number of states.
1311+
*/
1312+
for (state = (int)ctrl->npss; state >= 0; state--) {
1313+
u64 total_latency_us, transition_ms;
1314+
1315+
if (target)
1316+
table->entries[state] = target;
1317+
1318+
/*
1319+
* Is this state a useful non-operational state for
1320+
* higher-power states to autonomously transition to?
1321+
*/
1322+
if (!(ctrl->psd[state].flags &
1323+
NVME_PS_FLAGS_NON_OP_STATE))
1324+
continue;
1325+
1326+
total_latency_us =
1327+
(u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
1328+
+ le32_to_cpu(ctrl->psd[state].exit_lat);
1329+
if (total_latency_us > ctrl->ps_max_latency_us)
1330+
continue;
1331+
1332+
/*
1333+
* This state is good. Use it as the APST idle
1334+
* target for higher power states.
1335+
*/
1336+
transition_ms = total_latency_us + 19;
1337+
do_div(transition_ms, 20);
1338+
if (transition_ms > (1 << 24) - 1)
1339+
transition_ms = (1 << 24) - 1;
1340+
1341+
target = cpu_to_le64((state << 3) |
1342+
(transition_ms << 8));
1343+
}
1344+
1345+
apste = 1;
1346+
}
1347+
1348+
ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
1349+
table, sizeof(*table), NULL);
1350+
if (ret)
1351+
dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
1352+
1353+
kfree(table);
1354+
}
1355+
1356+
/*
 * Latency-tolerance callback installed in the controller device's
 * power.set_latency_tolerance hook.
 *
 * @dev: device whose driver data is the struct nvme_ctrl
 * @val: requested tolerance; PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT and
 *       PM_QOS_LATENCY_ANY both mean "no limit"
 *
 * Records the new limit in ctrl->ps_max_latency_us and reprograms APST,
 * but only when the limit actually changed.
 */
static void nvme_set_latency_tolerance(struct device *dev, s32 val)
{
	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
	u64 max_latency_us;

	if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT ||
	    val == PM_QOS_LATENCY_ANY)
		max_latency_us = U64_MAX;
	else
		max_latency_us = val;

	/* Nothing to reprogram if the tolerance is unchanged. */
	if (ctrl->ps_max_latency_us == max_latency_us)
		return;

	ctrl->ps_max_latency_us = max_latency_us;
	nvme_configure_apst(ctrl);
}
1376+
12551377
struct nvme_core_quirk_entry {
12561378
/*
12571379
* NVMe model and firmware strings are padded with spaces. For
@@ -1265,6 +1387,16 @@ struct nvme_core_quirk_entry {
12651387
};
12661388

12671389
/*
 * Table of controllers that need NVME_QUIRK_* flags applied.  Each entry
 * pairs identifying fields (here .vid and the firmware revision .fr) with
 * the quirk bits to set.
 * NOTE(review): the matching code is not visible in this hunk; .vid is
 * presumably the PCI vendor ID -- confirm against the lookup helper.
 */
static const struct nvme_core_quirk_entry core_quirks[] = {
1390+
/*
1391+
* Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
1392+
* the controller to go out to lunch. It dies when the watchdog
1393+
* timer reads CSTS and gets 0xffffffff.
1394+
*/
1395+
{
1396+
.vid = 0x144d,
1397+
.fr = "BXW75D0Q",
1398+
.quirks = NVME_QUIRK_NO_APST,
1399+
},
12681400
};
12691401

12701402
/* match is null-terminated but idstr is space-padded. */
@@ -1307,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
13071439
u64 cap;
13081440
int ret, page_shift;
13091441
u32 max_hw_sectors;
1442+
u8 prev_apsta;
13101443

13111444
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
13121445
if (ret) {
@@ -1368,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
13681501
ctrl->sgls = le32_to_cpu(id->sgls);
13691502
ctrl->kas = le16_to_cpu(id->kas);
13701503

1504+
ctrl->npss = id->npss;
1505+
prev_apsta = ctrl->apsta;
1506+
ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
1507+
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
1508+
13711509
if (ctrl->ops->is_fabrics) {
13721510
ctrl->icdoff = le16_to_cpu(id->icdoff);
13731511
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
@@ -1392,7 +1530,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
13921530

13931531
kfree(id);
13941532

1533+
if (ctrl->apsta && !prev_apsta)
1534+
dev_pm_qos_expose_latency_tolerance(ctrl->device);
1535+
else if (!ctrl->apsta && prev_apsta)
1536+
dev_pm_qos_hide_latency_tolerance(ctrl->device);
1537+
1538+
nvme_configure_apst(ctrl);
1539+
13951540
ctrl->identified = true;
1541+
13961542
return ret;
13971543
}
13981544
EXPORT_SYMBOL_GPL(nvme_init_identify);
@@ -2154,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
21542300
list_add_tail(&ctrl->node, &nvme_ctrl_list);
21552301
spin_unlock(&dev_list_lock);
21562302

2303+
/*
2304+
* Initialize latency tolerance controls. The sysfs files won't
2305+
* be visible to userspace unless the device actually supports APST.
2306+
*/
2307+
ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
2308+
dev_pm_qos_update_user_latency_tolerance(ctrl->device,
2309+
min(default_ps_max_latency_us, (unsigned long)S32_MAX));
2310+
21572311
return 0;
21582312
out_release_instance:
21592313
nvme_release_instance(ctrl);

drivers/nvme/host/nvme.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ enum nvme_quirks {
7878
* readiness, which is done by reading the NVME_CSTS_RDY bit.
7979
*/
8080
NVME_QUIRK_DELAY_BEFORE_CHK_RDY = (1 << 3),
81+
82+
/*
83+
* APST should not be used.
84+
*/
85+
NVME_QUIRK_NO_APST = (1 << 4),
8186
};
8287

8388
/*
@@ -148,13 +153,19 @@ struct nvme_ctrl {
148153
u32 vs;
149154
u32 sgls;
150155
u16 kas;
156+
u8 npss;
157+
u8 apsta;
151158
unsigned int kato;
152159
bool subsystem;
153160
unsigned long quirks;
161+
struct nvme_id_power_state psd[32];
154162
struct work_struct scan_work;
155163
struct work_struct async_event_work;
156164
struct delayed_work ka_work;
157165

166+
/* Power saving configuration */
167+
u64 ps_max_latency_us;
168+
158169
/* Fabrics only */
159170
u16 sqsize;
160171
u32 ioccsz;

include/linux/nvme.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -579,6 +579,12 @@ struct nvme_write_zeroes_cmd {
579579
__le16 appmask;
580580
};
581581

582+
/* Features */
583+
584+
/*
 * Data buffer passed with the NVME_FEAT_AUTO_PST Set Features command.
 * It holds 32 little-endian 64-bit entries, indexed by power state; the
 * driver fills each entry with the target state (shifted left by 3) and
 * the idle time before transitioning (shifted left by 8).
 */
struct nvme_feat_auto_pst {
585+
__le64 entries[32];
586+
};
587+
582588
/* Admin commands */
583589

584590
enum nvme_admin_opcode {

0 commit comments

Comments
 (0)