Skip to content

Commit 930c532

Browse files
committed
libceph: apply new_state before new_up_client on incrementals
Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: [email protected] # 3.15+: 6dd74e4: libceph: set 'exists' flag for newly up osd Cc: [email protected] # 3.15+ Signed-off-by: Ilya Dryomov <[email protected]> Reviewed-by: Josh Durgin <[email protected]>
1 parent 92d21ac commit 930c532

File tree

1 file changed

+113
-43
lines changed

1 file changed

+113
-43
lines changed

net/ceph/osdmap.c

Lines changed: 113 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1260,6 +1260,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
12601260
return map;
12611261
}
12621262

1263+
/*
1264+
* Encoding order is (new_up_client, new_state, new_weight). Need to
1265+
* apply in the (new_weight, new_state, new_up_client) order, because
1266+
* an incremental map may look like e.g.
1267+
*
1268+
* new_up_client: { osd=6, addr=... } # set osd_state and addr
1269+
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state
1270+
*/
1271+
static int decode_new_up_state_weight(void **p, void *end,
1272+
struct ceph_osdmap *map)
1273+
{
1274+
void *new_up_client;
1275+
void *new_state;
1276+
void *new_weight_end;
1277+
u32 len;
1278+
1279+
new_up_client = *p;
1280+
ceph_decode_32_safe(p, end, len, e_inval);
1281+
len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
1282+
ceph_decode_need(p, end, len, e_inval);
1283+
*p += len;
1284+
1285+
new_state = *p;
1286+
ceph_decode_32_safe(p, end, len, e_inval);
1287+
len *= sizeof(u32) + sizeof(u8);
1288+
ceph_decode_need(p, end, len, e_inval);
1289+
*p += len;
1290+
1291+
/* new_weight */
1292+
ceph_decode_32_safe(p, end, len, e_inval);
1293+
while (len--) {
1294+
s32 osd;
1295+
u32 w;
1296+
1297+
ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
1298+
osd = ceph_decode_32(p);
1299+
w = ceph_decode_32(p);
1300+
BUG_ON(osd >= map->max_osd);
1301+
pr_info("osd%d weight 0x%x %s\n", osd, w,
1302+
w == CEPH_OSD_IN ? "(in)" :
1303+
(w == CEPH_OSD_OUT ? "(out)" : ""));
1304+
map->osd_weight[osd] = w;
1305+
1306+
/*
1307+
* If we are marking in, set the EXISTS, and clear the
1308+
* AUTOOUT and NEW bits.
1309+
*/
1310+
if (w) {
1311+
map->osd_state[osd] |= CEPH_OSD_EXISTS;
1312+
map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
1313+
CEPH_OSD_NEW);
1314+
}
1315+
}
1316+
new_weight_end = *p;
1317+
1318+
/* new_state (up/down) */
1319+
*p = new_state;
1320+
len = ceph_decode_32(p);
1321+
while (len--) {
1322+
s32 osd;
1323+
u8 xorstate;
1324+
int ret;
1325+
1326+
osd = ceph_decode_32(p);
1327+
xorstate = ceph_decode_8(p);
1328+
if (xorstate == 0)
1329+
xorstate = CEPH_OSD_UP;
1330+
BUG_ON(osd >= map->max_osd);
1331+
if ((map->osd_state[osd] & CEPH_OSD_UP) &&
1332+
(xorstate & CEPH_OSD_UP))
1333+
pr_info("osd%d down\n", osd);
1334+
if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
1335+
(xorstate & CEPH_OSD_EXISTS)) {
1336+
pr_info("osd%d does not exist\n", osd);
1337+
map->osd_weight[osd] = CEPH_OSD_IN;
1338+
ret = set_primary_affinity(map, osd,
1339+
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1340+
if (ret)
1341+
return ret;
1342+
memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
1343+
map->osd_state[osd] = 0;
1344+
} else {
1345+
map->osd_state[osd] ^= xorstate;
1346+
}
1347+
}
1348+
1349+
/* new_up_client */
1350+
*p = new_up_client;
1351+
len = ceph_decode_32(p);
1352+
while (len--) {
1353+
s32 osd;
1354+
struct ceph_entity_addr addr;
1355+
1356+
osd = ceph_decode_32(p);
1357+
ceph_decode_copy(p, &addr, sizeof(addr));
1358+
ceph_decode_addr(&addr);
1359+
BUG_ON(osd >= map->max_osd);
1360+
pr_info("osd%d up\n", osd);
1361+
map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1362+
map->osd_addr[osd] = addr;
1363+
}
1364+
1365+
*p = new_weight_end;
1366+
return 0;
1367+
1368+
e_inval:
1369+
return -EINVAL;
1370+
}
1371+
12631372
/*
12641373
* decode and apply an incremental map update.
12651374
*/
@@ -1358,49 +1467,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
13581467
__remove_pg_pool(&map->pg_pools, pi);
13591468
}
13601469

1361-
/* new_up */
1362-
ceph_decode_32_safe(p, end, len, e_inval);
1363-
while (len--) {
1364-
u32 osd;
1365-
struct ceph_entity_addr addr;
1366-
ceph_decode_32_safe(p, end, osd, e_inval);
1367-
ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
1368-
ceph_decode_addr(&addr);
1369-
pr_info("osd%d up\n", osd);
1370-
BUG_ON(osd >= map->max_osd);
1371-
map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
1372-
map->osd_addr[osd] = addr;
1373-
}
1374-
1375-
/* new_state */
1376-
ceph_decode_32_safe(p, end, len, e_inval);
1377-
while (len--) {
1378-
u32 osd;
1379-
u8 xorstate;
1380-
ceph_decode_32_safe(p, end, osd, e_inval);
1381-
xorstate = **(u8 **)p;
1382-
(*p)++; /* clean flag */
1383-
if (xorstate == 0)
1384-
xorstate = CEPH_OSD_UP;
1385-
if (xorstate & CEPH_OSD_UP)
1386-
pr_info("osd%d down\n", osd);
1387-
if (osd < map->max_osd)
1388-
map->osd_state[osd] ^= xorstate;
1389-
}
1390-
1391-
/* new_weight */
1392-
ceph_decode_32_safe(p, end, len, e_inval);
1393-
while (len--) {
1394-
u32 osd, off;
1395-
ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
1396-
osd = ceph_decode_32(p);
1397-
off = ceph_decode_32(p);
1398-
pr_info("osd%d weight 0x%x %s\n", osd, off,
1399-
off == CEPH_OSD_IN ? "(in)" :
1400-
(off == CEPH_OSD_OUT ? "(out)" : ""));
1401-
if (osd < map->max_osd)
1402-
map->osd_weight[osd] = off;
1403-
}
1470+
/* new_up_client, new_state, new_weight */
1471+
err = decode_new_up_state_weight(p, end, map);
1472+
if (err)
1473+
goto bad;
14041474

14051475
/* new_pg_temp */
14061476
err = decode_new_pg_temp(p, end, map);

0 commit comments

Comments
 (0)