Skip to content

Commit 5d6a6a7

Browse files
committed
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph fixes from Sage Weil:

"We have a few wire protocol compatibility fixes, ports of a few recent CRUSH mapping changes, and a couple error path fixes"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  - libceph: MOSDOpReply v7 encoding
  - libceph: advertise support for TUNABLES5
  - crush: decode and initialize chooseleaf_stable
  - crush: add chooseleaf_stable tunable
  - crush: ensure take bucket value is valid
  - crush: ensure bucket id is valid before indexing buckets array
  - ceph: fix snap context leak in error path
  - ceph: checking for IS_ERR instead of NULL
2 parents 9b10882 + b0b31a8 commit 5d6a6a7

File tree

6 files changed

+75
-17
lines changed

6 files changed

+75
-17
lines changed

fs/ceph/file.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -698,8 +698,8 @@ static void ceph_aio_retry_work(struct work_struct *work)
698698

699699
req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
700700
false, GFP_NOFS);
701-
if (IS_ERR(req)) {
702-
ret = PTR_ERR(req);
701+
if (!req) {
702+
ret = -ENOMEM;
703703
req = orig_req;
704704
goto out;
705705
}
@@ -716,7 +716,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
716716
ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
717717
snapc, CEPH_NOSNAP, &aio_req->mtime);
718718

719-
ceph_put_snap_context(snapc);
720719
ceph_osdc_put_request(orig_req);
721720

722721
req->r_callback = ceph_aio_complete_req;
@@ -731,6 +730,7 @@ static void ceph_aio_retry_work(struct work_struct *work)
731730
ceph_aio_complete_req(req, NULL);
732731
}
733732

733+
ceph_put_snap_context(snapc);
734734
kfree(aio_work);
735735
}
736736

include/linux/ceph/ceph_features.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,18 @@
6363
#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
6464
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
6565
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
66+
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
67+
#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
68+
#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
69+
#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
70+
#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
71+
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
72+
#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */
73+
#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
74+
#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
75+
#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */
76+
// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
77+
#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */
6678

6779
/*
6880
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature
@@ -108,7 +120,9 @@ static inline u64 ceph_sanitize_features(u64 features)
108120
CEPH_FEATURE_CRUSH_TUNABLES3 | \
109121
CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
110122
CEPH_FEATURE_MSGR_KEEPALIVE2 | \
111-
CEPH_FEATURE_CRUSH_V4)
123+
CEPH_FEATURE_CRUSH_V4 | \
124+
CEPH_FEATURE_CRUSH_TUNABLES5 | \
125+
CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
112126

113127
#define CEPH_FEATURES_REQUIRED_DEFAULT \
114128
(CEPH_FEATURE_NOSRCADDR | \

include/linux/crush/crush.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ enum {
5959
CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
6060
CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
6161
CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
62-
CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12
62+
CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
63+
CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
6364
};
6465

6566
/*
@@ -205,6 +206,11 @@ struct crush_map {
205206
* mappings line up a bit better with previous mappings. */
206207
__u8 chooseleaf_vary_r;
207208

209+
/* if true, it makes chooseleaf firstn to return stable results (if
210+
* no local retry) so that data migrations would be optimal when some
211+
* device fails. */
212+
__u8 chooseleaf_stable;
213+
208214
#ifndef __KERNEL__
209215
/*
210216
* version 0 (original) of straw_calc has various flaws. version 1

net/ceph/crush/mapper.c

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
403403
* @local_retries: localized retries
404404
* @local_fallback_retries: localized fallback retries
405405
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
406+
* @stable: stable mode starts rep=0 in the recursive call for all replicas
406407
* @vary_r: pass r to recursive calls
407408
* @out2: second output vector for leaf items (if @recurse_to_leaf)
408409
* @parent_r: r value passed from the parent
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
419420
unsigned int local_fallback_retries,
420421
int recurse_to_leaf,
421422
unsigned int vary_r,
423+
unsigned int stable,
422424
int *out2,
423425
int parent_r)
424426
{
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
433435
int collide, reject;
434436
int count = out_size;
435437

436-
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
438+
dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
437439
recurse_to_leaf ? "_LEAF" : "",
438440
bucket->id, x, outpos, numrep,
439441
tries, recurse_tries, local_retries, local_fallback_retries,
440-
parent_r);
442+
parent_r, stable);
441443

442-
for (rep = outpos; rep < numrep && count > 0 ; rep++) {
444+
for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
443445
/* keep trying until we get a non-out, non-colliding item */
444446
ftotal = 0;
445447
skip_rep = 0;
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
512514
if (crush_choose_firstn(map,
513515
map->buckets[-1-item],
514516
weight, weight_max,
515-
x, outpos+1, 0,
517+
x, stable ? 1 : outpos+1, 0,
516518
out2, outpos, count,
517519
recurse_tries, 0,
518520
local_retries,
519521
local_fallback_retries,
520522
0,
521523
vary_r,
524+
stable,
522525
NULL,
523526
sub_r) <= outpos)
524527
/* didn't get leaf */
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
816819
int choose_local_fallback_retries = map->choose_local_fallback_tries;
817820

818821
int vary_r = map->chooseleaf_vary_r;
822+
int stable = map->chooseleaf_stable;
819823

820824
if ((__u32)ruleno >= map->max_rules) {
821825
dprintk(" bad ruleno %d\n", ruleno);
@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map,
835839
case CRUSH_RULE_TAKE:
836840
if ((curstep->arg1 >= 0 &&
837841
curstep->arg1 < map->max_devices) ||
838-
(-1-curstep->arg1 < map->max_buckets &&
842+
(-1-curstep->arg1 >= 0 &&
843+
-1-curstep->arg1 < map->max_buckets &&
839844
map->buckets[-1-curstep->arg1])) {
840845
w[0] = curstep->arg1;
841846
wsize = 1;
@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
869874
vary_r = curstep->arg1;
870875
break;
871876

877+
case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
878+
if (curstep->arg1 >= 0)
879+
stable = curstep->arg1;
880+
break;
881+
872882
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
873883
case CRUSH_RULE_CHOOSE_FIRSTN:
874884
firstn = 1;
@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map,
888898
osize = 0;
889899

890900
for (i = 0; i < wsize; i++) {
901+
int bno;
891902
/*
892903
* see CRUSH_N, CRUSH_N_MINUS macros.
893904
* basically, numrep <= 0 means relative to
@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map,
900911
continue;
901912
}
902913
j = 0;
914+
/* make sure bucket id is valid */
915+
bno = -1 - w[i];
916+
if (bno < 0 || bno >= map->max_buckets) {
917+
/* w[i] is probably CRUSH_ITEM_NONE */
918+
dprintk(" bad w[i] %d\n", w[i]);
919+
continue;
920+
}
903921
if (firstn) {
904922
int recurse_tries;
905923
if (choose_leaf_tries)
@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map,
911929
recurse_tries = choose_tries;
912930
osize += crush_choose_firstn(
913931
map,
914-
map->buckets[-1-w[i]],
932+
map->buckets[bno],
915933
weight, weight_max,
916934
x, numrep,
917935
curstep->arg2,
@@ -923,14 +941,15 @@ int crush_do_rule(const struct crush_map *map,
923941
choose_local_fallback_retries,
924942
recurse_to_leaf,
925943
vary_r,
944+
stable,
926945
c+osize,
927946
0);
928947
} else {
929948
out_size = ((numrep < (result_max-osize)) ?
930949
numrep : (result_max-osize));
931950
crush_choose_indep(
932951
map,
933-
map->buckets[-1-w[i]],
952+
map->buckets[bno],
934953
weight, weight_max,
935954
x, out_size, numrep,
936955
curstep->arg2,

net/ceph/osd_client.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1770,6 +1770,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
17701770
u32 osdmap_epoch;
17711771
int already_completed;
17721772
u32 bytes;
1773+
u8 decode_redir;
17731774
unsigned int i;
17741775

17751776
tid = le64_to_cpu(msg->hdr.tid);
@@ -1841,6 +1842,15 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
18411842
p += 8 + 4; /* skip replay_version */
18421843
p += 8; /* skip user_version */
18431844

1845+
if (le16_to_cpu(msg->hdr.version) >= 7)
1846+
ceph_decode_8_safe(&p, end, decode_redir, bad_put);
1847+
else
1848+
decode_redir = 1;
1849+
} else {
1850+
decode_redir = 0;
1851+
}
1852+
1853+
if (decode_redir) {
18441854
err = ceph_redirect_decode(&p, end, &redir);
18451855
if (err)
18461856
goto bad_put;

net/ceph/osdmap.c

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
342342
c->choose_local_tries = ceph_decode_32(p);
343343
c->choose_local_fallback_tries = ceph_decode_32(p);
344344
c->choose_total_tries = ceph_decode_32(p);
345-
dout("crush decode tunable choose_local_tries = %d",
345+
dout("crush decode tunable choose_local_tries = %d\n",
346346
c->choose_local_tries);
347-
dout("crush decode tunable choose_local_fallback_tries = %d",
347+
dout("crush decode tunable choose_local_fallback_tries = %d\n",
348348
c->choose_local_fallback_tries);
349-
dout("crush decode tunable choose_total_tries = %d",
349+
dout("crush decode tunable choose_total_tries = %d\n",
350350
c->choose_total_tries);
351351

352352
ceph_decode_need(p, end, sizeof(u32), done);
353353
c->chooseleaf_descend_once = ceph_decode_32(p);
354-
dout("crush decode tunable chooseleaf_descend_once = %d",
354+
dout("crush decode tunable chooseleaf_descend_once = %d\n",
355355
c->chooseleaf_descend_once);
356356

357357
ceph_decode_need(p, end, sizeof(u8), done);
358358
c->chooseleaf_vary_r = ceph_decode_8(p);
359-
dout("crush decode tunable chooseleaf_vary_r = %d",
359+
dout("crush decode tunable chooseleaf_vary_r = %d\n",
360360
c->chooseleaf_vary_r);
361361

362+
/* skip straw_calc_version, allowed_bucket_algs */
363+
ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
364+
*p += sizeof(u8) + sizeof(u32);
365+
366+
ceph_decode_need(p, end, sizeof(u8), done);
367+
c->chooseleaf_stable = ceph_decode_8(p);
368+
dout("crush decode tunable chooseleaf_stable = %d\n",
369+
c->chooseleaf_stable);
370+
362371
done:
363372
dout("crush_decode success\n");
364373
return c;

0 commit comments

Comments
 (0)