Skip to content

Commit bdc7276

Browse files
committed
Merge tag 'bcachefs-2024-10-14' of git://evilpiepirate.org/bcachefs
Pull bcachefs fixes from Kent Overstreet:

 - New metadata version inode_has_child_snapshots

   This fixes bugs with handling of unlinked inodes + snapshots, in particular when an inode is reattached after taking a snapshot; deleted inodes now get correctly cleaned up across snapshots.

 - Disk accounting rewrite fixes

 - validation fixes for when a device has been removed

 - fix journal replay failing with "journal_reclaim_would_deadlock"

 - Some more small fixes for erasure coding + device removal

 - Assorted small syzbot fixes

* tag 'bcachefs-2024-10-14' of git://evilpiepirate.org/bcachefs: (27 commits)
  bcachefs: Fix sysfs warning in fstests generic/730,731
  bcachefs: Handle race between stripe reuse, invalidate_stripe_to_dev
  bcachefs: Fix kasan splat in new_stripe_alloc_buckets()
  bcachefs: Add missing validation for bch_stripe.csum_granularity_bits
  bcachefs: Fix missing bounds checks in bch2_alloc_read()
  bcachefs: fix uaf in bch2_dio_write_done()
  bcachefs: Improve check_snapshot_exists()
  bcachefs: Fix bkey_nocow_lock()
  bcachefs: Fix accounting replay flags
  bcachefs: Fix invalid shift in member_to_text()
  bcachefs: Fix bch2_have_enough_devs() for BCH_SB_MEMBER_INVALID
  bcachefs: __wait_for_freeing_inode: Switch to wait_bit_queue_entry
  bcachefs: Check if stuck in journal_res_get()
  closures: Add closure_wait_event_timeout()
  bcachefs: Fix state lock involved deadlock
  bcachefs: Fix NULL pointer dereference in bch2_opt_to_text
  bcachefs: Release transaction before wake up
  bcachefs: add check for btree id against max in try read node
  bcachefs: Disk accounting device validation fixes
  bcachefs: bch2_inode_or_descendents_is_open()
  ...
2 parents eca631b + 5e3b723 commit bdc7276

32 files changed

+976
-400
lines changed

fs/bcachefs/alloc_background.c

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c)
639639
continue;
640640
}
641641

642+
if (k.k->p.offset < ca->mi.first_bucket) {
643+
bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
644+
continue;
645+
}
646+
647+
if (k.k->p.offset >= ca->mi.nbuckets) {
648+
bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
649+
continue;
650+
}
651+
642652
struct bch_alloc_v4 a;
643653
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
644654
0;

fs/bcachefs/bcachefs_format.h

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -678,7 +678,8 @@ struct bch_sb_field_ext {
678678
x(disk_accounting_v2, BCH_VERSION(1, 9)) \
679679
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
680680
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
681-
x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
681+
x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
682+
x(inode_has_child_snapshots, BCH_VERSION(1, 13))
682683

683684
enum bcachefs_metadata_version {
684685
bcachefs_metadata_version_min = 9,

fs/bcachefs/btree_gc.c

Lines changed: 9 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1224,17 +1224,20 @@ int bch2_gc_gens(struct bch_fs *c)
12241224
u64 b, start_time = local_clock();
12251225
int ret;
12261226

1227-
/*
1228-
* Ideally we would be using state_lock and not gc_gens_lock here, but that
1229-
* introduces a deadlock in the RO path - we currently take the state
1230-
* lock at the start of going RO, thus the gc thread may get stuck:
1231-
*/
12321227
if (!mutex_trylock(&c->gc_gens_lock))
12331228
return 0;
12341229

12351230
trace_and_count(c, gc_gens_start, c);
12361231

1237-
down_read(&c->state_lock);
1232+
/*
1233+
* We have to use trylock here. Otherwise, we would
1234+
* introduce a deadlock in the RO path - we take the
1235+
* state lock at the start of going RO.
1236+
*/
1237+
if (!down_read_trylock(&c->state_lock)) {
1238+
mutex_unlock(&c->gc_gens_lock);
1239+
return 0;
1240+
}
12381241

12391242
for_each_member_device(c, ca) {
12401243
struct bucket_gens *gens = bucket_gens(ca);

fs/bcachefs/btree_io.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1838,10 +1838,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
18381838
struct btree_trans *trans = bch2_trans_get(c);
18391839

18401840
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
1841-
__btree_node_write_done(c, b);
1842-
six_unlock_read(&b->c.lock);
18431841

1842+
/* we don't need transaction context anymore after we got the lock. */
18441843
bch2_trans_put(trans);
1844+
__btree_node_write_done(c, b);
1845+
six_unlock_read(&b->c.lock);
18451846
}
18461847

18471848
static void btree_node_write_work(struct work_struct *work)

fs/bcachefs/btree_iter.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2381,9 +2381,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
23812381
else
23822382
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
23832383

2384-
if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
2385-
? bkey_gt(iter_pos, end)
2386-
: bkey_ge(iter_pos, end)))
2384+
if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
2385+
iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
2386+
bkey_gt(iter_pos, end)))
23872387
goto end;
23882388

23892389
break;

fs/bcachefs/btree_iter.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
857857
for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
858858
SPOS_MAX, _flags, _k, _ret)
859859

860+
#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
861+
_start, _flags, _k, _ret) \
862+
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
863+
(_start), (_flags)); \
864+
(_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
865+
!((_ret) = bkey_err(_k)) && (_k).k; \
866+
bch2_btree_iter_rewind(&(_iter)))
867+
860868
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
861869
for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
862870

fs/bcachefs/btree_node_scan.c

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
171171
if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
172172
return;
173173

174+
if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
175+
return;
176+
174177
rcu_read_lock();
175178
struct found_btree_node n = {
176179
.btree_id = BTREE_NODE_ID(bn),

fs/bcachefs/data_update.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
8080
if (ptr2 == ptr)
8181
break;
8282

83+
ca = bch2_dev_have_ref(c, ptr2->dev);
8384
bucket = PTR_BUCKET_POS(ca, ptr2);
8485
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
8586
}

fs/bcachefs/disk_accounting.c

Lines changed: 114 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -242,16 +242,22 @@ void bch2_accounting_swab(struct bkey_s k)
242242
*p = swab64(*p);
243243
}
244244

245+
static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
246+
struct disk_accounting_pos acc)
247+
{
248+
unsafe_memcpy(r, &acc.replicas,
249+
replicas_entry_bytes(&acc.replicas),
250+
"variable length struct");
251+
}
252+
245253
static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
246254
{
247255
struct disk_accounting_pos acc_k;
248256
bpos_to_disk_accounting_pos(&acc_k, p);
249257

250258
switch (acc_k.type) {
251259
case BCH_DISK_ACCOUNTING_replicas:
252-
unsafe_memcpy(r, &acc_k.replicas,
253-
replicas_entry_bytes(&acc_k.replicas),
254-
"variable length struct");
260+
__accounting_to_replicas(r, acc_k);
255261
return true;
256262
default:
257263
return false;
@@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
608614
return ret;
609615
}
610616

617+
static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
618+
struct disk_accounting_pos acc,
619+
u64 *v, unsigned nr)
620+
{
621+
struct bch_fs *c = trans->c;
622+
struct printbuf buf = PRINTBUF;
623+
int ret = 0, invalid_dev = -1;
624+
625+
switch (acc.type) {
626+
case BCH_DISK_ACCOUNTING_replicas: {
627+
struct bch_replicas_padded r;
628+
__accounting_to_replicas(&r.e, acc);
629+
630+
for (unsigned i = 0; i < r.e.nr_devs; i++)
631+
if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
632+
!bch2_dev_exists(c, r.e.devs[i])) {
633+
invalid_dev = r.e.devs[i];
634+
goto invalid_device;
635+
}
636+
637+
/*
638+
* All replicas entry checks except for invalid device are done
639+
* in bch2_accounting_validate
640+
*/
641+
BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
642+
643+
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
644+
trans, accounting_replicas_not_marked,
645+
"accounting not marked in superblock replicas\n %s",
646+
(printbuf_reset(&buf),
647+
bch2_accounting_key_to_text(&buf, &acc),
648+
buf.buf))) {
649+
/*
650+
* We're not RW yet and still single threaded, dropping
651+
* and retaking lock is ok:
652+
*/
653+
percpu_up_write(&c->mark_lock);
654+
ret = bch2_mark_replicas(c, &r.e);
655+
if (ret)
656+
goto fsck_err;
657+
percpu_down_write(&c->mark_lock);
658+
}
659+
break;
660+
}
661+
662+
case BCH_DISK_ACCOUNTING_dev_data_type:
663+
if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
664+
invalid_dev = acc.dev_data_type.dev;
665+
goto invalid_device;
666+
}
667+
break;
668+
}
669+
670+
fsck_err:
671+
printbuf_exit(&buf);
672+
return ret;
673+
invalid_device:
674+
if (fsck_err(trans, accounting_to_invalid_device,
675+
"accounting entry points to invalid device %i\n %s",
676+
invalid_dev,
677+
(printbuf_reset(&buf),
678+
bch2_accounting_key_to_text(&buf, &acc),
679+
buf.buf))) {
680+
for (unsigned i = 0; i < nr; i++)
681+
v[i] = -v[i];
682+
683+
ret = commit_do(trans, NULL, NULL, 0,
684+
bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
685+
-BCH_ERR_remove_disk_accounting_entry;
686+
} else {
687+
ret = -BCH_ERR_remove_disk_accounting_entry;
688+
}
689+
goto fsck_err;
690+
}
691+
611692
/*
612693
* At startup time, initialize the in memory accounting from the btree (and
613694
* journal)
@@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
666747
}
667748
keys->gap = keys->nr = dst - keys->data;
668749

669-
percpu_down_read(&c->mark_lock);
670-
for (unsigned i = 0; i < acc->k.nr; i++) {
671-
u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
672-
bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
750+
percpu_down_write(&c->mark_lock);
751+
unsigned i = 0;
752+
while (i < acc->k.nr) {
753+
unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
673754

674-
if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
675-
continue;
755+
struct disk_accounting_pos acc_k;
756+
bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
676757

677-
struct bch_replicas_padded r;
678-
if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
679-
continue;
758+
u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
759+
bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
680760

681761
/*
682-
* If the replicas entry is invalid it'll get cleaned up by
683-
* check_allocations:
762+
* If the entry counters are zeroed, it should be treated as
763+
* nonexistent - it might point to an invalid device.
764+
*
765+
* Remove it, so that if it's re-added it gets re-marked in the
766+
* superblock:
684767
*/
685-
if (bch2_replicas_entry_validate(&r.e, c, &buf))
768+
ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
769+
? -BCH_ERR_remove_disk_accounting_entry
770+
: bch2_disk_accounting_validate_late(trans, acc_k,
771+
v, acc->k.data[idx].nr_counters);
772+
773+
if (ret == -BCH_ERR_remove_disk_accounting_entry) {
774+
free_percpu(acc->k.data[idx].v[0]);
775+
free_percpu(acc->k.data[idx].v[1]);
776+
darray_remove_item(&acc->k, &acc->k.data[idx]);
777+
eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
778+
accounting_pos_cmp, NULL);
779+
ret = 0;
686780
continue;
687-
688-
struct disk_accounting_pos k;
689-
bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
690-
691-
if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
692-
trans, accounting_replicas_not_marked,
693-
"accounting not marked in superblock replicas\n %s",
694-
(printbuf_reset(&buf),
695-
bch2_accounting_key_to_text(&buf, &k),
696-
buf.buf))) {
697-
/*
698-
* We're not RW yet and still single threaded, dropping
699-
* and retaking lock is ok:
700-
*/
701-
percpu_up_read(&c->mark_lock);
702-
ret = bch2_mark_replicas(c, &r.e);
703-
if (ret)
704-
goto fsck_err;
705-
percpu_down_read(&c->mark_lock);
706781
}
782+
783+
if (ret)
784+
goto fsck_err;
785+
i++;
707786
}
708787

709788
preempt_disable();
@@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
742821
}
743822
preempt_enable();
744823
fsck_err:
745-
percpu_up_read(&c->mark_lock);
824+
percpu_up_write(&c->mark_lock);
746825
err:
747826
printbuf_exit(&buf);
748827
bch2_trans_put(trans);

0 commit comments

Comments
 (0)