diff --git a/Documentation/filesystems/bcachefs/CodingStyle.rst b/Documentation/filesystems/bcachefs/CodingStyle.rst index 01de555e21d85..b29562a6bf555 100644 --- a/Documentation/filesystems/bcachefs/CodingStyle.rst +++ b/Documentation/filesystems/bcachefs/CodingStyle.rst @@ -183,4 +183,4 @@ even better as a code comment. A good code comment is wonderful, but even better is the comment that didn't need to exist because the code was so straightforward as to be obvious; organized into small clean and tidy modules, with clear and descriptive names -for functions and variable, where every line of code has a clear purpose. +for functions and variables, where every line of code has a clear purpose. diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 5bac803ea367c..c2566a970946b 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -38,7 +38,7 @@ config BCACHEFS_ERASURE_CODING depends on BCACHEFS_FS select QUOTACTL help - This enables the "erasure_code" filesysystem and inode option, which + This enables the "erasure_code" filesystem and inode option, which organizes data into reed-solomon stripes instead of ordinary replication. @@ -89,7 +89,7 @@ config BCACHEFS_SIX_OPTIMISTIC_SPIN config BCACHEFS_PATH_TRACEPOINTS bool "Extra btree_path tracepoints" - depends on BCACHEFS_FS + depends on BCACHEFS_FS && TRACING help Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 56d20e219f591..d2689388d5e88 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -82,6 +82,7 @@ bcachefs-y := \ siphash.o \ six.o \ snapshot.o \ + str_hash.o \ subvolume.o \ super.o \ super-io.o \ diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 87f1be9d4db46..56b160e71a533 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -137,7 +137,7 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, return NULL; acl = allocate_dropping_locks(trans, ret, - posix_acl_alloc(count, _gfp)); + posix_acl_alloc(count, GFP_KERNEL)); if (!acl) return ERR_PTR(-ENOMEM); if (ret) { @@ -184,11 +184,6 @@ static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, return ERR_PTR(-EINVAL); } -#define acl_for_each_entry(acl, acl_e) \ - for (acl_e = acl->a_entries; \ - acl_e < acl->a_entries + acl->a_count; \ - acl_e++) - /* * Convert from in-memory to filesystem representation. 
*/ @@ -199,11 +194,11 @@ bch2_acl_to_xattr(struct btree_trans *trans, { struct bkey_i_xattr *xattr; bch_acl_header *acl_header; - const struct posix_acl_entry *acl_e; + const struct posix_acl_entry *acl_e, *pe; void *outptr; unsigned nr_short = 0, nr_long = 0, acl_len, u64s; - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { switch (acl_e->e_tag) { case ACL_USER: case ACL_GROUP: @@ -241,7 +236,7 @@ bch2_acl_to_xattr(struct btree_trans *trans, outptr = (void *) acl_header + sizeof(*acl_header); - acl_for_each_entry(acl, acl_e) { + FOREACH_ACL_ENTRY(acl_e, acl, pe) { bch_acl_entry *entry = outptr; entry->e_tag = cpu_to_le16(acl_e->e_tag); @@ -427,7 +422,8 @@ int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, if (ret) goto err; - ret = allocate_dropping_locks_errcode(trans, __posix_acl_chmod(&acl, _gfp, mode)); + ret = allocate_dropping_locks_errcode(trans, + __posix_acl_chmod(&acl, GFP_KERNEL, mode)); if (ret) goto err; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index c84a91572a1da..cc3cbcd11efea 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -198,7 +198,7 @@ static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) } int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); int ret = 0; @@ -213,7 +213,7 @@ int bch2_alloc_v1_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_alloc_unpacked u; int ret = 0; @@ -226,7 +226,7 @@ int bch2_alloc_v2_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_alloc_unpacked u; int ret = 0; @@ -239,7 +239,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bch_alloc_v4 a; int ret = 0; @@ -322,9 +322,9 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, void bch2_alloc_v4_swab(struct bkey_s k) { struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; - struct bch_backpointer *bp, *bps; - a->journal_seq = swab64(a->journal_seq); + a->journal_seq_nonempty = swab64(a->journal_seq_nonempty); + a->journal_seq_empty = swab64(a->journal_seq_empty); a->flags = swab32(a->flags); a->dirty_sectors = swab32(a->dirty_sectors); a->cached_sectors = swab32(a->cached_sectors); @@ -333,13 +333,6 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe = swab32(a->stripe); a->nr_external_backpointers = swab32(a->nr_external_backpointers); a->stripe_sectors = swab32(a->stripe_sectors); - - bps = alloc_v4_backpointers(a); - for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { - bp->bucket_offset = swab40(bp->bucket_offset); - bp->bucket_len = swab32(bp->bucket_len); - bch2_bpos_swab(&bp->pos); - } } void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) @@ -354,16 +347,17 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen); bch2_prt_data_type(out, a->data_type); prt_newline(out); - prt_printf(out, "journal_seq %llu\n", a->journal_seq); - prt_printf(out, 
"need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); - prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); - prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); - prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); - prt_printf(out, "cached_sectors %u\n", a->cached_sectors); - prt_printf(out, "stripe %u\n", a->stripe); - prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); - prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); - prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); + prt_printf(out, "journal_seq_nonempty %llu\n", a->journal_seq_nonempty); + prt_printf(out, "journal_seq_empty %llu\n", a->journal_seq_empty); + prt_printf(out, "need_discard %llu\n", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_printf(out, "need_inc_gen %llu\n", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_printf(out, "dirty_sectors %u\n", a->dirty_sectors); + prt_printf(out, "stripe_sectors %u\n", a->stripe_sectors); + prt_printf(out, "cached_sectors %u\n", a->cached_sectors); + prt_printf(out, "stripe %u\n", a->stripe); + prt_printf(out, "stripe_redundancy %u\n", a->stripe_redundancy); + prt_printf(out, "io_time[READ] %llu\n", a->io_time[READ]); + prt_printf(out, "io_time[WRITE] %llu\n", a->io_time[WRITE]); if (ca) prt_printf(out, "fragmentation %llu\n", alloc_lru_idx_fragmentation(*a, ca)); @@ -392,7 +386,7 @@ void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); *out = (struct bch_alloc_v4) { - .journal_seq = u.journal_seq, + .journal_seq_nonempty = u.journal_seq, .flags = u.need_discard, .gen = u.gen, .oldest_gen = u.oldest_gen, @@ -517,7 +511,7 @@ static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) } int bch2_bucket_gens_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -664,74 +658,80 @@ int bch2_alloc_read(struct bch_fs *c) /* Free space/discard btree: */ +static int __need_discard_or_freespace_err(struct btree_trans *trans, + struct bkey_s_c alloc_k, + bool set, bool discard, bool repair) +{ + struct bch_fs *c = trans->c; + enum bch_fsck_flags flags = FSCK_CAN_IGNORE|(repair ? FSCK_CAN_FIX : 0); + enum bch_sb_error_id err_id = discard + ? BCH_FSCK_ERR_need_discard_key_wrong + : BCH_FSCK_ERR_freespace_key_wrong; + enum btree_id btree = discard ? BTREE_ID_need_discard : BTREE_ID_freespace; + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, alloc_k); + + int ret = __bch2_fsck_err(NULL, trans, flags, err_id, + "bucket incorrectly %sset in %s btree\n" + " %s", + set ? "" : "un", + bch2_btree_id_str(btree), + buf.buf); + if (ret == -BCH_ERR_fsck_ignore || + ret == -BCH_ERR_fsck_errors_not_fixed) + ret = 0; + + printbuf_exit(&buf); + return ret; +} + +#define need_discard_or_freespace_err(...) \ + fsck_err_wrap(__need_discard_or_freespace_err(__VA_ARGS__)) + +#define need_discard_or_freespace_err_on(cond, ...) \ + (unlikely(cond) ? need_discard_or_freespace_err(__VA_ARGS__) : false) + static int bch2_bucket_do_index(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c alloc_k, const struct bch_alloc_v4 *a, bool set) { - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c old; - struct bkey_i *k; enum btree_id btree; - enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; - enum bch_bkey_type new_type = set ? 
KEY_TYPE_set : KEY_TYPE_deleted; - struct printbuf buf = PRINTBUF; - int ret; + struct bpos pos; if (a->data_type != BCH_DATA_free && a->data_type != BCH_DATA_need_discard) return 0; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.type = new_type; - switch (a->data_type) { case BCH_DATA_free: btree = BTREE_ID_freespace; - k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); - bch2_key_resize(&k->k, 1); + pos = alloc_freespace_pos(alloc_k.k->p, *a); break; case BCH_DATA_need_discard: btree = BTREE_ID_need_discard; - k->k.p = alloc_k.k->p; + pos = alloc_k.k->p; break; default: return 0; } - old = bch2_bkey_get_iter(trans, &iter, btree, - bkey_start_pos(&k->k), - BTREE_ITER_intent); - ret = bkey_err(old); + struct btree_iter iter; + struct bkey_s_c old = bch2_bkey_get_iter(trans, &iter, btree, pos, BTREE_ITER_intent); + int ret = bkey_err(old); if (ret) return ret; - if (ca->mi.freespace_initialized && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && - bch2_trans_inconsistent_on(old.k->type != old_type, trans, - "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" - " for %s", - set ? "setting" : "clearing", - bch2_btree_id_str(btree), - iter.pos.inode, - iter.pos.offset, - bch2_bkey_types[old.k->type], - bch2_bkey_types[old_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - ret = -EIO; - goto err; - } + need_discard_or_freespace_err_on(ca->mi.freespace_initialized && + !old.k->type != set, + trans, alloc_k, set, + btree == BTREE_ID_need_discard, false); - ret = bch2_trans_update(trans, &iter, k, 0); -err: + ret = bch2_btree_bit_mod_iter(trans, &iter, set); +fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } @@ -858,7 +858,10 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (flags & BTREE_TRIGGER_transactional) { alloc_data_type_set(new_a, new_a->data_type); - if (bch2_bucket_sectors_total(*new_a) > bch2_bucket_sectors_total(*old_a)) { + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); + + if (is_empty_delta < 0) { new_a->io_time[READ] = bch2_current_io_time(c, READ); new_a->io_time[WRITE]= bch2_current_io_time(c, WRITE); SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); @@ -928,37 +931,55 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) { - u64 journal_seq = trans->journal_res.seq; - u64 bucket_journal_seq = new_a->journal_seq; + u64 transaction_seq = trans->journal_res.seq; + BUG_ON(!transaction_seq); - if ((flags & BTREE_TRIGGER_insert) && - data_type_is_empty(old_a->data_type) != - data_type_is_empty(new_a->data_type) && - new.k->type == KEY_TYPE_alloc_v4) { - struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v; + if (log_fsck_err_on(transaction_seq && new_a->journal_seq_nonempty > transaction_seq, + trans, alloc_key_journal_seq_in_future, + "bucket journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf))) + new_a->journal_seq_nonempty = transaction_seq; - /* - * If the btree updates referring to a bucket weren't flushed - * before the bucket became empty again, then the we don't have - * to wait on a journal flush before we can reuse the bucket: - */ - v->journal_seq = bucket_journal_seq = - data_type_is_empty(new_a->data_type) && - (journal_seq == v->journal_seq || - bch2_journal_noflush_seq(&c->journal, v->journal_seq)) - ? 
0 : journal_seq; + int is_empty_delta = (int) data_type_is_empty(new_a->data_type) - + (int) data_type_is_empty(old_a->data_type); + + /* + * Record journal sequence number of empty -> nonempty transition: + * Note that there may be multiple empty -> nonempty + * transitions, data in a bucket may be overwritten while we're + * still writing to it - so be careful to only record the first: + * */ + if (is_empty_delta < 0 && + new_a->journal_seq_empty <= c->journal.flushed_seq_ondisk) { + new_a->journal_seq_nonempty = transaction_seq; + new_a->journal_seq_empty = 0; } - if (!data_type_is_empty(old_a->data_type) && - data_type_is_empty(new_a->data_type) && - bucket_journal_seq) { - ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, - new.k->p.inode, new.k->p.offset, - bucket_journal_seq); - if (bch2_fs_fatal_err_on(ret, c, - "setting bucket_needs_journal_commit: %s", bch2_err_str(ret))) - goto err; + /* + * Bucket becomes empty: mark it as waiting for a journal flush, + * unless updates since empty -> nonempty transition were never + * flushed - we may need to ask the journal not to flush + * intermediate sequence numbers: + */ + if (is_empty_delta > 0) { + if (new_a->journal_seq_nonempty == transaction_seq || + bch2_journal_noflush_seq(&c->journal, + new_a->journal_seq_nonempty, + transaction_seq)) { + new_a->journal_seq_nonempty = new_a->journal_seq_empty = 0; + } else { + new_a->journal_seq_empty = transaction_seq; + + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + transaction_seq); + if (bch2_fs_fatal_err_on(ret, c, + "setting bucket_needs_journal_commit: %s", + bch2_err_str(ret))) + goto err; + } } if (new_a->gen != old_a->gen) { @@ -974,7 +995,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) #define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) -#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) +#define bucket_flushed(a) (a->journal_seq_empty <= c->journal.flushed_seq_ondisk) if (statechange(a->data_type == BCH_DATA_free) && bucket_flushed(new_a)) @@ -1006,6 +1027,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, rcu_read_unlock(); } err: +fsck_err: printbuf_exit(&buf); bch2_dev_put(ca); return ret; @@ -1045,7 +1067,7 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); next = iter2.pos; bch2_trans_iter_exit(iter->trans, &iter2); @@ -1129,7 +1151,6 @@ int bch2_check_alloc_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a; - unsigned discard_key_type, freespace_key_type; unsigned gens_offset; struct bkey_s_c k; struct printbuf buf = PRINTBUF; @@ -1149,64 +1170,30 @@ int bch2_check_alloc_key(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); - discard_key_type = a->data_type == BCH_DATA_need_discard ? 
KEY_TYPE_set : 0; bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); k = bch2_btree_iter_peek_slot(discard_iter); ret = bkey_err(k); if (ret) goto err; - if (fsck_err_on(k.k->type != discard_key_type, - trans, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = discard_key_type; - update->k.p = discard_iter->pos; - - ret = bch2_trans_update(trans, discard_iter, update, 0); + bool is_discarded = a->data_type == BCH_DATA_need_discard; + if (need_discard_or_freespace_err_on(!!k.k->type != is_discarded, + trans, alloc_k, !is_discarded, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, discard_iter, is_discarded); if (ret) goto err; } - freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); k = bch2_btree_iter_peek_slot(freespace_iter); ret = bkey_err(k); if (ret) goto err; - if (fsck_err_on(k.k->type != freespace_key_type, - trans, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { - struct bkey_i *update = - bch2_trans_kmalloc(trans, sizeof(*update)); - - ret = PTR_ERR_OR_ZERO(update); - if (ret) - goto err; - - bkey_init(&update->k); - update->k.type = freespace_key_type; - update->k.p = freespace_iter->pos; - bch2_key_resize(&update->k, 1); - - ret = bch2_trans_update(trans, freespace_iter, update, 0); + bool is_free = a->data_type == BCH_DATA_free; + if (need_discard_or_freespace_err_on(!!k.k->type != is_free, + trans, alloc_k, !is_free, false, true)) { + ret = bch2_btree_bit_mod_iter(trans, freespace_iter, is_free); if (ret) goto err; } @@ -1368,51 +1355,87 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, return ret; } -static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans, - struct btree_iter *iter) +struct check_discard_freespace_key_async { + struct work_struct work; + struct bch_fs *c; + struct bbpos pos; +}; + +static int bch2_recheck_discard_freespace_key(struct btree_trans *trans, struct bbpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, pos.btree, pos.pos, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + u8 gen; + ret = k.k->type != KEY_TYPE_set + ? 
bch2_check_discard_freespace_key(trans, &iter, &gen, false) + : 0; + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void check_discard_freespace_key_work(struct work_struct *work) +{ + struct check_discard_freespace_key_async *w = + container_of(work, struct check_discard_freespace_key_async, work); + + bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); + bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + kfree(w); +} + +int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_iter *iter, u8 *gen, + bool async_repair) { struct bch_fs *c = trans->c; - struct btree_iter alloc_iter; - struct bkey_s_c alloc_k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - u64 genbits; - struct bpos pos; enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard ? BCH_DATA_need_discard : BCH_DATA_free; struct printbuf buf = PRINTBUF; - int ret; - pos = iter->pos; - pos.offset &= ~(~0ULL << 56); - genbits = iter->pos.offset & (~0ULL << 56); + struct bpos bucket = iter->pos; + bucket.offset &= ~(~0ULL << 56); + u64 genbits = iter->pos.offset & (~0ULL << 56); - alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); - ret = bkey_err(alloc_k); + struct btree_iter alloc_iter; + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, BTREE_ITER_cached); + int ret = bkey_err(alloc_k); if (ret) return ret; - if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), - trans, need_discard_freespace_key_to_invalid_dev_bucket, - "entry in %s btree for nonexistant dev:bucket %llu:%llu", - bch2_btree_id_str(iter->btree_id), pos.inode, pos.offset)) - goto delete; + if (!bch2_dev_bucket_exists(c, bucket)) { + if (fsck_err(trans, need_discard_freespace_key_to_invalid_dev_bucket, + "entry in %s btree for nonexistent dev:bucket %llu:%llu", + bch2_btree_id_str(iter->btree_id), bucket.inode, bucket.offset)) + goto delete; + ret = 1; + goto out; + } - a = bch2_alloc_to_v4(alloc_k, &a_convert); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + + if (a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a))) { + if (fsck_err(trans, need_discard_freespace_key_bad, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_id_str(iter->btree_id), + iter->pos.inode, + iter->pos.offset, + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) + goto delete; + ret = 1; + goto out; + } - if (fsck_err_on(a->data_type != state || - (state == BCH_DATA_free && - genbits != alloc_freespace_genbits(*a)), - trans, need_discard_freespace_key_bad, - "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), - bch2_btree_id_str(iter->btree_id), - iter->pos.inode, - iter->pos.offset, - a->data_type == state, - genbits >> 56, alloc_freespace_genbits(*a) >> 56)) - goto delete; + *gen = a->gen; out: fsck_err: bch2_set_btree_iter_dontneed(&alloc_iter); @@ -1420,11 +1443,40 @@ static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_tran printbuf_exit(&buf); return ret; delete: - ret = bch2_btree_delete_extent_at(trans, iter, - iter->btree_id == BTREE_ID_freespace ? 
1 : 0, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc); - goto out; + if (!async_repair) { + ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_commit; + goto out; + } else { + /* + * We can't repair here when called from the allocator path: the + * commit will recurse back into the allocator + */ + struct check_discard_freespace_key_async *w = + kzalloc(sizeof(*w), GFP_KERNEL); + if (!w) + goto out; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + kfree(w); + goto out; + } + + INIT_WORK(&w->work, check_discard_freespace_key_work); + w->c = c; + w->pos = BBPOS(iter->btree_id, iter->pos); + queue_work(c->write_ref_wq, &w->work); + goto out; + } +} + +static int bch2_check_discard_freespace_key_fsck(struct btree_trans *trans, struct btree_iter *iter) +{ + u8 gen; + int ret = bch2_check_discard_freespace_key(trans, iter, &gen, false); + return ret < 0 ? ret : 0; } /* @@ -1581,7 +1633,7 @@ int bch2_check_alloc_info(struct bch_fs *c) ret = for_each_btree_key(trans, iter, BTREE_ID_need_discard, POS_MIN, BTREE_ITER_prefetch, k, - bch2_check_discard_freespace_key(trans, &iter)); + bch2_check_discard_freespace_key_fsck(trans, &iter)); if (ret) goto err; @@ -1594,7 +1646,7 @@ int bch2_check_alloc_info(struct bch_fs *c) break; ret = bkey_err(k) ?: - bch2_check_discard_freespace_key(trans, &iter); + bch2_check_discard_freespace_key_fsck(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; @@ -1757,7 +1809,8 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca, struct btree_iter *need_discard_iter, struct bpos *discard_pos_done, - struct discard_buckets_state *s) + struct discard_buckets_state *s, + bool fastpath) { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; @@ -1793,44 +1846,23 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, if (ret) goto out; - if (bch2_bucket_sectors_total(a->v)) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "attempting to discard bucket with dirty data\n%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; - goto out; - } - if (a->v.data_type != BCH_DATA_need_discard) { - if (data_type_is_empty(a->v.data_type) && - BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { - a->v.gen++; - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - goto write; + if (need_discard_or_freespace_err(trans, k, true, true, true)) { + ret = bch2_btree_bit_mod_iter(trans, need_discard_iter, false); + if (ret) + goto out; + goto commit; } - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "bucket incorrectly set in need_discard btree\n" - "%s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; goto out; } - if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { - if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info, - trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s", - a->v.journal_seq, - c->journal.flushed_seq_ondisk, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = -EIO; - goto out; - } - - if (discard_in_flight_add(ca, iter.pos.offset, true)) - goto out; + if (!fastpath) { + if (discard_in_flight_add(ca, iter.pos.offset, true)) + goto out; - discard_locked = true; + discard_locked = true; + } if (!bkey_eq(*discard_pos_done, iter.pos) && 
ca->mi.discard && !c->opts.nochanges) { @@ -1844,6 +1876,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, ca->mi.bucket_size, GFP_KERNEL); *discard_pos_done = iter.pos; + s->discarded++; ret = bch2_trans_relock_notrace(trans); if (ret) @@ -1851,22 +1884,25 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, } SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); -write: alloc_data_type_set(&a->v, a->v.data_type); - ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto out; +commit: + ret = bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc); if (ret) goto out; count_event(c, bucket_discard); - s->discarded++; out: +fsck_err: if (discard_locked) discard_in_flight_remove(ca, iter.pos.offset); - s->seen++; + if (!ret) + s->seen++; bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; @@ -1886,11 +1922,11 @@ static void bch2_do_discards_work(struct work_struct *work) * successful commit: */ ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, + for_each_btree_key_max(trans, iter, BTREE_ID_need_discard, POS(ca->dev_idx, 0), POS(ca->dev_idx, U64_MAX), 0, k, - bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s))); + bch2_discard_one_bucket(trans, ca, &iter, &discard_pos_done, &s, false))); trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); @@ -1923,27 +1959,29 @@ void bch2_do_discards(struct bch_fs *c) bch2_dev_do_discards(ca); } -static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +static int bch2_do_discards_fast_one(struct btree_trans *trans, + struct bch_dev *ca, + u64 bucket, + struct bpos *discard_pos_done, + struct discard_buckets_state *s) { - struct btree_iter iter; - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); - int ret = bkey_err(k); + struct btree_iter need_discard_iter; + struct bkey_s_c discard_k = bch2_bkey_get_iter(trans, &need_discard_iter, + BTREE_ID_need_discard, POS(ca->dev_idx, bucket), 0); + int ret = bkey_err(discard_k); if (ret) - goto err; - - struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); - ret = PTR_ERR_OR_ZERO(a); - if (ret) - goto err; + return ret; - BUG_ON(a->v.dirty_sectors); - SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); - alloc_data_type_set(&a->v, a->v.data_type); + if (log_fsck_err_on(discard_k.k->type != KEY_TYPE_set, + trans, discarding_bucket_not_in_need_discard_btree, + "attempting to discard bucket %u:%llu not in need_discard btree", + ca->dev_idx, bucket)) + goto out; - ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -err: - bch2_trans_iter_exit(trans, &iter); + ret = bch2_discard_one_bucket(trans, ca, &need_discard_iter, discard_pos_done, s, true); +out: +fsck_err: + bch2_trans_iter_exit(trans, &need_discard_iter); return ret; } @@ -1951,6 +1989,10 @@ static void bch2_do_discards_fast_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, discard_fast_work); struct bch_fs *c = ca->fs; + struct discard_buckets_state s = {}; + struct bpos discard_pos_done = POS_MAX; + struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; while (1) { bool got_bucket = false; @@ -1971,16 +2013,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) if (!got_bucket) break; - if 
(ca->mi.discard && !c->opts.nochanges) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, - GFP_KERNEL); - - int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc, - bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket))); + ret = lockrestart_do(trans, + bch2_do_discards_fast_one(trans, ca, bucket, &discard_pos_done, &s)); bch_err_fn(c, ret); discard_in_flight_remove(ca, bucket); @@ -1989,6 +2023,9 @@ static void bch2_do_discards_fast_work(struct work_struct *work) break; } + trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); + + bch2_trans_put(trans); percpu_ref_put(&ca->io_ref); bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); } @@ -2030,8 +2067,11 @@ static int invalidate_one_bucket(struct btree_trans *trans, return 1; if (!bch2_dev_bucket_exists(c, bucket)) { - prt_str(&buf, "lru entry points to invalid bucket"); - goto err; + if (fsck_err(trans, lru_entry_to_invalid_bucket, + "lru key points to nonexistent device:bucket %llu:%llu", + bucket.inode, bucket.offset)) + return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + goto out; } if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) @@ -2072,28 +2112,9 @@ static int invalidate_one_bucket(struct btree_trans *trans, trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); --*nr_to_invalidate; out: +fsck_err: printbuf_exit(&buf); return ret; -err: - prt_str(&buf, "\n lru key: "); - bch2_bkey_val_to_text(&buf, c, lru_k); - - prt_str(&buf, "\n lru entry: "); - bch2_lru_pos_to_text(&buf, lru_iter->pos); - - prt_str(&buf, "\n alloc key: "); - if (!a) - bch2_bpos_to_text(&buf, bucket); - else - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); - - bch_err(c, "%s", buf.buf); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { - bch2_inconsistent_error(c); - ret = -EINVAL; - } - - goto out; } static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter *iter, @@ -2101,7 +2122,7 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter { struct bkey_s_c k; again: - k = bch2_btree_iter_peek_upto(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 163a67b97a409..de25ba4ee94b5 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -8,8 +8,6 @@ #include "debug.h" #include "super.h" -enum bch_validate_flags; - /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U @@ -245,10 +243,14 @@ struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, + struct 
bkey_validate_context); +int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -282,7 +284,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); }) int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ @@ -307,6 +309,8 @@ int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); + +int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h index befdaa95c515b..740238369a5a2 100644 --- a/fs/bcachefs/alloc_background_format.h +++ b/fs/bcachefs/alloc_background_format.h @@ -58,7 +58,7 @@ LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) struct bch_alloc_v4 { struct bch_val v; - __u64 journal_seq; + __u64 journal_seq_nonempty; __u32 flags; __u8 gen; __u8 oldest_gen; @@ -70,7 +70,7 @@ struct bch_alloc_v4 { __u32 stripe; __u32 nr_external_backpointers; /* end of fields in original version of alloc_v4 */ - __u64 _fragmentation_lru; /* obsolete */ + __u64 journal_seq_empty; __u32 stripe_sectors; __u32 pad; } __packed __aligned(8); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 372178c8d4168..066a2d510d7cc 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -107,14 +107,10 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) return; } - percpu_down_read(&c->mark_lock); spin_lock(&ob->lock); - ob->valid = false; ob->data_type = 0; - spin_unlock(&ob->lock); - percpu_up_read(&c->mark_lock); spin_lock(&c->freelist_lock); bch2_open_bucket_hash_remove(c, ob); @@ -156,6 +152,14 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) return ob; } +static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) +{ + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + return false; + + return bch2_is_superblock_bucket(ca, b); +} + static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) { BUG_ON(c->open_buckets_partial_nr >= @@ -175,20 +179,6 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -/* _only_ for allocating the journal on a new device: */ -long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -{ - while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { - u64 b = ca->new_fs_bucket_idx++; - - if (!is_superblock_bucket(ca, b) && - (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) - return b; - } - - return -1; -} - static inline unsigned open_buckets_reserved(enum bch_watermark watermark) { switch (watermark) { @@ -206,33 +196,40 @@ static inline unsigned open_buckets_reserved(enum bch_watermark watermark) } } -static struct open_bucket 
*__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, - u64 bucket, - enum bch_watermark watermark, - const struct bch_alloc_v4 *a, - struct bucket_alloc_state *s, - struct closure *cl) +static inline bool may_alloc_bucket(struct bch_fs *c, + struct bpos bucket, + struct bucket_alloc_state *s) { - struct open_bucket *ob; - - if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - s->skipped_nouse++; - return NULL; - } - - if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { s->skipped_open++; - return NULL; + return false; } if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, - c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { + c->journal.flushed_seq_ondisk, bucket.inode, bucket.offset)) { s->skipped_need_journal_commit++; - return NULL; + return false; } - if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { s->skipped_nocow++; + return false; + } + + return true; +} + +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 bucket, u8 gen, + enum bch_watermark watermark, + struct bucket_alloc_state *s, + struct closure *cl) +{ + if (unlikely(is_superblock_bucket(c, ca, bucket))) + return NULL; + + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + s->skipped_nouse++; return NULL; } @@ -254,14 +251,13 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } - ob = bch2_open_bucket_alloc(c); + struct open_bucket *ob = bch2_open_bucket_alloc(c); spin_lock(&ob->lock); - ob->valid = true; ob->sectors_free = ca->mi.bucket_size; ob->dev = ca->dev_idx; - ob->gen = a->gen; + ob->gen = gen; ob->bucket = bucket; spin_unlock(&ob->lock); @@ -276,111 +272,29 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * } static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, u64 free_entry, + enum bch_watermark watermark, struct bucket_alloc_state *s, - struct bkey_s_c freespace_k, + struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct bkey_s_c k; - struct open_bucket *ob; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; - u64 b = free_entry & ~(~0ULL << 56); - unsigned genbits = free_entry >> 56; - struct printbuf buf = PRINTBUF; - int ret; - - if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { - prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" - " freespace key ", - ca->mi.first_bucket, ca->mi.nbuckets); - bch2_bkey_val_to_text(&buf, c, freespace_k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } + u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - k = bch2_bkey_get_iter(trans, &iter, - BTREE_ID_alloc, POS(ca->dev_idx, b), - BTREE_ITER_cached); - ret = bkey_err(k); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } - - a = bch2_alloc_to_v4(k, &a_convert); - - if (a->data_type != BCH_DATA_free) { - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { - ob = NULL; - goto err; - } - - prt_printf(&buf, "non free bucket in freespace btree\n" - " freespace key "); - bch2_bkey_val_to_text(&buf, c, freespace_k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - 
goto err; - } - - if (genbits != (alloc_freespace_genbits(*a) >> 56) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { - prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" - " freespace key ", - genbits, alloc_freespace_genbits(*a) >> 56); - bch2_bkey_val_to_text(&buf, c, freespace_k); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - bch2_trans_inconsistent(trans, "%s", buf.buf); - ob = ERR_PTR(-EIO); - goto err; - } - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { - struct bch_backpointer bp; - struct bpos bp_pos = POS_MIN; - - ret = bch2_get_next_backpointer(trans, ca, POS(ca->dev_idx, b), -1, - &bp_pos, &bp, - BTREE_ITER_nopreserve); - if (ret) { - ob = ERR_PTR(ret); - goto err; - } + if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + return NULL; - if (!bkey_eq(bp_pos, POS_MAX)) { - /* - * Bucket may have data in it - we don't call - * bc2h_trans_inconnsistent() because fsck hasn't - * finished yet - */ - ob = NULL; - goto err; - } - } + u8 gen; + int ret = bch2_check_discard_freespace_key(trans, freespace_iter, &gen, true); + if (ret < 0) + return ERR_PTR(ret); + if (ret) + return NULL; - ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); - if (!ob) - bch2_set_btree_iter_dontneed(&iter); -err: - if (iter.path) - bch2_set_btree_iter_dontneed(&iter); - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ob; + return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); } /* * This path is for before the freespace btree is initialized: - * - * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & - * journal buckets - journal buckets will be < ca->new_fs_bucket_idx */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, @@ -389,10 +303,11 @@ bch2_bucket_alloc_early(struct btree_trans *trans, struct bucket_alloc_state *s, struct closure *cl) { + struct bch_fs *c = trans->c; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; - u64 first_bucket = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 first_bucket = ca->mi.first_bucket; u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; @@ -415,10 +330,6 @@ bch2_bucket_alloc_early(struct btree_trans *trans, if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (ca->new_fs_bucket_idx && - is_superblock_bucket(ca, k.k->p.offset)) - continue; - if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -452,7 +363,10 @@ bch2_bucket_alloc_early(struct btree_trans *trans, s->buckets_seen++; - ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); + ob = may_alloc_bucket(c, k.k->p, s) + ? 
__try_alloc_bucket(c, ca, k.k->p.offset, a->gen, + watermark, s, cl) + : NULL; next: bch2_set_btree_iter_dontneed(&citer); bch2_trans_iter_exit(trans, &citer); @@ -489,20 +403,21 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; - - BUG_ON(ca->new_fs_bucket_idx); again: - for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, - POS(ca->dev_idx, alloc_cursor), 0, k, ret) { - if (k.k->p.inode != ca->dev_idx) - break; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, alloc_cursor), + POS(ca->dev_idx, U64_MAX), + 0, k, ret) { + /* + * peek normally doesn't trim extents - they can span iter.pos, + * which is not what we want here: + */ + iter.k.size = iter.k.p.offset - iter.pos.offset; - for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); - alloc_cursor < k.k->p.offset; - alloc_cursor++) { + while (iter.k.size) { s->buckets_seen++; - u64 bucket = alloc_cursor & ~(~0ULL << 56); + u64 bucket = iter.pos.offset & ~(~0ULL << 56); if (s->btree_bitmap != BTREE_BITMAP_ANY && s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { @@ -511,32 +426,36 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, goto fail; bucket = sector_to_bucket(ca, - round_up(bucket_to_sector(ca, bucket) + 1, + round_up(bucket_to_sector(ca, bucket + 1), 1ULL << ca->mi.btree_bitmap_shift)); - u64 genbits = alloc_cursor >> 56; - alloc_cursor = bucket | (genbits << 56); + alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - if (alloc_cursor > k.k->p.offset) - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); + bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); s->skipped_mi_btree_bitmap++; - continue; + goto next; } - ob = try_alloc_bucket(trans, ca, watermark, - alloc_cursor, s, k, cl); + ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); if (ob) { + if (!IS_ERR(ob)) + *dev_alloc_cursor = iter.pos.offset; bch2_set_btree_iter_dontneed(&iter); break; } - } + iter.k.size--; + iter.pos.offset++; + } +next: if (ob || ret) break; } fail: bch2_trans_iter_exit(trans, &iter); - if (!ob && ret) + BUG_ON(ob && ret); + + if (ret) ob = ERR_PTR(ret); if (!ob && alloc_start > ca->mi.first_bucket) { @@ -544,8 +463,6 @@ static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, goto again; } - *dev_alloc_cursor = alloc_cursor; - return ob; } @@ -595,6 +512,7 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, * @watermark: how important is this allocation? * @data_type: BCH_DATA_journal, btree, user... * @cl: if not NULL, closure to be used to wait if buckets not available + * @nowait: if true, do not wait for buckets to become available * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. 
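
A note on the freespace-btree encoding the allocator loops above keep masking: the key's offset packs the bucket number into the low 56 bits and the allocation generation bits into the high 8 bits, which is what the repeated ~(~0ULL << 56) arithmetic is unpacking. A minimal sketch of that layout, using illustrative helper names (the in-tree helpers are alloc_freespace_pos() and alloc_freespace_genbits()):

	/* Sketch only - assumes kernel types; these helper names are made up. */
	#define FREESPACE_GENBITS_SHIFT	56

	static inline u64 freespace_bucket(u64 offset)
	{
		/* bucket number lives in the low 56 bits */
		return offset & ~(~0ULL << FREESPACE_GENBITS_SHIFT);
	}

	static inline u64 freespace_genbits(u64 offset)
	{
		/* generation bits live in the high 8 bits, kept shifted */
		return offset & (~0ULL << FREESPACE_GENBITS_SHIFT);
	}

	static inline u64 freespace_offset(u64 bucket, u64 genbits)
	{
		/* genbits is already in the high bits, so a plain OR suffices */
		return bucket | genbits;
	}

This matches the cursor arithmetic in bch2_bucket_alloc_freelist(), where the bucket portion of the cursor is advanced while the genbits portion is preserved.
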
@@ -629,6 +547,10 @@ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, bch2_dev_do_invalidates(ca); if (!avail) { + if (watermark > BCH_WATERMARK_normal && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + goto alloc; + if (cl && !waiting) { closure_wait(&c->freelist_wait, cl); waiting = true; @@ -711,9 +633,9 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.devs[ret.nr++] = i; + ret.data[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + bubble_sort(ret.data, ret.nr, dev_stripe_cmp); return ret; } @@ -785,18 +707,13 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, stripe, devs_may_alloc); int ret = -BCH_ERR_insufficient_devices; BUG_ON(*nr_effective >= nr_replicas); - for (unsigned i = 0; i < devs_sorted.nr; i++) { - struct bch_dev_usage usage; - struct open_bucket *ob; - - unsigned dev = devs_sorted.devs[i]; - struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) { + struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); if (!ca) continue; @@ -805,8 +722,9 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + struct bch_dev_usage usage; + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, + cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); if (!IS_ERR(ob)) bch2_dev_stripe_increment_inlined(ca, stripe, &usage); bch2_dev_put(ca); @@ -850,10 +768,6 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, struct closure *cl) { struct bch_fs *c = trans->c; - struct dev_alloc_list devs_sorted; - struct ec_stripe_head *h; - struct open_bucket *ob; - unsigned i, ec_idx; int ret = 0; if (nr_replicas < 2) @@ -862,34 +776,32 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (ec_open_bucket(c, ptrs)) return 0; - h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + struct ec_stripe_head *h = + bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); - - for (i = 0; i < devs_sorted.nr; i++) - for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + darray_for_each(devs_sorted, i) + for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; - ob = c->open_buckets + h->s->blocks[ec_idx]; - if (ob->dev == devs_sorted.devs[i] && - !test_and_set_bit(ec_idx, h->s->blocks_allocated)) - goto got_bucket; + struct open_bucket *ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == *i && !test_and_set_bit(ec_idx, h->s->blocks_allocated)) { + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, ob); + goto out; + } } - goto out_put_head; -got_bucket: - ob->ec_idx = ec_idx; - ob->ec = h->s; - ec_stripe_new_get(h->s, STRIPE_REF_io); - - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); 
-out_put_head: +out: bch2_ec_stripe_head_put(c, h); return ret; } @@ -1566,7 +1478,7 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c) mutex_init(&c->write_points_hash_lock); c->write_points_nr = ARRAY_SIZE(c->write_points); - /* open bucket 0 is a sentinal NULL: */ + /* open bucket 0 is a sentinel NULL: */ spin_lock_init(&c->open_buckets[0].lock); for (ob = c->open_buckets + 1; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 1a16fd5bd4f89..f25481a0d1a06 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -20,7 +20,7 @@ void bch2_reset_alloc_cursors(struct bch_fs *); struct dev_alloc_list { unsigned nr; - u8 devs[BCH_SB_MEMBERS_MAX]; + u8 data[BCH_SB_MEMBERS_MAX]; }; struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, @@ -28,8 +28,6 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct bch_devs_mask *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -long bch2_bucket_alloc_new_fs(struct bch_dev *); - static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) { return bch2_dev_have_ref(c, ob->dev); diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index 654a58132a4db..ebeb6a5ff9d26 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -14,42 +14,8 @@ #include -static bool extent_matches_bp(struct bch_fs *c, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, - struct bpos bucket, - struct bch_backpointer bp) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - rcu_read_lock(); - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket2; - struct bch_backpointer bp2; - - if (p.ptr.cached) - continue; - - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); - if (!ca) - continue; - - bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, &bucket2, &bp2); - if (bpos_eq(bucket, bucket2) && - !memcmp(&bp, &bp2, sizeof(bp))) { - rcu_read_unlock(); - return true; - } - } - rcu_read_unlock(); - - return false; -} - int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); int ret = 0; @@ -59,67 +25,70 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, "backpointer level bad: %u >= %u", bp.v->level, BTREE_MAX_DEPTH); - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (!ca) { - /* these will be caught by fsck */ - rcu_read_unlock(); - return 0; - } - - struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p); - struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset); - rcu_read_unlock(); - - bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size || - !bpos_eq(bp.k->p, bp_pos), - c, backpointer_bucket_offset_wrong, - "backpointer bucket_offset wrong"); + bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, + c, backpointer_dev_bad, + "backpointer for BCH_SB_MEMBER_INVALID"); fsck_err: return ret; } -void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { - prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", - bch2_btree_id_str(bp->btree_id), - bp->level, - (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), - (u32) 
bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), - bp->bucket_len); - bch2_bpos_to_text(out, bp->pos); -} + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); -void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -{ rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (ca) { - struct bpos bucket = bp_pos_to_bucket(ca, k.k->p); + u32 bucket_offset; + struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); - prt_str(out, "bucket="); - bch2_bpos_to_text(out, bucket); - prt_str(out, " "); + prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); + prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } - bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); + bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_printf(out, " suboffset=%u len=%u gen=%u pos=", + (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp.v->bucket_len, + bp.v->bucket_gen); + bch2_bpos_to_text(out, bp.v->pos); } void bch2_backpointer_swab(struct bkey_s k) { struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); - bp.v->bucket_offset = swab40(bp.v->bucket_offset); bp.v->bucket_len = swab32(bp.v->bucket_len); bch2_bpos_swab(&bp.v->pos); } +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bkey_s_c_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); + + if (bpos_eq(bp.k->p, bp2.k.p) && + !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) + return true; + } + + return false; +} + static noinline int backpointer_mod_err(struct btree_trans *trans, - struct bch_backpointer bp, - struct bkey_s_c bp_k, struct bkey_s_c orig_k, + struct bkey_i_backpointer *new_bp, + struct bkey_s_c found_bp, bool insert) { struct bch_fs *c = trans->c; @@ -127,12 +96,12 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, if (insert) { prt_printf(&buf, "existing backpointer found when inserting "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); printbuf_indent_add(&buf, 2); prt_printf(&buf, "found "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -144,11 +113,11 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); prt_printf(&buf, "got "); - bch2_bkey_val_to_text(&buf, c, bp_k); + bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); @@ -167,161 +136,118 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { struct btree_iter bp_iter; - struct bkey_s_c k; - struct bkey_i_backpointer 
*bp_k; - int ret; - - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; - - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - bp_k->v = bp; - - if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); - } - - k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bp_k->k.p, + struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp->k.p, BTREE_ITER_intent| BTREE_ITER_slots| BTREE_ITER_with_updates); - ret = bkey_err(k); + int ret = bkey_err(k); if (ret) - goto err; + return ret; if (insert ? k.k->type : (k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { - ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { + ret = backpointer_mod_err(trans, orig_k, bp, k, insert); if (ret) goto err; } - ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); + if (!insert) { + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); + } + + ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); err: bch2_trans_iter_exit(trans, &bp_iter); return ret; } -/* - * Find the next backpointer >= *bp_offset: - */ -int bch2_get_next_backpointer(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, int gen, - struct bpos *bp_pos, - struct bch_backpointer *bp, - unsigned iter_flags) +static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - struct bpos bp_end_pos = bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0); - struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; - struct bkey_s_c k; - int ret = 0; - - if (bpos_ge(*bp_pos, bp_end_pos)) - goto done; - - if (gen >= 0) { - k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, - bucket, BTREE_ITER_cached|iter_flags); - ret = bkey_err(k); - if (ret) - goto out; - - if (k.k->type != KEY_TYPE_alloc_v4 || - bkey_s_c_to_alloc_v4(k).v->gen != gen) - goto done; - } - - *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(ca, bucket, 0)); - - for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, - *bp_pos, iter_flags, k, ret) { - if (bpos_ge(k.k->p, bp_end_pos)) - break; + return (likely(!bch2_backpointers_no_use_write_buffer) + ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) + : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); +} - *bp_pos = k.k->p; - *bp = *bkey_s_c_to_backpointer(k).v; - goto out; - } -done: - *bp_pos = SPOS_MAX; -out: - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; +static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, + struct bkey_s_c visiting_k, + struct bkey_buf *last_flushed) +{ + return likely(!bch2_backpointers_no_use_write_buffer) + ? 
bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) + : 0; } -static void backpointer_not_found(struct btree_trans *trans, - struct bpos bp_pos, - struct bch_backpointer bp, - struct bkey_s_c k) +static int backpointer_target_not_found(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct bkey_s_c target_k, + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + int ret = 0; /* * If we're using the btree write buffer, the backpointer we were * looking at may have already been deleted - failure to find what it * pointed to is not an error: */ - if (likely(!bch2_backpointers_no_use_write_buffer)) - return; - - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) - return; + ret = last_flushed + ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) + : 0; + if (ret) + return ret; prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", - bp.level ? "btree node" : "extent"); - prt_printf(&buf, "bucket: "); - bch2_bpos_to_text(&buf, bucket); - prt_printf(&buf, "\n "); + bp.v->level ? "btree node" : "extent"); + bch2_bkey_val_to_text(&buf, c, bp.s_c); - prt_printf(&buf, "backpointer pos: "); - bch2_bpos_to_text(&buf, bp_pos); prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, target_k); - bch2_backpointer_to_text(&buf, &bp); - prt_printf(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, k); - if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) - bch_err_ratelimited(c, "%s", buf.buf); - else - bch2_trans_inconsistent(trans, "%s", buf.buf); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) + if (p.ptr.dev == bp.k->p.inode) { + prt_printf(&buf, "\n "); + struct bkey_i_backpointer bp2; + bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); + } + if (fsck_err(trans, backpointer_to_missing_ptr, + "%s", buf.buf)) + ret = bch2_backpointer_del(trans, bp.k->p); +fsck_err: printbuf_exit(&buf); + return ret; } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp, - unsigned iter_flags) + unsigned iter_flags, + struct bkey_buf *last_flushed) { - if (likely(!bp.level)) { - struct bch_fs *c = trans->c; + struct bch_fs *c = trans->c; - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) - return bkey_s_c_err(-EIO); + if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) + return bkey_s_c_null; + if (likely(!bp.v->level)) { bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, 0, iter_flags); struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); @@ -330,67 +256,64 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, return k; } - if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; bch2_trans_iter_exit(trans, iter); - backpointer_not_found(trans, bp_pos, bp, k); - return bkey_s_c_null; + int ret = backpointer_target_not_found(trans, bp, k, last_flushed); + return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; } else { - struct btree *b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); + if (IS_ERR_OR_NULL(b)) + return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - if (IS_ERR_OR_NULL(b)) { - bch2_trans_iter_exit(trans, iter); - return IS_ERR(b) ? bkey_s_c_err(PTR_ERR(b)) : bkey_s_c_null; - } return bkey_i_to_s_c(&b->key); } } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, struct btree_iter *iter, - struct bpos bp_pos, - struct bch_backpointer bp) + struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - BUG_ON(!bp.level); - - struct bpos bucket; - if (!bp_pos_to_bucket_nodev(c, bp_pos, &bucket)) - return ERR_PTR(-EIO); + BUG_ON(!bp.v->level); bch2_trans_node_iter_init(trans, iter, - bp.btree_id, - bp.pos, + bp.v->btree_id, + bp.v->pos, 0, - bp.level - 1, + bp.v->level - 1, 0); struct btree *b = bch2_btree_iter_peek_node(iter); if (IS_ERR_OR_NULL(b)) goto err; - BUG_ON(b->c.level != bp.level - 1); + BUG_ON(b->c.level != bp.v->level - 1); - if (extent_matches_bp(c, bp.btree_id, bp.level, - bkey_i_to_s_c(&b->key), - bucket, bp)) + if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, + bkey_i_to_s_c(&b->key), bp)) return b; if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - backpointer_not_found(trans, bp_pos, bp, bkey_i_to_s_c(&b->key)); - b = NULL; + int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); + b = ret ? ERR_PTR(ret) : NULL; } err: bch2_trans_iter_exit(trans, iter); return b; } -static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, - struct bkey_s_c k) +static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, + struct bkey_buf *last_flushed) { + if (k.k->type != KEY_TYPE_backpointer) + return 0; + struct bch_fs *c = trans->c; struct btree_iter alloc_iter = { NULL }; struct bkey_s_c alloc_k; @@ -399,10 +322,14 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, bp_iter, 0); + ret = bch2_backpointer_del(trans, k.k->p); goto out; } @@ -411,13 +338,16 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ if (ret) goto out; - if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, - trans, backpointer_to_missing_alloc, - "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", - alloc_iter.pos.inode, alloc_iter.pos.offset, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = bch2_btree_delete_at(trans, bp_iter, 0); - goto out; + if (alloc_k.k->type != KEY_TYPE_alloc_v4) { + ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); + if (ret) + goto out; + + if (fsck_err(trans, backpointer_to_missing_alloc, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_backpointer_del(trans, k.k->p); } out: fsck_err: @@ -429,18 +359,24 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_ /* verify that every 
backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_btree_backpointer(trans, &iter, k))); + bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } struct extents_to_bp_state { - struct bpos bucket_start; - struct bpos bucket_end; + struct bpos bp_start; + struct bpos bp_end; struct bkey_buf last_flushed; }; @@ -501,9 +437,13 @@ static int check_extent_checksum(struct btree_trans *trans, goto err; prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); - prt_printf(&buf, "\n %s ", bch2_btree_id_str(btree)); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, btree); + prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent); - prt_printf(&buf, "\n %s ", bch2_btree_id_str(o_btree)); + prt_printf(&buf, "\n "); + bch2_btree_id_to_text(&buf, o_btree); + prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent2); struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); @@ -524,41 +464,25 @@ static int check_extent_checksum(struct btree_trans *trans, static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, - struct bpos bucket, - struct bch_backpointer bp, + struct bkey_i_backpointer *bp, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = {}; struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; - struct bkey_s_c bp_k; - int ret = 0; - struct bch_dev *ca = bch2_dev_bucket_tryget(c, bucket); - if (!ca) { - prt_str(&buf, "extent for nonexistent device:bucket "); - bch2_bpos_to_text(&buf, bucket); - prt_str(&buf, "\n "); - bch2_bkey_val_to_text(&buf, c, orig_k); - bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; - } - - if (bpos_lt(bucket, s->bucket_start) || - bpos_gt(bucket, s->bucket_end)) - goto out; + if (bpos_lt(bp->k.p, s->bp_start) || + bpos_gt(bp->k.p, s->bp_end)) + return 0; - bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp(ca, bucket, bp.bucket_offset), - 0); - ret = bkey_err(bp_k); + struct btree_iter bp_iter; + struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); + int ret = bkey_err(bp_k); if (ret) goto err; if (bp_k.k->type != KEY_TYPE_backpointer || - memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); if (ret) goto err; @@ -570,7 +494,6 @@ static int check_bp_exists(struct btree_trans *trans, fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); - bch2_dev_put(ca); printbuf_exit(&buf); return ret; check_existing_bp: @@ -578,10 +501,10 @@ static int check_bp_exists(struct btree_trans *trans, if (bp_k.k->type != KEY_TYPE_backpointer) goto missing; - struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v; + struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0); + bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); ret = 
bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -600,19 +523,23 @@ static int check_bp_exists(struct btree_trans *trans, bch_err(c, "%s", buf.buf); if (other_extent.k->size <= orig_k.k->size) { - ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode); + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); if (ret) goto err; goto out; } else { - ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode); + ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); if (ret) goto err; goto missing; } } - ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode); + ret = check_extent_checksum(trans, + other_bp.v->btree_id, other_extent, + bp->v.btree_id, orig_k, + bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -620,7 +547,8 @@ static int check_bp_exists(struct btree_trans *trans, goto missing; } - ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode); + ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, + other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret < 0) goto err; if (ret) { @@ -629,7 +557,7 @@ static int check_bp_exists(struct btree_trans *trans, } printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bucket.inode); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); prt_str(&buf, "\n "); bch2_bkey_val_to_text(&buf, c, other_extent); @@ -638,21 +566,15 @@ static int check_bp_exists(struct btree_trans *trans, goto err; missing: printbuf_reset(&buf); - prt_printf(&buf, "missing backpointer for btree=%s l=%u ", - bch2_btree_id_str(bp.btree_id), bp.level); + prt_str(&buf, "missing backpointer\n for: "); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\n got: "); + prt_printf(&buf, "\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); + prt_printf(&buf, "\n got: "); bch2_bkey_val_to_text(&buf, c, bp_k); - struct bkey_i_backpointer n_bp_k; - bkey_backpointer_init(&n_bp_k.k_i); - n_bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - n_bp_k.v = bp; - prt_printf(&buf, "\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i)); - if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) - ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, orig_k, true); + ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); goto out; } @@ -663,31 +585,33 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - int ret; - ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - struct bpos bucket_pos = POS_MIN; - struct bch_backpointer bp; - if (p.ptr.cached) continue; + if (p.ptr.dev == BCH_SB_MEMBER_INVALID) + continue; + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (ca) - bch2_extent_ptr_to_bp(c, ca, btree, level, k, p, entry, &bucket_pos, &bp); + bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); + bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); rcu_read_unlock(); - if (!ca) - continue; + if (check || empty) { + struct 
bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); - ret = check_bp_exists(trans, s, bucket_pos, bp, k); - if (ret) - return ret; + int ret = check + ? check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; + } } return 0; @@ -896,54 +820,330 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } +enum alloc_sector_counter { + ALLOC_dirty, + ALLOC_cached, + ALLOC_stripe, + ALLOC_SECTORS_NR +}; + +static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +{ + switch (t) { + case BCH_DATA_btree: + case BCH_DATA_user: + return ALLOC_dirty; + case BCH_DATA_cached: + return ALLOC_cached; + case BCH_DATA_stripe: + return ALLOC_stripe; + default: + BUG(); + } +} + +static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); + +static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); + bool need_commit = false; + + if (a->data_type == BCH_DATA_sb || + a->data_type == BCH_DATA_journal || + a->data_type == BCH_DATA_parity) + return 0; + + u32 sectors[ALLOC_SECTORS_NR]; + memset(sectors, 0, sizeof(sectors)); + + struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); + if (!ca) + return 0; + + struct btree_iter iter; + struct bkey_s_c bp_k; + int ret = 0; + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, alloc_k.k->p), + bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && + (bp.v->bucket_gen != a->gen || + bp.v->pad)) { + ret = bch2_backpointer_del(trans, bp_k.k->p); + if (ret) + break; + + need_commit = true; + continue; + } + + if (bp.v->bucket_gen != a->gen) + continue; + + sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + }; + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (need_commit) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + if (ret) + goto err; + } + + /* Cached pointers don't have backpointers: */ + + if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_stripe] != a->stripe_sectors) { + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { + ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); + if (ret) + goto err; + } + + if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_stripe] > a->stripe_sectors) { + ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: + -BCH_ERR_transaction_restart_nested; + goto err; + } + + if (!sectors[ALLOC_dirty] && + !sectors[ALLOC_stripe]) + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); + else + __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + } +err: + bch2_dev_put(ca); + return ret; +} + +static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr_v2: { + bool ret = false; + + rcu_read_lock(); + struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; + while (pos.inode <= k.k->p.inode) { + if 
(pos.inode >= c->sb.nr_devices) + break; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); + if (!ca) + goto next; + + struct bpos bucket = bp_pos_to_bucket(ca, pos); + bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, + ca->mi.nbuckets, bucket.offset); + if (bucket.offset == ca->mi.nbuckets) + goto next; + + ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); + if (ret) + break; +next: + pos = SPOS(pos.inode + 1, 0, 0); + } + rcu_read_unlock(); + + return ret; + } + case KEY_TYPE_btree_ptr: + return true; + default: + return false; + } +} + +static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, + enum btree_id btree, unsigned level) +{ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + if (b) + bch2_node_pin(trans->c, b); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + struct bkey_buf tmp; + bch2_bkey_buf_init(&tmp); + + bch2_btree_cache_unpin(c); + + *end = SPOS_MAX; + + s64 mem_may_pin = mem_may_pin_bytes(c); + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); + })); + if (ret) + return ret; + + struct bpos pinned = SPOS_MAX; + mem_may_pin = mem_may_pin_bytes(c); + bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, + 0, 1, BTREE_ITER_prefetch); + ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + if (!backpointer_node_has_missing(c, k)) + continue; + + mem_may_pin -= c->opts.btree_node_size; + if (mem_may_pin <= 0) { + *end = pinned; + break; + } + + bch2_bkey_buf_reassemble(&tmp, c, k); + struct btree_path *path = btree_iter_path(trans, &iter); + + BUG_ON(path->level != 1); + + int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); + + if (!ret2) + pinned = tmp.k->k.p; + + ret; + })); + if (ret) + return ret; + + return ret; +} + int bch2_check_extents_to_backpointers(struct bch_fs *c) { + int ret = 0; + + /* + * Can't allow devices to come/go/resize while we have bucket bitmaps + * allocated + */ + lockdep_assert_held(&c->state_lock); + + for_each_member_device(c, ca) { + BUG_ON(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), + GFP_KERNEL); + if (!ca->bucket_backpointer_mismatches || + !ca->bucket_backpointer_empty) { + bch2_dev_put(ca); + ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + goto err_free_bitmaps; + } + } + struct btree_trans *trans = bch2_trans_get(c); - struct extents_to_bp_state s = { .bucket_start = POS_MIN }; - int ret; + struct extents_to_bp_state s = { .bp_start = POS_MIN }; bch2_bkey_buf_init(&s.last_flushed); 
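
A minimal userspace sketch of the per-bucket bitmap pattern used above: bucket_backpointer_mismatches and bucket_backpointer_empty are ordinary bitmaps sized BITS_TO_LONGS(nbuckets), marked one bit per bucket and later tallied for the scan summary. The helpers below are local stand-ins written for illustration, not the kernel's bitmap API.

#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG    (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

static void bitmap_set_bit(unsigned long *map, size_t nr)
{
        map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static size_t bitmap_weight_sketch(const unsigned long *map, size_t nbits)
{
        size_t nr = 0;

        for (size_t i = 0; i < BITS_TO_LONGS(nbits); i++)
                nr += (size_t) __builtin_popcountl(map[i]);
        return nr;
}

int main(void)
{
        size_t nbuckets = 1 << 16;      /* hypothetical device size */
        unsigned long *mismatches =
                calloc(BITS_TO_LONGS(nbuckets), sizeof(unsigned long));

        if (!mismatches)
                return 1;

        /* the alloc scan sets a bit for each bucket whose counters and
         * backpointers disagree: */
        bitmap_set_bit(mismatches, 7);
        bitmap_set_bit(mismatches, 4096);

        printf("scanning for missing backpointers in %zu/%zu buckets\n",
               bitmap_weight_sketch(mismatches, nbuckets), nbuckets);

        free(mismatches);
        return 0;
}
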
bkey_init(&s.last_flushed.k->k); + ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_prefetch, k, ({ + check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + })); + if (ret) + goto err; + + u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; + for_each_member_device(c, ca) { + nr_buckets += ca->mi.nbuckets; + nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); + nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + } + + if (!nr_mismatches && !nr_empty) + goto err; + + bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", + nr_mismatches + nr_empty, nr_buckets); + while (1) { - struct bbpos end; - ret = bch2_get_btree_in_memory_pos(trans, - BIT_ULL(BTREE_ID_backpointers), - BIT_ULL(BTREE_ID_backpointers), - BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); + ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); if (ret) break; - s.bucket_end = end.pos; - - if ( bpos_eq(s.bucket_start, POS_MIN) && - !bpos_eq(s.bucket_end, SPOS_MAX)) + if ( bpos_eq(s.bp_start, POS_MIN) && + !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); - if (!bpos_eq(s.bucket_start, POS_MIN) || - !bpos_eq(s.bucket_end, SPOS_MAX)) { + if (!bpos_eq(s.bp_start, POS_MIN) || + !bpos_eq(s.bp_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); - bch2_bpos_to_text(&buf, s.bucket_start); + bch2_bpos_to_text(&buf, s.bp_start); prt_str(&buf, "-"); - bch2_bpos_to_text(&buf, s.bucket_end); + bch2_bpos_to_text(&buf, s.bp_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } ret = bch2_check_extents_to_backpointers_pass(trans, &s); - if (ret || bpos_eq(s.bucket_end, SPOS_MAX)) + if (ret || bpos_eq(s.bp_end, SPOS_MAX)) break; - s.bucket_start = bpos_successor(s.bucket_end); + s.bp_start = bpos_successor(s.bp_end); } +err: bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); - bch2_btree_cache_unpin(c); +err_free_bitmaps: + for_each_member_device(c, ca) { + kvfree(ca->bucket_backpointer_empty); + ca->bucket_backpointer_empty = NULL; + kvfree(ca->bucket_backpointer_mismatches); + ca->bucket_backpointer_mismatches = NULL; + } bch_err_fn(c, ret); return ret; @@ -959,44 +1159,43 @@ static int check_one_backpointer(struct btree_trans *trans, return 0; struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); - struct bch_fs *c = trans->c; - struct btree_iter iter; struct bbpos pos = bp_to_bbpos(*bp.v); - struct bkey_s_c k; - struct printbuf buf = PRINTBUF; - int ret; if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; - k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); + int ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) return ret; - if (!k.k) { - ret = bch2_btree_write_buffer_maybe_flush(trans, bp.s_c, last_flushed); - if (ret) - goto out; - - if (fsck_err(trans, backpointer_to_missing_ptr, - "backpointer for missing %s\n %s", - bp.v->level ? 
"btree node" : "extent", - (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { - ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); - goto out; - } - } -out: -fsck_err: bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); return ret; } +static int check_bucket_backpointers_to_extents(struct btree_trans *trans, + struct bch_dev *ca, struct bpos bucket) +{ + u32 restart_count = trans->restart_count; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket), + bucket_pos_to_bp_end(ca, bucket), + 0, k, + check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) + ); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + return ret ?: trans_was_restarted(trans, restart_count); +} + static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) @@ -1009,9 +1208,8 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, bkey_init(&last_flushed.k->k); progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); - int ret = for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, - POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_prefetch, k, ({ progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); check_one_backpointer(trans, start, end, k, &last_flushed); })); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 3b29fdf519dd9..060dad1521ee3 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -18,14 +18,14 @@ static inline u64 swab40(u64 x) ((x & 0xff00000000ULL) >> 32)); } -int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, enum bch_validate_flags); -void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); -void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_backpointer_validate(struct bch_fs *, struct bkey_s_c k, + struct bkey_validate_context); +void bch2_backpointer_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_backpointer_swab(struct bkey_s); #define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ .key_validate = bch2_backpointer_validate, \ - .val_to_text = bch2_backpointer_k_to_text, \ + .val_to_text = bch2_backpointer_to_text, \ .swab = bch2_backpointer_swab, \ .min_val_size = 32, \ }) @@ -43,22 +43,24 @@ static inline struct bpos bp_pos_to_bucket(const struct bch_dev *ca, struct bpos return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); } +static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, struct bpos bp_pos, + u32 *bucket_offset) +{ + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket_and_offset(ca, bucket_sector, bucket_offset)); +} + static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, bp_pos.inode); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); rcu_read_unlock(); return ca != NULL; } -static inline bool bp_pos_to_bucket_nodev(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) -{ - return 
!bch2_fs_inconsistent_on(!bp_pos_to_bucket_nodev_noerror(c, bp_pos, bucket), - c, "backpointer for missing device %llu", bp_pos.inode); -} - static inline struct bpos bucket_pos_to_bp_noerror(const struct bch_dev *ca, struct bpos bucket, u64 bucket_offset) @@ -80,31 +82,35 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_dev *ca, return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bch_dev *, - struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); +static inline struct bpos bucket_pos_to_bp_start(const struct bch_dev *ca, struct bpos bucket) +{ + return bucket_pos_to_bp(ca, bucket, 0); +} + +static inline struct bpos bucket_pos_to_bp_end(const struct bch_dev *ca, struct bpos bucket) +{ + return bpos_nosnap_predecessor(bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket), 0)); +} + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, + struct bkey_s_c, + struct bkey_i_backpointer *, + bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, - struct bch_dev *ca, - struct bpos bucket, - struct bch_backpointer bp, struct bkey_s_c orig_k, + struct bkey_i_backpointer *bp, bool insert) { if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, ca, bucket, bp, orig_k, insert); - - struct bkey_i_backpointer bp_k; - - bkey_backpointer_init(&bp_k.k_i); - bp_k.k.p = bucket_pos_to_bp(ca, bucket, bp.bucket_offset); - bp_k.v = bp; + return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { - bp_k.k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k.k, 0); + bp->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp->k, 0); } - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp->k_i); } static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, @@ -134,44 +140,29 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, } } -static inline void __bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct extent_ptr_decoded p, const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bch_backpointer *bp, - u64 sectors) + struct bkey_i_backpointer *bp) { - u32 bucket_offset; - *bucket_pos = PTR_BUCKET_POS_OFFSET(ca, &p.ptr, &bucket_offset); - *bp = (struct bch_backpointer) { + bkey_backpointer_init(&bp->k_i); + bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); + bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, .data_type = bch2_bkey_ptr_data_type(k, p, entry), - .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + - p.crc.offset, - .bucket_len = sectors, + .bucket_gen = p.ptr.gen, + .bucket_len = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p), .pos = k.k->p, }; } -static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bch_dev *ca, - enum btree_id btree_id, unsigned level, - struct bkey_s_c k, struct extent_ptr_decoded p, - const union bch_extent_entry *entry, - struct bpos *bucket_pos, struct bch_backpointer *bp) -{ - u64 sectors = ptr_disk_sectors(level ? 
btree_sectors(c) : k.k->size, p); - - __bch2_extent_ptr_to_bp(c, ca, btree_id, level, k, p, entry, bucket_pos, bp, sectors); -} - -int bch2_get_next_backpointer(struct btree_trans *, struct bch_dev *ca, struct bpos, int, - struct bpos *, struct bch_backpointer *, unsigned); -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer, - unsigned); -struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, - struct bpos, struct bch_backpointer); +struct bkey_buf; +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *, unsigned, struct bkey_buf *); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, + struct btree_iter *, struct bkey_buf *); int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h index be2edced52133..63abe17f35eaa 100644 --- a/fs/bcachefs/bbpos.h +++ b/fs/bcachefs/bbpos.h @@ -29,7 +29,7 @@ static inline struct bbpos bbpos_successor(struct bbpos pos) static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) { - prt_str(out, bch2_btree_id_str(pos.btree)); + bch2_btree_id_to_text(out, pos.btree); prt_char(out, ':'); bch2_bpos_to_text(out, pos.pos); } diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index e94a83b8113e9..7ee05afdf741d 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -205,6 +205,7 @@ #include #include "bcachefs_format.h" +#include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" #include "fifo.h" @@ -293,6 +294,8 @@ do { \ #define bch_info(c, fmt, ...) \ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_info_ratelimited(c, fmt, ...) \ + bch2_print_ratelimited(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_notice(c, fmt, ...) \ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_warn(c, fmt, ...) \ @@ -352,6 +355,12 @@ do { \ bch_info(c, fmt, ##__VA_ARGS__); \ } while (0) +#define bch_verbose_ratelimited(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose) \ + bch_info_ratelimited(c, fmt, ##__VA_ARGS__); \ +} while (0) + #define pr_verbose_init(opts, fmt, ...) \ do { \ if (opt_get(opts, verbose)) \ @@ -538,20 +547,20 @@ struct bch_dev { /* * Buckets: - * Per-bucket arrays are protected by c->mark_lock, bucket_lock and - * gc_gens_lock, for device resize - holding any is sufficient for - * access: Or rcu_read_lock(), but only for dev_ptr_stale(): + * Per-bucket arrays are protected by either rcu_read_lock or + * state_lock, for device resize. 
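
Recapping the key layout from the backpointers.h hunks above: a backpointer's position packs the device sector, scaled by MAX_EXTENT_COMPRESS_RATIO_SHIFT with the crc offset added in, so each bucket owns one contiguous range of backpointer keys (bucket_pos_to_bp_start..bucket_pos_to_bp_end). A self-contained round-trip sketch, with the shift value and bucket size assumed for illustration rather than taken from the kernel headers:

#include <assert.h>
#include <stdint.h>

#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 4       /* value assumed for this sketch */

/* extent pointer (device sector, crc offset) -> backpointer key offset */
static uint64_t bp_offset_encode(uint64_t ptr_offset, uint32_t crc_offset)
{
        return (ptr_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + crc_offset;
}

/* backpointer key offset -> (bucket, sector offset within bucket) */
static uint64_t bp_offset_to_bucket(uint64_t bp_offset, uint64_t bucket_size,
                                    uint32_t *bucket_offset)
{
        uint64_t sector = bp_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT;

        *bucket_offset = sector % bucket_size;
        return sector / bucket_size;
}

int main(void)
{
        uint32_t off;
        uint64_t bp     = bp_offset_encode(1234, 3);    /* sector 1234, crc offset 3 */
        uint64_t bucket = bp_offset_to_bucket(bp, 512, &off);

        assert(bucket == 2 && off == 210);      /* 1234 = 2 * 512 + 210 */
        return 0;
}
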
*/ GENRADIX(struct bucket) buckets_gc; struct bucket_gens __rcu *bucket_gens; u8 *oldest_gen; unsigned long *buckets_nouse; - struct rw_semaphore bucket_lock; + + unsigned long *bucket_backpointer_mismatches; + unsigned long *bucket_backpointer_empty; struct bch_dev_usage __percpu *usage; /* Allocator: */ - u64 new_fs_bucket_idx; u64 alloc_cursor[3]; unsigned nr_open_buckets; @@ -606,6 +615,7 @@ struct bch_dev { x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ + x(recovery_running) \ x(fsck_running) \ x(initial_gc_unfixed) \ x(need_delete_dead_snapshots) \ @@ -650,28 +660,6 @@ struct journal_seq_blacklist_table { } entries[]; }; -struct journal_keys { - /* must match layout in darray_types.h */ - size_t nr, size; - struct journal_key { - u64 journal_seq; - u32 journal_offset; - enum btree_id btree_id:8; - unsigned level:8; - bool allocated; - bool overwritten; - struct bkey_i *k; - } *data; - /* - * Gap buffer: instead of all the empty space in the array being at the - * end of the buffer - from @nr to @size - the empty space is at @gap. - * This means that sequential insertions are O(n) instead of O(n^2). - */ - size_t gap; - atomic_t ref; - bool initial_ref_held; -}; - struct btree_trans_buf { struct btree_trans *trans; }; @@ -680,6 +668,7 @@ struct btree_trans_buf { ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) #define BCH_WRITE_REFS() \ + x(journal) \ x(trans) \ x(write) \ x(promote) \ @@ -692,6 +681,7 @@ struct btree_trans_buf { x(dio_write) \ x(discard) \ x(discard_fast) \ + x(check_discard_freespace_key) \ x(invalidate) \ x(delete_dead_snapshots) \ x(gc_gens) \ @@ -735,7 +725,13 @@ struct bch_fs { struct percpu_ref writes; #endif /* - * Analagous to c->writes, for asynchronous ops that don't necessarily + * Certain operations are only allowed in single threaded mode, during + * recovery, and we want to assert that this is the case: + */ + struct task_struct *recovery_task; + + /* + * Analogous to c->writes, for asynchronous ops that don't necessarily * need fs to be read-write */ refcount_t ro_ref; @@ -764,6 +760,8 @@ struct bch_fs { __uuid_t user_uuid; u16 version; + u16 version_incompat; + u16 version_incompat_allowed; u16 version_min; u16 version_upgrade_complete; @@ -834,9 +832,10 @@ struct bch_fs { struct work_struct btree_interior_update_work; struct workqueue_struct *btree_node_rewrite_worker; - - struct list_head pending_node_rewrites; - struct mutex pending_node_rewrites_lock; + struct list_head btree_node_rewrites; + struct list_head btree_node_rewrites_pending; + spinlock_t btree_node_rewrites_lock; + struct closure_waitlist btree_node_rewrites_wait; /* btree_io.c: */ spinlock_t btree_write_error_lock; @@ -967,8 +966,7 @@ struct bch_fs { struct rhashtable promote_table; mempool_t compression_bounce[2]; - mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; - mempool_t decompress_workspace; + mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; struct crypto_shash *sha256; @@ -1027,6 +1025,7 @@ struct bch_fs { struct list_head vfs_inodes_list; struct mutex vfs_inodes_lock; struct rhashtable vfs_inodes_table; + struct rhltable vfs_inodes_by_inum_table; /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; @@ -1048,10 +1047,12 @@ struct bch_fs { * for signaling to the toplevel code which pass we want to run now. 
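
A sketch of what that signaling enables, under stated assumptions: curr_recovery_pass is the pass being run, and the new next_recovery_pass (updated under recovery_pass_lock in the real code) lets a pass request a rewind to an earlier pass, which the toplevel loop then honours. Pass names and bodies here are invented, and locking is omitted since the sketch is single threaded.

#include <stdio.h>

enum pass { PASS_alloc, PASS_backpointers, PASS_fsck, PASS_done };

static enum pass curr, next_pass;

/* a pass may ask to go backwards by lowering next_pass: */
static void request_rewind(enum pass p)
{
        if (p < next_pass)
                next_pass = p;
}

static void run_pass(enum pass p)
{
        static int fsck_rewound;

        printf("running pass %d\n", p);

        /* pretend fsck discovered bad alloc info, once: */
        if (p == PASS_fsck && !fsck_rewound++)
                request_rewind(PASS_alloc);
}

int main(void)
{
        curr = PASS_alloc;

        while (curr < PASS_done) {
                next_pass = curr + 1;
                run_pass(curr);
                /* rewound: loop back; otherwise advance to the next pass */
                curr = next_pass;
        }
        return 0;
}
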
*/ enum bch_recovery_pass curr_recovery_pass; + enum bch_recovery_pass next_recovery_pass; /* bitmask of recovery passes that we actually ran */ u64 recovery_passes_complete; /* never rewinds version of curr_recovery_pass */ enum bch_recovery_pass recovery_pass_done; + spinlock_t recovery_pass_lock; struct semaphore online_fsck_mutex; /* DEBUG JUNK */ @@ -1062,9 +1063,6 @@ struct bch_fs { struct btree_node *verify_ondisk; struct mutex verify_lock; - u64 *unused_inode_hints; - unsigned inode_shard_bits; - /* * A btree node on disk could have too many bsets for an iterator to fit * on the stack - have to dynamically allocate them @@ -1086,8 +1084,6 @@ struct bch_fs { u64 counters_on_mount[BCH_COUNTER_NR]; u64 __percpu *counters; - unsigned copy_gc_enabled:1; - struct bch2_time_stats times[BCH_TIME_STAT_NR]; struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 5004f6ba997c9..638931a36d930 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -239,7 +239,7 @@ struct bkey { * * Specifically, when i was designing bkey, I wanted the header to be no * bigger than necessary so that bkey_packed could use the rest. That means that - * decently offten extent keys will fit into only 8 bytes, instead of spilling over + * decently often extent keys will fit into only 8 bytes, instead of spilling over * to 16. * * But packed_bkey treats the part after the header - the packed section - @@ -251,7 +251,7 @@ struct bkey { * So that constrains the key part of a bkig endian bkey to start right * after the header. * - * If we ever do a bkey_v2 and need to expand the hedaer by another byte for + * If we ever do a bkey_v2 and need to expand the header by another byte for * some reason - that will clean up this wart. 
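
The payoff described in this comment, in miniature: once fields are packed high-to-low against a format, whole packed keys compare correctly as plain integers, which is what makes the 8-byte case worth chasing. Field widths below are invented for the toy; this is not the bkey format itself.

#include <assert.h>
#include <stdint.h>

/* toy format: inode in bits [40,64), offset in bits [0,40) */
static uint64_t pack(uint64_t inode, uint64_t offset)
{
        assert(inode < (1ULL << 24) && offset < (1ULL << 40));
        return (inode << 40) | offset;
}

int main(void)
{
        /* with the most significant field packed highest, integer compare
         * of the packed word matches lexicographic field-by-field compare: */
        assert(pack(1, 999) < pack(2, 0));
        assert(pack(2, 5)   < pack(2, 6));
        return 0;
}
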
*/ __aligned(8) @@ -418,7 +418,8 @@ static inline void bkey_init(struct bkey *k) x(snapshot_tree, 31) \ x(logged_op_truncate, 32) \ x(logged_op_finsert, 33) \ - x(accounting, 34) + x(accounting, 34) \ + x(inode_alloc_cursor, 35) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -463,7 +464,8 @@ struct bch_backpointer { __u8 btree_id; __u8 level; __u8 data_type; - __u64 bucket_offset:40; + __u8 bucket_gen; + __u32 pad; __u32 bucket_len; struct bpos pos; } __packed __aligned(8); @@ -499,8 +501,6 @@ struct bch_sb_field { #include "disk_groups_format.h" #include "extents_format.h" #include "ec_format.h" -#include "dirent_format.h" -#include "disk_groups_format.h" #include "inode_format.h" #include "journal_seq_blacklist_format.h" #include "logged_ops_format.h" @@ -643,7 +643,7 @@ struct bch_sb_field_ext { /* * field 1: version name * field 2: BCH_VERSION(major, minor) - * field 3: recovery passess required on upgrade + * field 3: recovery passes required on upgrade */ #define BCH_METADATA_VERSIONS() \ x(bkey_renumber, BCH_VERSION(0, 10)) \ @@ -679,7 +679,13 @@ struct bch_sb_field_ext { x(disk_accounting_v3, BCH_VERSION(1, 10)) \ x(disk_accounting_inum, BCH_VERSION(1, 11)) \ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \ - x(inode_has_child_snapshots, BCH_VERSION(1, 13)) + x(inode_has_child_snapshots, BCH_VERSION(1, 13)) \ + x(backpointer_bucket_gen, BCH_VERSION(1, 14)) \ + x(disk_accounting_big_endian, BCH_VERSION(1, 15)) \ + x(reflink_p_may_update_opts, BCH_VERSION(1, 16)) \ + x(inode_depth, BCH_VERSION(1, 17)) \ + x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ + x(autofix_errors, BCH_VERSION(1, 19)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -759,7 +765,7 @@ struct bch_sb { /* * Flags: - * BCH_SB_INITALIZED - set on first mount + * BCH_SB_INITIALIZED - set on first mount * BCH_SB_CLEAN - did we shut down cleanly? 
Just a hint, doesn't affect * behaviour of mount/recovery path: * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits @@ -844,6 +850,10 @@ LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, struct bch_sb, flags[5], 0, 16); LE64_BITMASK(BCH_SB_ALLOCATOR_STUCK_TIMEOUT, struct bch_sb, flags[5], 16, 32); +LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); +LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, + struct bch_sb, flags[5], 48, 64); +LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -896,21 +906,22 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(new_varint, 15) \ x(journal_no_flush, 16) \ x(alloc_v2, 17) \ - x(extents_across_btree_nodes, 18) + x(extents_across_btree_nodes, 18) \ + x(incompat_version_field, 19) #define BCH_SB_FEATURES_ALWAYS \ - ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ - (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ - (1ULL << BCH_FEATURE_btree_updates_journalled)|\ - (1ULL << BCH_FEATURE_alloc_v2)|\ - (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ + BIT_ULL(BCH_FEATURE_extents_above_btree_updates)|\ + BIT_ULL(BCH_FEATURE_btree_updates_journalled)|\ + BIT_ULL(BCH_FEATURE_alloc_v2)|\ + BIT_ULL(BCH_FEATURE_extents_across_btree_nodes)) #define BCH_SB_FEATURES_ALL \ (BCH_SB_FEATURES_ALWAYS| \ - (1ULL << BCH_FEATURE_new_siphash)| \ - (1ULL << BCH_FEATURE_btree_ptr_v2)| \ - (1ULL << BCH_FEATURE_new_varint)| \ - (1ULL << BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_new_siphash)| \ + BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ + BIT_ULL(BCH_FEATURE_new_varint)| \ + BIT_ULL(BCH_FEATURE_journal_no_flush)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -1032,7 +1043,7 @@ static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) x(crc64, 2) \ x(xxhash, 3) -enum bch_csum_opts { +enum bch_csum_opt { #define x(t, n) BCH_CSUM_OPT_##t = n, BCH_CSUM_OPTS() #undef x @@ -1221,6 +1232,15 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +static inline unsigned jset_entry_log_msg_bytes(struct jset_entry_log *l) +{ + unsigned b = vstruct_bytes(&l->entry) - offsetof(struct jset_entry_log, d); + + while (b && !l->d[b - 1]) + --b; + return b; +} + struct jset_entry_datetime { struct jset_entry entry; __le64 seconds; @@ -1331,7 +1351,8 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_set)) \ x(logged_ops, 17, 0, \ BIT_ULL(KEY_TYPE_logged_op_truncate)| \ - BIT_ULL(KEY_TYPE_logged_op_finsert)) \ + BIT_ULL(KEY_TYPE_logged_op_finsert)| \ + BIT_ULL(KEY_TYPE_inode_alloc_cursor)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ x(subvolume_children, 19, 0, \ @@ -1361,6 +1382,8 @@ static inline bool btree_id_is_alloc(enum btree_id id) case BTREE_ID_need_discard: case BTREE_ID_freespace: case BTREE_ID_bucket_gens: + case BTREE_ID_lru: + case BTREE_ID_accounting: return true; default: return false; diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 3c23bdf788cea..926a1af506abe 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -131,7 +131,7 @@ struct bch_ioctl_start { * may be either offline or offline. * * Will fail removing @dev would leave us with insufficient read write devices - * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are + * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are * set. 
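
Back in the superblock hunk above, LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48) generates shift-and-mask accessors for a 16-bit field inside a 64-bit flags word. The same thing written out by hand, on a host-endian u64 for brevity (the real macro operates on a __le64 and byte-swaps):

#include <assert.h>
#include <stdint.h>

#define FIELD_START     32
#define FIELD_END       48      /* half-open [start, end), as in LE64_BITMASK */
#define FIELD_MASK      ((1ULL << (FIELD_END - FIELD_START)) - 1)

static uint64_t version_incompat_get(uint64_t flags)
{
        return (flags >> FIELD_START) & FIELD_MASK;
}

static uint64_t version_incompat_set(uint64_t flags, uint64_t v)
{
        flags &= ~(FIELD_MASK << FIELD_START);
        flags |= (v & FIELD_MASK) << FIELD_START;
        return flags;
}

int main(void)
{
        uint64_t flags = version_incompat_set(0, 21);

        assert(version_incompat_get(flags) == 21);
        return 0;
}
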
*/ @@ -154,7 +154,7 @@ struct bch_ioctl_start { * * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would * leave us with insufficient read write devices or degraded/unavailable data, - * unless the approprate BCH_FORCE_IF_* flags are set. + * unless the appropriate BCH_FORCE_IF_* flags are set. */ struct bch_ioctl_disk { diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 41df24a53d971..054e2d5e8448a 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -9,13 +9,6 @@ #include "util.h" #include "vstructs.h" -enum bch_validate_flags { - BCH_VALIDATE_write = BIT(0), - BCH_VALIDATE_commit = BIT(1), - BCH_VALIDATE_journal = BIT(2), - BCH_VALIDATE_silent = BIT(3), -}; - #if 0 /* diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index e7ac227ba7e89..15c93576b5c20 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -28,7 +28,7 @@ const char * const bch2_bkey_types[] = { }; static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -42,7 +42,7 @@ static int deleted_key_validate(struct bch_fs *c, struct bkey_s_c k, }) static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -59,7 +59,7 @@ static int empty_val_key_validate(struct bch_fs *c, struct bkey_s_c k, }) static int key_type_cookie_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -83,7 +83,7 @@ static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c, }) static int key_type_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -124,7 +124,7 @@ const struct bkey_ops bch2_bkey_null_ops = { }; int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -140,7 +140,7 @@ int bch2_bkey_val_validate(struct bch_fs *c, struct bkey_s_c k, if (!ops->key_validate) return 0; - ret = ops->key_validate(c, k, flags); + ret = ops->key_validate(c, k, from); fsck_err: return ret; } @@ -161,9 +161,10 @@ const char *bch2_btree_node_type_str(enum btree_node_type type) } int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags) + struct bkey_validate_context from) { + enum btree_node_type type = __btree_node_type(from.level, from.btree); + if (test_bit(BCH_FS_no_invalid_checks, &c->flags)) return 0; @@ -177,7 +178,7 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, return 0; bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (flags & BCH_VALIDATE_commit)) && + (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", @@ -228,15 +229,15 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, - enum btree_node_type type, - enum bch_validate_flags flags) + struct bkey_validate_context from) { - return __bch2_bkey_validate(c, k, type, flags) ?: - bch2_bkey_val_validate(c, k, flags); + return __bch2_bkey_validate(c, k, from) ?: + bch2_bkey_val_validate(c, k, from); } int 
bch2_bkey_in_btree_node(struct bch_fs *c, struct btree *b, - struct bkey_s_c k, enum bch_validate_flags flags) + struct bkey_s_c k, + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h index 018fb72e32d39..bf34111cdf008 100644 --- a/fs/bcachefs/bkey_methods.h +++ b/fs/bcachefs/bkey_methods.h @@ -22,7 +22,7 @@ extern const struct bkey_ops bch2_bkey_null_ops; */ struct bkey_ops { int (*key_validate)(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags); + struct bkey_validate_context from); void (*val_to_text)(struct printbuf *, struct bch_fs *, struct bkey_s_c); void (*swab)(struct bkey_s); @@ -48,13 +48,14 @@ static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) : &bch2_bkey_null_ops; } -int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); -int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags); -int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, enum btree_node_type, - enum bch_validate_flags); +int bch2_bkey_val_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int __bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +int bch2_bkey_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); int bch2_bkey_in_btree_node(struct bch_fs *, struct btree *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context from); void bch2_bpos_to_text(struct printbuf *, struct bpos); void bch2_bkey_to_text(struct printbuf *, const struct bkey *); diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h index c9ae9e42b3851..b4f328f9853c6 100644 --- a/fs/bcachefs/bkey_types.h +++ b/fs/bcachefs/bkey_types.h @@ -210,4 +210,32 @@ static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ BCH_BKEY_TYPES(); #undef x +enum bch_validate_flags { + BCH_VALIDATE_write = BIT(0), + BCH_VALIDATE_commit = BIT(1), + BCH_VALIDATE_silent = BIT(2), +}; + +#define BKEY_VALIDATE_CONTEXTS() \ + x(unknown) \ + x(superblock) \ + x(journal) \ + x(btree_root) \ + x(btree_node) \ + x(commit) + +struct bkey_validate_context { + enum { +#define x(n) BKEY_VALIDATE_##n, + BKEY_VALIDATE_CONTEXTS() +#undef x + } from:8; + enum bch_validate_flags flags:8; + u8 level; + enum btree_id btree; + bool root:1; + unsigned journal_offset; + u64 journal_seq; +}; + #endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 6953d55b72cca..bdd250e1be16c 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -45,7 +45,7 @@ * 4 in memory - we lazily resort as needed. * * We implement code here for creating and maintaining auxiliary search trees - * (described below) for searching an individial bset, and on top of that we + * (described below) for searching an individual bset, and on top of that we * implement a btree iterator. * * BTREE ITERATOR: @@ -178,7 +178,7 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree * it used to be 64, but I realized the lookup code would touch slightly less * memory if it was 128. * - * It definites the number of bytes (in struct bset) per struct bkey_float in + * It defines the number of bytes (in struct bset) per struct bkey_float in * the auxiliar search tree - when we're done searching the bset_float tree we * have this many bytes left that we do a linear search over. 
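
The lookup shape that comment describes, in miniature: a coarse index is searched first (here a plain binary search over every BLOCK-th element, standing in for the bkey_float tree), and the remaining elements are covered by a short linear scan. BLOCK is an invented stand-in for one BSET_CACHELINE's worth of keys.

#include <assert.h>
#include <stddef.h>

#define BLOCK   16

/* index of the first element >= key in a sorted array */
static size_t lookup(const int *v, size_t n, int key)
{
        size_t l = 0, r = n / BLOCK;

        /* binary search over block boundaries only... */
        while (l < r) {
                size_t m = l + (r - l) / 2;

                if (v[m * BLOCK] < key)
                        l = m + 1;
                else
                        r = m;
        }

        /* ...then a linear scan over roughly one block: */
        size_t i = l ? (l - 1) * BLOCK : 0;

        while (i < n && v[i] < key)
                i++;
        return i;
}

int main(void)
{
        int v[64];

        for (int i = 0; i < 64; i++)
                v[i] = i * 2;   /* sorted: 0, 2, 4, ... */

        assert(lookup(v, 64, 31) == 16);        /* v[16] == 32 */
        assert(lookup(v, 64, 0)  == 0);
        return 0;
}
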
* diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 7123019ab3bc8..6319a9a8ae6a1 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -222,7 +222,6 @@ void bch2_node_pin(struct bch_fs *c, struct btree *b) struct btree_cache *bc = &c->btree_cache; mutex_lock(&bc->lock); - BUG_ON(!__btree_node_pinned(bc, b)); if (b != btree_node_root(c, b) && !btree_node_pinned(b)) { set_btree_node_pinned(b); list_move(&b->list, &bc->live[1].list); @@ -326,7 +325,7 @@ void bch2_btree_node_update_key_early(struct btree_trans *trans, if (!IS_ERR_OR_NULL(b)) { mutex_lock(&c->btree_cache.lock); - bch2_btree_node_hash_remove(&c->btree_cache, b); + __bch2_btree_node_hash_remove(&c->btree_cache, b); bkey_copy(&b->key, new); ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); @@ -829,7 +828,8 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea mutex_unlock(&bc->lock); - if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + if (memalloc_flags_do(PF_MEMALLOC_NORECLAIM, + btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN))) { bch2_trans_unlock(trans); if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) goto err; @@ -1004,16 +1004,14 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) return; prt_printf(&buf, - "btree node header doesn't match ptr\n" - "btree %s level %u\n" - "ptr: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + "btree node header doesn't match ptr: "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, "\nptr: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_printf(&buf, "\nheader: btree %s level %llu\n" - "min ", - bch2_btree_id_str(BTREE_NODE_ID(b->data)), - BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nheader: "); + bch2_btree_id_level_to_text(&buf, BTREE_NODE_ID(b->data), BTREE_NODE_LEVEL(b->data)); + prt_str(&buf, "\nmin "); bch2_bpos_to_text(&buf, b->data->min_key); prt_printf(&buf, "\nmax "); @@ -1133,7 +1131,7 @@ static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btr if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1175,7 +1173,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * /* * Check b->hash_val _before_ calling btree_node_lock() - this might not * be the node we want anymore, and trying to lock the wrong node could - * cause an unneccessary transaction restart: + * cause an unnecessary transaction restart: */ if (unlikely(!c->opts.btree_node_mem_ptr_optimization || !b || @@ -1223,7 +1221,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-BCH_ERR_btree_node_read_error); + return ERR_PTR(-BCH_ERR_btree_node_read_err_cached); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1305,7 +1303,7 @@ struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-BCH_ERR_btree_node_read_error); + b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); goto out; } @@ -1398,12 +1396,24 @@ void bch2_btree_id_to_text(struct printbuf *out, enum btree_id btree) prt_printf(out, "(unknown btree %u)", btree); } +void bch2_btree_id_level_to_text(struct printbuf *out, enum btree_id 
btree, unsigned level) +{ + prt_str(out, "btree="); + bch2_btree_id_to_text(out, btree); + prt_printf(out, " level=%u", level); +} + void bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { - prt_printf(out, "%s level %u/%u\n ", - bch2_btree_id_str(b->c.btree_id), - b->c.level, - bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_btree_id_to_text(out, b->c.btree_id); + prt_printf(out, " level %u/", b->c.level); + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + if (r) + prt_printf(out, "%u", r->level); + else + prt_printf(out, "(unknown)"); + prt_printf(out, "\n "); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } @@ -1478,8 +1488,12 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); prt_newline(out); - for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) - prt_btree_cache_line(out, c, bch2_btree_id_str(i), bc->nr_by_btree[i]); + for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { + bch2_btree_id_to_text(out, i); + prt_printf(out, "\t"); + prt_human_readable_u64(out, bc->nr_by_btree[i] * c->opts.btree_node_size); + prt_printf(out, " (%zu)\n", bc->nr_by_btree[i]); + } prt_newline(out); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h index 66e86d1a178d4..dcc34fe4996d5 100644 --- a/fs/bcachefs/btree_cache.h +++ b/fs/bcachefs/btree_cache.h @@ -128,18 +128,24 @@ static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned i } else { unsigned idx = id - BTREE_ID_NR; - EBUG_ON(idx >= c->btree_roots_extra.nr); + /* This can happen when we're called from btree_node_scan */ + if (idx >= c->btree_roots_extra.nr) + return NULL; + return &c->btree_roots_extra.data[idx]; } } static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) { - return bch2_btree_id_root(c, b->c.btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, b->c.btree_id); + + return r ? 
r->b : NULL; } -const char *bch2_btree_id_str(enum btree_id); +const char *bch2_btree_id_str(enum btree_id); /* avoid */ void bch2_btree_id_to_text(struct printbuf *, enum btree_id); +void bch2_btree_id_level_to_text(struct printbuf *, enum btree_id, unsigned); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 81dcf9e512c0e..721dca5517202 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -29,6 +29,7 @@ #include "move.h" #include "recovery_passes.h" #include "reflink.h" +#include "recovery.h" #include "replicas.h" #include "super-io.h" #include "trace.h" @@ -56,8 +57,8 @@ void bch2_gc_pos_to_text(struct printbuf *out, struct gc_pos *p) { prt_str(out, bch2_gc_phase_strs[p->phase]); prt_char(out, ' '); - bch2_btree_id_to_text(out, p->btree); - prt_printf(out, " l=%u ", p->level); + bch2_btree_id_level_to_text(out, p->btree, p->level); + prt_char(out, ' '); bch2_bpos_to_text(out, p->pos); } @@ -209,8 +210,9 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * if (bpos_eq(expected_start, cur->data->min_key)) return 0; - prt_printf(&buf, " at btree %s level %u:\n parent: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (prev) { @@ -277,8 +279,9 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, if (bpos_eq(child->key.k.p, b->key.k.p)) return 0; - prt_printf(&buf, "at btree %s level %u:\n parent: ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(&buf, " at "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_printf(&buf, ":\n parent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n child: "); @@ -341,14 +344,14 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct ret = PTR_ERR_OR_ZERO(cur); printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level - 1); + prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_unreadable, - "Topology repair: unreadable btree node at btree %s level %u:\n" + trans, btree_node_read_error, + "Topology repair: unreadable btree node at\n" " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level - 1, buf.buf)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; @@ -357,11 +360,9 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct if (ret) break; - if (!btree_id_is_alloc(b->c.btree_id)) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); - if (ret) - break; - } + ret = bch2_btree_lost_data(c, b->c.btree_id); + if (ret) + break; continue; } @@ -370,7 +371,7 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct break; if (bch2_btree_node_is_stale(c, cur)) { - bch_info(c, "btree node %s older than nodes found by scanning", buf.buf); + bch_info(c, "btree node older than nodes found by scanning\n %s", buf.buf); six_unlock_read(&cur->c.lock); bch2_btree_node_evict(trans, cur_k.k); ret = bch2_journal_key_delete(c, b->c.btree_id, @@ -478,14 +479,13 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, 
struct } printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (mustfix_fsck_err_on(!have_child, trans, btree_node_topology_interior_node_empty, - "empty interior btree node at btree %s level %u\n" - " %s", - bch2_btree_id_str(b->c.btree_id), - b->c.level, buf.buf)) + "empty interior btree node at %s", buf.buf)) ret = DROP_THIS_NODE; err: fsck_err: @@ -511,6 +511,7 @@ int bch2_check_topology(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bpos pulled_from_scan = POS_MIN; + struct printbuf buf = PRINTBUF; int ret = 0; bch2_trans_srcu_unlock(trans); @@ -519,19 +520,22 @@ int bch2_check_topology(struct bch_fs *c) struct btree_root *r = bch2_btree_id_root(c, i); bool reconstructed_root = false; + printbuf_reset(&buf); + bch2_btree_id_to_text(&buf, i); + if (r->error) { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + ret = bch2_btree_lost_data(c, i); if (ret) break; reconstruct_root: - bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i)); + bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); r->alive = false; r->error = 0; if (!bch2_btree_has_scanned_nodes(c, i)) { mustfix_fsck_err(trans, btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", bch2_btree_id_str(i)); + "no nodes found for btree %s, continue?", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { bch2_btree_root_alloc_fake_trans(trans, i, 1); @@ -560,13 +564,14 @@ int bch2_check_topology(struct bch_fs *c) if (!reconstructed_root) goto reconstruct_root; - bch_err(c, "empty btree root %s", bch2_btree_id_str(i)); + bch_err(c, "empty btree root %s", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); r->alive = false; ret = 0; } } fsck_err: + printbuf_exit(&buf); bch2_trans_put(trans); return ret; } @@ -713,6 +718,7 @@ static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id ids[BTREE_ID_NR]; + struct printbuf buf = PRINTBUF; unsigned i; int ret = 0; @@ -727,14 +733,9 @@ static int bch2_gc_btrees(struct bch_fs *c) continue; ret = bch2_gc_btree(trans, btree, true); - - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "btree node read error for %s", - bch2_btree_id_str(btree))) - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); } -fsck_err: + + printbuf_exit(&buf); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -802,7 +803,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, old = bch2_alloc_to_v4(k, &old_convert); gc = new = *old; - percpu_down_read(&c->mark_lock); __bucket_m_to_alloc(&gc, *gc_bucket(ca, iter->pos.offset)); old_gc = gc; @@ -813,7 +813,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans, gc.data_type = old->data_type; gc.dirty_sectors = old->dirty_sectors; } - percpu_up_read(&c->mark_lock); /* * gc.data_type doesn't yet include need_discard & need_gc_gen states - @@ -831,11 +830,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans, * safe w.r.t. 
transaction restarts, so fixup the gc_bucket so * we don't run it twice: */ - percpu_down_read(&c->mark_lock); struct bucket *gc_m = gc_bucket(ca, iter->pos.offset); gc_m->data_type = gc.data_type; gc_m->dirty_sectors = gc.dirty_sectors; - percpu_up_read(&c->mark_lock); } if (fsck_err_on(new.data_type != gc.data_type, @@ -895,11 +892,11 @@ static int bch2_gc_alloc_done(struct bch_fs *c) for_each_member_device(c, ca) { ret = bch2_trans_run(c, - for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc, + for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, ca->mi.first_bucket), POS(ca->dev_idx, ca->mi.nbuckets - 1), BTREE_ITER_slots|BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_alloc_write_key(trans, &iter, ca, k))); if (ret) { bch2_dev_put(ca); @@ -928,98 +925,6 @@ static int bch2_gc_alloc_start(struct bch_fs *c) return ret; } -static int bch2_gc_write_reflink_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - size_t *idx) -{ - struct bch_fs *c = trans->c; - const __le64 *refcount = bkey_refcount_c(k); - struct printbuf buf = PRINTBUF; - struct reflink_gc *r; - int ret = 0; - - if (!refcount) - return 0; - - while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && - r->offset < k.k->p.offset) - ++*idx; - - if (!r || - r->offset != k.k->p.offset || - r->size != k.k->size) { - bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); - return -EINVAL; - } - - if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), - trans, reflink_v_refcount_wrong, - "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf), - r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); - ret = PTR_ERR_OR_ZERO(new); - if (ret) - goto out; - - if (!r->refcount) - new->k.type = KEY_TYPE_deleted; - else - *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); - ret = bch2_trans_update(trans, iter, new, 0); - } -out: -fsck_err: - printbuf_exit(&buf); - return ret; -} - -static int bch2_gc_reflink_done(struct bch_fs *c) -{ - size_t idx = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key_commit(trans, iter, - BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_gc_write_reflink_key(trans, &iter, k, &idx))); - c->reflink_gc_nr = 0; - return ret; -} - -static int bch2_gc_reflink_start(struct bch_fs *c) -{ - c->reflink_gc_nr = 0; - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, - BTREE_ITER_prefetch, k, ({ - const __le64 *refcount = bkey_refcount_c(k); - - if (!refcount) - continue; - - struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, - c->reflink_gc_nr++, GFP_KERNEL); - if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; - break; - } - - r->offset = k.k->p.offset; - r->size = k.k->size; - r->refcount = 0; - 0; - }))); - - bch_err_fn(c, ret); - return ret; -} - static int bch2_gc_write_stripes_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) @@ -1171,7 +1076,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - percpu_down_read(&c->mark_lock); rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); @@ -1180,7 +1084,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, if (dev_ptr_stale(ca, ptr) > 16) { rcu_read_unlock(); - 
percpu_up_read(&c->mark_lock); goto update; } } @@ -1195,7 +1098,6 @@ static int gc_btree_gens_key(struct btree_trans *trans, *gen = ptr->gen; } rcu_read_unlock(); - percpu_up_read(&c->mark_lock); return 0; update: u = bch2_bkey_make_mut(trans, iter, &k, 0); @@ -1224,7 +1126,6 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct bch_dev return ret; a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; - alloc_data_type_set(&a_mut->v, a_mut->v.data_type); return bch2_trans_update(trans, iter, &a_mut->k_i, 0); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 839d68802e426..d99f8a78d286d 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -25,9 +25,8 @@ static void bch2_btree_node_header_to_text(struct printbuf *out, struct btree_node *bn) { - prt_printf(out, "btree=%s l=%u seq %llux\n", - bch2_btree_id_str(BTREE_NODE_ID(bn)), - (unsigned) BTREE_NODE_LEVEL(bn), bn->keys.seq); + bch2_btree_id_level_to_text(out, BTREE_NODE_ID(bn), BTREE_NODE_LEVEL(bn)); + prt_printf(out, " seq %llx %llu\n", bn->keys.seq, BTREE_NODE_SEQ(bn)); prt_str(out, "min: "); bch2_bpos_to_text(out, bn->min_key); prt_newline(out); @@ -832,13 +831,32 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, return ret; } +static int btree_node_bkey_val_validate(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + enum bch_validate_flags flags) +{ + return bch2_bkey_val_validate(c, k, (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags + }); +} + static int bset_key_validate(struct bch_fs *c, struct btree *b, struct bkey_s_c k, - bool updated_range, int rw) + bool updated_range, + enum bch_validate_flags flags) { - return __bch2_bkey_validate(c, k, btree_node_type(b), 0) ?: - (!updated_range ? bch2_bkey_in_btree_node(c, b, k, 0) : 0) ?: - (rw == WRITE ? bch2_bkey_val_validate(c, k, 0) : 0); + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = flags, + }; + return __bch2_bkey_validate(c, k, from) ?: + (!updated_range ? bch2_bkey_in_btree_node(c, b, k, from) : 0) ?: + (flags & BCH_VALIDATE_write ? 
btree_node_bkey_val_validate(c, b, k, flags) : 0); } static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, @@ -855,7 +873,21 @@ static bool bkey_packed_valid(struct bch_fs *c, struct btree *b, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - return !__bch2_bkey_validate(c, u.s_c, btree_node_type(b), BCH_VALIDATE_silent); + return !__bch2_bkey_validate(c, u.s_c, + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_silent + }); +} + +static inline int btree_node_read_bkey_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l); } static int validate_bset_keys(struct bch_fs *c, struct btree *b, @@ -918,7 +950,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BSET_BIG_ENDIAN(i), write, &b->format, k); - if (prev && bkey_iter_cmp(b, prev, k) > 0) { + if (prev && btree_node_read_bkey_cmp(b, prev, k) >= 0) { struct bkey up = bkey_unpack_key(b, prev); printbuf_reset(&buf); @@ -965,6 +997,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, got_good_key: le16_add_cpu(&i->u64s, -next_good_key); memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_node_need_rewrite(b); } fsck_err: printbuf_exit(&buf); @@ -1038,39 +1071,51 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, while (b->written < (ptr_written ?: btree_sectors(c))) { unsigned sectors; - struct nonce nonce; bool first = !b->written; - bool csum_bad; - if (!b->written) { + if (first) { + bne = NULL; i = &b->data->keys; + } else { + bne = write_block(b); + i = &bne->keys; - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); + if (i->seq != b->data->keys.seq) + break; + } - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - csum_bad = bch2_crc_cmp(b->data->csum, csum); - if (csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + struct nonce nonce = btree_nonce(i, b->written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + btree_err_on(!good_csum_type, + bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) + ? 
-BCH_ERR_btree_node_read_err_must_retry + : -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_unknown_csum, + "unknown checksum type %llu", BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + bool csum_bad = bch2_crc_cmp(b->data->csum, csum); + if (csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), @@ -1081,37 +1126,26 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sectors = vstruct_sectors(b->data, c->block_bits); } else { - bne = write_block(b); - i = &bne->keys; - - if (i->seq != b->data->keys.seq) - break; - - btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_unknown_csum, - "unknown checksum type %llu", BSET_CSUM_TYPE(i)); - - nonce = btree_nonce(i, b->written << 9); - struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - csum_bad = bch2_crc_cmp(bne->csum, csum); - if (ca && csum_bad) - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - - btree_err_on(csum_bad, - -BCH_ERR_btree_node_read_err_want_retry, - c, ca, b, i, NULL, - bset_bad_csum, - "%s", - (printbuf_reset(&buf), - bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), - buf.buf)); - - ret = bset_encrypt(c, i, b->written << 9); - if (bch2_fs_fatal_err_on(ret, c, - "decrypting btree node: %s", bch2_err_str(ret))) - goto fsck_err; + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + bool csum_bad = bch2_crc_cmp(bne->csum, csum); + if (ca && csum_bad) + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + + btree_err_on(csum_bad, + -BCH_ERR_btree_node_read_err_want_retry, + c, ca, b, i, NULL, + bset_bad_csum, + "%s", + (printbuf_reset(&buf), + bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum), + buf.buf)); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "decrypting btree node: %s", bch2_err_str(ret))) + goto fsck_err; + } sectors = vstruct_sectors(bne, c->block_bits); } @@ -1216,7 +1250,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); - ret = bch2_bkey_val_validate(c, u.s_c, READ); + ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || (bch2_inject_invalid_keys && !bversion_cmp(u.k->bversion, MAX_VERSION))) { @@ -1226,6 +1260,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); + set_btree_node_need_rewrite(b); continue; } if (ret) @@ -1339,13 +1374,18 @@ static void btree_node_read_work(struct work_struct *work) rb->start_time); bio_put(&rb->bio); - if (saw_error && + if ((saw_error || + btree_node_need_rewrite(b)) && !btree_node_read_error(b) && c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { - printbuf_reset(&buf); - bch2_bpos_to_text(&buf, b->key.k.p); - 
bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", - __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf); + if (saw_error) { + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch_err_ratelimited(c, "%s: rewriting btree node due to error\n %s", + __func__, buf.buf); + } bch2_btree_node_rewrite_async(c, b); } @@ -1933,7 +1973,12 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, bool saw_error; int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), - BKEY_TYPE_btree, WRITE); + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level + 1, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_write, + }); if (ret) { bch2_fs_inconsistent(c, "invalid btree node key before write"); return ret; }
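The hunks above move key validation from a bare enum bch_validate_flags argument to a bkey_validate_context, so errors can report which btree and level a bad key came from. A minimal sketch of the call-site pattern, assuming only the fields visible in these hunks (the wrapper function itself is illustrative, not part of the patch):

	/* Hypothetical helper showing the new validate-context pattern. */
	static int example_validate_node_key(struct bch_fs *c, struct btree *b,
					     struct bkey_s_c k,
					     enum bch_validate_flags flags)
	{
		struct bkey_validate_context from = {
			.from	= BKEY_VALIDATE_btree_node,	/* where the key was seen */
			.level	= b->c.level,			/* reported on validate errors */
			.btree	= b->c.btree_id,
			.flags	= flags,
		};

		return __bch2_bkey_validate(c, k, from);
	}

diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index eef9b89c561dd..a6aab81c64332 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -270,8 +270,10 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) BUG_ON(!(iter->flags & BTREE_ITER_all_snapshots) && iter->pos.snapshot != iter->snapshot); - BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || - bkey_gt(iter->pos, iter->k.p)); + BUG_ON(iter->flags & BTREE_ITER_all_snapshots ? !bpos_eq(iter->pos, iter->k.p) : + !(iter->flags & BTREE_ITER_is_extents) ? !bkey_eq(iter->pos, iter->k.p) : + (bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p))); } static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) @@ -327,7 +329,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); struct btree_path *path; struct trans_for_each_path_inorder_iter iter; @@ -720,7 +722,7 @@ static inline int btree_path_lock_root(struct btree_trans *trans, unsigned long trace_ip) { struct bch_fs *c = trans->c; - struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + struct btree_root *r = bch2_btree_id_root(c, path->btree_id); enum six_lock_type lock_type; unsigned i; int ret; @@ -728,7 +730,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, EBUG_ON(path->nodes_locked); while (1) { - b = READ_ONCE(*rootp); + struct btree *b = READ_ONCE(r->b); + if (unlikely(!b)) { + BUG_ON(!r->error); + return r->error; + } + path->level = READ_ONCE(b->c.level); if (unlikely(path->level < depth_want)) { @@ -748,14 +755,12 @@ static inline int btree_path_lock_root(struct btree_trans *trans, ret = btree_node_lock(trans, path, &b->c, path->level, lock_type, trace_ip); if (unlikely(ret)) { - if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) - continue; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; BUG(); } - if (likely(b == READ_ONCE(*rootp) && + if (likely(b == READ_ONCE(r->b) && b->c.level == path->level && !race_fault())) { for (i = 0; i < path->level; i++) @@ -825,6 +830,8 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p bch2_bkey_buf_init(&tmp); + jiter->fail_if_too_many_whiteouts = true; + while (nr-- && !ret) { if (!bch2_btree_node_relock(trans, path, path->level)) break; @@ -1000,7 +1007,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans 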
*trans) bch2_trans_unlock(trans); cond_resched(); - trans_set_locked(trans); + trans_set_locked(trans, false); if (unlikely(trans->memory_allocation_failure)) { struct closure cl; @@ -1267,7 +1274,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, { int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); EBUG_ON(!trans->paths[path_idx].ref); trace_btree_path_set_pos(trans, trans->paths + path_idx, &new_pos); @@ -1427,17 +1434,31 @@ void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_ (void *) trans->last_begin_ip); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) +static void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct printbuf buf = PRINTBUF; + bch2_prt_backtrace(&buf, &trans->last_restarted_trace); + panic("in transaction restart: %s, last restarted by\n%s", + bch2_err_str(trans->restarted), + buf.buf); +#else panic("in transaction restart: %s, last restarted by %pS\n", bch2_err_str(trans->restarted), (void *) trans->last_restarted_ip); +#endif } -void __noreturn bch2_trans_unlocked_error(struct btree_trans *trans) +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *trans) { - panic("trans should be locked, unlocked by %pS\n", - (void *) trans->last_unlock_ip); + if (trans->restarted) + bch2_trans_in_restart_error(trans); + + if (!trans->locked) + panic("trans should be locked, unlocked by %pS\n", + (void *) trans->last_unlock_ip); + + BUG(); } noinline __cold @@ -1450,10 +1471,11 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) trans_for_each_update(trans, i) { struct bkey_s_c old = { &i->old_k, i->old_v }; - prt_printf(buf, "update: btree=%s cached=%u %pS\n", - bch2_btree_id_str(i->btree_id), - i->cached, - (void *) i->ip_allocated); + prt_str(buf, "update: btree="); + bch2_btree_id_to_text(buf, i->btree_id); + prt_printf(buf, " cached=%u %pS\n", + i->cached, + (void *) i->ip_allocated); prt_printf(buf, " old "); bch2_bkey_val_to_text(buf, trans->c, old); @@ -1486,13 +1508,13 @@ static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_tra { struct btree_path *path = trans->paths + path_idx; - prt_printf(out, "path: idx %3u ref %u:%u %c %c %c btree=%s l=%u pos ", + prt_printf(out, "path: idx %3u ref %u:%u %c %c %c ", path_idx, path->ref, path->intent_ref, path->preserve ? 'P' : ' ', path->should_be_locked ? 'S' : ' ', - path->cached ? 'C' : 'B', - bch2_btree_id_str(path->btree_id), - path->level); + path->cached ? 
'C' : 'B'); + bch2_btree_id_level_to_text(out, path->btree_id, path->level); + prt_str(out, " pos "); bch2_bpos_to_text(out, path->pos); if (!path->cached && btree_node_locked(path, path->level)) { @@ -1717,8 +1739,7 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, struct trans_for_each_path_inorder_iter iter; btree_path_idx_t path_pos = 0, path_idx; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_trans_verify_locks(trans); btree_trans_sort_paths(trans); @@ -1833,7 +1854,7 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * !bkey_eq(path->pos, ck->key.pos)); *u = ck->k->k; - k = bkey_i_to_s_c(ck->k); + k = (struct bkey_s_c) { u, &ck->k->v }; } return k; @@ -1843,7 +1864,6 @@ struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey * return (struct bkey_s_c) { u, NULL }; } - void bch2_set_btree_iter_dontneed(struct btree_iter *iter) { struct btree_trans *trans = iter->trans; @@ -1870,7 +1890,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) struct btree_trans *trans = iter->trans; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); iter->path = bch2_btree_path_set_pos(trans, iter->path, btree_iter_search_key(iter), @@ -1945,7 +1965,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); @@ -2101,7 +2121,7 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, { struct btree_path *path = btree_iter_path(trans, iter); - return bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + return bch2_journal_keys_peek_max(trans->c, iter->btree_id, path->level, path->pos, end_pos, @@ -2124,21 +2144,47 @@ struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, } static noinline -struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +void btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) { struct btree_path *path = btree_iter_path(trans, iter); struct bkey_i *next_journal = bch2_btree_journal_peek(trans, iter, - k.k ? k.k->p : path_l(path)->b->key.k.p); - + k->k ? k->k->p : path_l(path)->b->key.k.p); if (next_journal) { iter->k = next_journal->k; - k = bkey_i_to_s_c(next_journal); + *k = bkey_i_to_s_c(next_journal); } +} - return k; +static struct bkey_i *bch2_btree_journal_peek_prev(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) +{ + struct btree_path *path = btree_iter_path(trans, iter); + + return bch2_journal_keys_peek_prev_min(trans->c, iter->btree_id, + path->level, + path->pos, + end_pos, + &iter->journal_idx); +} + +static noinline +void btree_trans_peek_prev_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c *k) +{ + struct btree_path *path = btree_iter_path(trans, iter); + struct bkey_i *next_journal = + bch2_btree_journal_peek_prev(trans, iter, + k->k ? 
k->k->p : path_l(path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; + *k = bkey_i_to_s_c(next_journal); + } } /* @@ -2154,8 +2200,7 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos struct bkey_s_c k; int ret; - bch2_trans_verify_not_in_restart(trans); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if ((iter->flags & BTREE_ITER_key_cache_fill) && bpos_eq(iter->pos, pos)) @@ -2184,10 +2229,15 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos btree_path_set_should_be_locked(trans, trans->paths + iter->key_cache_path); k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u); - if (k.k && !bkey_err(k)) { - iter->k = u; - k.k = &iter->k; - } + if (!k.k) + return k; + + if ((iter->flags & BTREE_ITER_all_snapshots) && + !bpos_eq(pos, k.k->p)) + return bkey_s_c_null; + + iter->k = u; + k.k = &iter->k; return k; } @@ -2201,8 +2251,6 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp bch2_btree_iter_verify(iter); while (1) { - struct btree_path_level *l; - iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); @@ -2212,17 +2260,17 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out; + break; } struct btree_path *path = btree_iter_path(trans, iter); - l = path_l(path); + struct btree_path_level *l = path_l(path); if (unlikely(!l->b)) { /* No btree nodes at requested level: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } btree_path_set_should_be_locked(trans, path); @@ -2233,15 +2281,14 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp k.k && (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { k = k2; - ret = bkey_err(k); - if (ret) { + if (bkey_err(k)) { bch2_btree_iter_set_pos(iter, iter->pos); - goto out; + break; } } if (unlikely(iter->flags & BTREE_ITER_with_journal)) - k = btree_trans_peek_journal(trans, iter, k); + btree_trans_peek_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) @@ -2270,32 +2317,32 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp /* End of btree: */ bch2_btree_iter_set_pos(iter, SPOS_MAX); k = bkey_s_c_null; - goto out; + break; } } -out: - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(iter); return k; } /** - * bch2_btree_iter_peek_upto() - returns first key greater than or equal to + * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). 
*/ -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) { struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; - struct bpos iter_pos; + struct bpos iter_pos = iter->pos; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); if (iter->update_path) { @@ -2304,8 +2351,6 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e iter->update_path = 0; } - bch2_btree_iter_verify_entry_exit(iter); - while (1) { k = __bch2_btree_iter_peek(iter, search_key); if (unlikely(!k.k)) @@ -2313,79 +2358,79 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e if (unlikely(bkey_err(k))) goto out_no_locked; - /* - * We need to check against @end before FILTER_SNAPSHOTS because - * if we get to a different inode that requested we might be - * seeing keys for a different snapshot tree that will all be - * filtered out. - * - * But we can't do the full check here, because bkey_start_pos() - * isn't monotonically increasing before FILTER_SNAPSHOTS, and - * that's what we check against in extents mode: - */ - if (unlikely(!(iter->flags & BTREE_ITER_is_extents) - ? bkey_gt(k.k->p, end) - : k.k->p.inode > end.inode)) - goto end; + if (iter->flags & BTREE_ITER_filter_snapshots) { + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode than requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + * + * But we can't do the full check here, because bkey_start_pos() + * isn't monotonically increasing before FILTER_SNAPSHOTS, and + * that's what we check against in extents mode: + */ + if (unlikely(!(iter->flags & BTREE_ITER_is_extents) + ? 
bkey_gt(k.k->p, end) + : k.k->p.inode > end.inode)) + goto end; + + if (iter->update_path && + !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); + iter->update_path = 0; + } - if (iter->update_path && - !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); - iter->update_path = 0; - } + if ((iter->flags & BTREE_ITER_intent) && + !(iter->flags & BTREE_ITER_is_extents) && + !iter->update_path) { + struct bpos pos = k.k->p; - if ((iter->flags & BTREE_ITER_filter_snapshots) && - (iter->flags & BTREE_ITER_intent) && - !(iter->flags & BTREE_ITER_is_extents) && - !iter->update_path) { - struct bpos pos = k.k->p; + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } - if (pos.snapshot < iter->snapshot) { - search_key = bpos_successor(k.k->p); - continue; - } + pos.snapshot = iter->snapshot; - pos.snapshot = iter->snapshot; + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_intent, + _THIS_IP_); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + } /* - * advance, same as on exit for iter->path, but only up - * to snapshot + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: */ - __btree_path_get(trans, trans->paths + iter->path, iter->flags & BTREE_ITER_intent); - iter->update_path = iter->path; - - iter->update_path = bch2_btree_path_set_pos(trans, - iter->update_path, pos, - iter->flags & BTREE_ITER_intent, - _THIS_IP_); - ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); - if (unlikely(ret)) { - k = bkey_s_c_err(ret); - goto out_no_locked; + if (!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; } - } - - /* - * We can never have a key in a leaf node at POS_MAX, so - * we don't have to check these successor() calls: - */ - if ((iter->flags & BTREE_ITER_filter_snapshots) && - !bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - search_key = bpos_successor(k.k->p); - continue; - } - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_all_snapshots)) { - search_key = bkey_successor(iter, k.k->p); - continue; + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_key_cache_fill)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } } /* - * iter->pos should be mononotically increasing, and always be + * iter->pos should be monotonically increasing, and always be * equal to the key we just returned - except extents can * straddle iter->pos: */ @@ -2451,127 +2496,204 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) return bch2_btree_iter_peek(iter); } -/** - * bch2_btree_iter_peek_prev() - returns first key less than or equal to - * iterator's current position - * @iter: iterator to peek from - * - * Returns: key if found, or an error extractable with bkey_err(). 
- */ -struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) { struct btree_trans *trans = iter->trans; - struct bpos search_key = iter->pos; - struct bkey_s_c k; - struct bkey saved_k; - const struct bch_val *saved_v; - btree_path_idx_t saved_path = 0; - int ret; - - bch2_trans_verify_not_unlocked(trans); - EBUG_ON(btree_iter_path(trans, iter)->cached || - btree_iter_path(trans, iter)->level); - - if (iter->flags & BTREE_ITER_with_journal) - return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); + struct bkey_s_c k, k2; bch2_btree_iter_verify(iter); - bch2_btree_iter_verify_entry_exit(iter); - - if (iter->flags & BTREE_ITER_filter_snapshots) - search_key.snapshot = U32_MAX; while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, - iter->flags & BTREE_ITER_intent, - btree_iter_ip_allocated(iter)); + iter->flags & BTREE_ITER_intent, + btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ bch2_btree_iter_set_pos(iter, iter->pos); k = bkey_s_c_err(ret); - goto out_no_locked; + break; } struct btree_path *path = btree_iter_path(trans, iter); + struct btree_path_level *l = path_l(path); - k = btree_path_level_peek(trans, path, &path->l[0], &iter->k); - if (!k.k || - ((iter->flags & BTREE_ITER_is_extents) - ? bpos_ge(bkey_start_pos(k.k), search_key) - : bpos_gt(k.k->p, search_key))) - k = btree_path_level_prev(trans, path, &path->l[0], &iter->k); + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + break; + } + + btree_path_set_should_be_locked(trans, path); + + k = btree_path_level_peek_all(trans->c, l, &iter->k); + if (!k.k || bpos_gt(k.k->p, search_key)) { + k = btree_path_level_prev(trans, path, l, &iter->k); + + BUG_ON(k.k && bpos_gt(k.k->p, search_key)); + } + + if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + if (bkey_err(k2)) { + bch2_btree_iter_set_pos(iter, iter->pos); + break; + } + } + + if (unlikely(iter->flags & BTREE_ITER_with_journal)) + btree_trans_peek_prev_journal(trans, iter, &k); if (unlikely((iter->flags & BTREE_ITER_with_updates) && trans->nr_updates)) bch2_btree_trans_peek_prev_updates(trans, iter, &k); - if (likely(k.k)) { - if (iter->flags & BTREE_ITER_filter_snapshots) { - if (k.k->p.snapshot == iter->snapshot) - goto got_key; + if (likely(k.k && !bkey_deleted(k.k))) { + break; + } else if (k.k) { + search_key = bpos_predecessor(k.k->p); + } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; + break; + } + } + + bch2_btree_iter_verify(iter); + return k; +} + +/** + * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to + * iterator's current position + * @iter: iterator to peek from + * @end: search limit: returns keys greater than or equal to @end + * + * Returns: key if found, or an error extractable with bkey_err(). 
+ */ +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) +{ + if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && + !bkey_eq(iter->pos, POS_MAX)) { + /* + * bkey_start_pos(), for extents, is not monotonically + * increasing until after filtering for snapshots: + * + * Thus, for extents we need to search forward until we find a + * real visible extent - easiest to just use peek_slot() (which + * internally uses peek() for extents) + */ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) + return k; + + if (!bkey_deleted(k.k) && + (!(iter->flags & BTREE_ITER_is_extents) || + bkey_lt(bkey_start_pos(k.k), iter->pos))) + return k; + } + struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; btree_path_idx_t saved_path = 0; + bch2_trans_verify_not_unlocked_or_in_restart(trans); + bch2_btree_iter_verify_entry_exit(iter); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + while (1) { + k = __bch2_btree_iter_peek_prev(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) + goto out_no_locked; + + if (iter->flags & BTREE_ITER_filter_snapshots) { + struct btree_path *s = saved_path ? trans->paths + saved_path : NULL; + if (s && bpos_lt(k.k->p, SPOS(s->pos.inode, s->pos.offset, iter->snapshot))) { + /* + * If we have a saved candidate, and we're past + * the last possible snapshot overwrite, return + * it: + */ + bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_intent); + iter->path = saved_path; + saved_path = 0; + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); + break; + } + + /* + * We need to check against @end before FILTER_SNAPSHOTS because + * if we get to a different inode than requested we might be + * seeing keys for a different snapshot tree that will all be + * filtered out. + */ + if (unlikely(bkey_lt(k.k->p, end))) + goto end; + + if (!bch2_snapshot_is_ancestor(trans->c, iter->snapshot, k.k->p.snapshot)) { + search_key = bpos_predecessor(k.k->p); + continue; + } + + if (k.k->p.snapshot != iter->snapshot) { /* + * Have a key visible in iter->snapshot, but + * might have overwrites: - save it and keep + * searching. 
Unless it's a whiteout - then drop + * our previous saved candidate: */ - if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { - bch2_path_put_nokeep(trans, iter->path, - iter->flags & BTREE_ITER_intent); - iter->path = saved_path; + if (saved_path) { + bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_intent); saved_path = 0; - iter->k = saved_k; - k.v = saved_v; - goto got_key; } - if (bch2_snapshot_is_ancestor(trans->c, - iter->snapshot, - k.k->p.snapshot)) { - if (saved_path) - bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_intent); + if (!bkey_whiteout(k.k)) { saved_path = btree_path_clone(trans, iter->path, iter->flags & BTREE_ITER_intent, _THIS_IP_); - path = btree_iter_path(trans, iter); - trace_btree_path_save_pos(trans, path, trans->paths + saved_path); - saved_k = *k.k; - saved_v = k.v; + trace_btree_path_save_pos(trans, + trans->paths + iter->path, + trans->paths + saved_path); } search_key = bpos_predecessor(k.k->p); continue; } -got_key: - if (bkey_whiteout(k.k) && - !(iter->flags & BTREE_ITER_all_snapshots)) { + + if (bkey_whiteout(k.k)) { search_key = bkey_predecessor(iter, k.k->p); - if (iter->flags & BTREE_ITER_filter_snapshots) - search_key.snapshot = U32_MAX; + search_key.snapshot = U32_MAX; continue; } - - btree_path_set_should_be_locked(trans, path); - break; - } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) { - /* Advance to previous leaf node: */ - search_key = bpos_predecessor(path->l[0].b->data->min_key); - } else { - /* Start of btree: */ - bch2_btree_iter_set_pos(iter, POS_MIN); - k = bkey_s_c_null; - goto out_no_locked; } - } - EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); + EBUG_ON(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(k.k->p, iter->pos) : + iter->flags & BTREE_ITER_is_extents ? bkey_ge(bkey_start_pos(k.k), iter->pos) : + bkey_gt(k.k->p, iter->pos)); + + if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_lt(k.k->p, end) : + iter->flags & BTREE_ITER_is_extents ? 
bkey_le(k.k->p, end) : + bkey_lt(k.k->p, end))) + goto end; + + break; + } /* Extents can straddle iter->pos: */ - if (bkey_lt(k.k->p, iter->pos)) - iter->pos = k.k->p; + iter->pos = bpos_min(iter->pos, k.k->p); if (iter->flags & BTREE_ITER_filter_snapshots) iter->pos.snapshot = iter->snapshot; @@ -2581,8 +2703,11 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); bch2_btree_iter_verify(iter); - return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; }
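With peek_upto() renamed to peek_max() and the new peek_prev_min() taking an explicit lower bound, forward and backward iteration now clamp symmetrically to an end position. A hedged sketch of driving both directions over one inode's extents; the btree choice, inode and snapshot values are illustrative, not taken from this patch:

	/* Illustrative only: bounded forward scan, then a bounded backwards peek. */
	static int example_scan_extents(struct btree_trans *trans, u64 inum, u32 snapshot)
	{
		struct btree_iter iter;
		struct bkey_s_c k;
		int ret;

		/* forward: visits keys from offset 0 up to and including U64_MAX */
		for_each_btree_key_max_norestart(trans, iter, BTREE_ID_extents,
						 SPOS(inum, 0, snapshot),
						 SPOS(inum, U64_MAX, snapshot),
						 0, k, ret)
			;
		bch2_trans_iter_exit(trans, &iter);
		if (ret)
			return ret;

		/* backward: first key at or before iter->pos, bounded below by @end */
		bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
				     SPOS(inum, U64_MAX, snapshot), 0);
		k = bch2_btree_iter_peek_prev_min(&iter, SPOS(inum, 0, snapshot));
		ret = bkey_err(k);
		bch2_trans_iter_exit(trans, &iter);
		return ret;
	}

/** @@ -2607,7 +2732,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct bkey_s_c k; int ret; - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); @@ -2658,6 +2783,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); if (unlikely(!k.k)) goto out_no_locked; + + if (unlikely(k.k->type == KEY_TYPE_whiteout && + (iter->flags & BTREE_ITER_filter_snapshots) && + !(iter->flags & BTREE_ITER_key_cache_fill))) + iter->k.type = KEY_TYPE_deleted; } else { struct bpos next; struct bpos end = iter->pos; @@ -2671,7 +2801,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) struct btree_iter iter2; bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_upto(&iter2, end); + k = bch2_btree_iter_peek_max(&iter2, end); if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); @@ -2682,7 +2812,7 @@ } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_upto(iter, end); + k = bch2_btree_iter_peek_max(iter, end); if (unlikely(bkey_err(k))) bch2_btree_iter_set_pos(iter, pos); else @@ -2902,7 +3032,7 @@ void bch2_trans_iter_init_outlined(struct btree_trans *trans, unsigned flags) { bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _RET_IP_); } @@ -2918,8 +3048,11 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, flags |= BTREE_ITER_snapshot_field; flags |= BTREE_ITER_all_snapshots; + if (!depth && btree_id_cached(trans->c, btree_id)) + flags |= BTREE_ITER_with_key_cache; + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, - __bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, depth, flags), _RET_IP_); iter->min_depth = depth; @@ -3083,7 +3216,7 @@ u32 bch2_trans_begin(struct btree_trans *trans) /* * If the transaction wasn't restarted, we're presuming to be - * doing something new: dont keep iterators excpt the ones that + * doing something new: don't keep iterators except the ones that * are in use - except for the subvolumes btree: */ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) @@ -3122,14 +3255,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; - trans_set_locked(trans); + trans_set_locked(trans, false); if (trans->restarted) { bch2_btree_path_traverse_all(trans); trans->notrace_relock_fail = false; } - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); return trans->restart_count; } @@ -3228,7 +3361,7 @@ struct btree_trans *__bch2_trans_get(struct 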
bch_fs *c, unsigned fn_idx) trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); trans->srcu_lock_time = jiffies; trans->srcu_held = true; - trans_set_locked(trans); + trans_set_locked(trans, false); closure_init_stack_release(&trans->ref); return trans; @@ -3262,6 +3395,9 @@ void bch2_trans_put(struct btree_trans *trans) { struct bch_fs *c = trans->c; + if (trans->restarted) + bch2_trans_in_restart_error(trans); + bch2_trans_unlock(trans); trans_for_each_update(trans, i) @@ -3285,6 +3421,10 @@ void bch2_trans_put(struct btree_trans *trans) closure_return_sync(&trans->ref); trans->locking_wait.task = NULL; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); +#endif + unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; trans->paths = NULL; @@ -3338,8 +3478,9 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, pid = owner ? owner->pid : 0; rcu_read_unlock(); - prt_printf(out, "\t%px %c l=%u %s:", b, b->cached ? 'c' : 'b', - b->level, bch2_btree_id_str(b->btree_id)); + prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, b->btree_id); + prt_printf(out, " l=%u:", b->level); bch2_bpos_to_text(out, btree_node_pos(b)); prt_printf(out, "\t locks %u:%u:%u held by pid %u", @@ -3378,11 +3519,11 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) if (!path->nodes_locked) continue; - prt_printf(out, " path %u %c l=%u %s:", - idx, - path->cached ? 'c' : 'b', - path->level, - bch2_btree_id_str(path->btree_id)); + prt_printf(out, " path %u %c ", + idx, + path->cached ? 'c' : 'b'); + bch2_btree_id_to_text(out, path->btree_id); + prt_printf(out, " l=%u:", path->level); bch2_bpos_to_text(out, path->pos); prt_newline(out); @@ -3488,7 +3629,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c) #ifdef CONFIG_LOCKDEP fs_reclaim_acquire(GFP_KERNEL); struct btree_trans *trans = bch2_trans_get(c); - trans_set_locked(trans); + trans_set_locked(trans, false); bch2_trans_put(trans); fs_reclaim_release(GFP_KERNEL); #endif diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 0bda054f80d76..07e0210c53aea 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -6,6 +6,8 @@ #include "btree_types.h" #include "trace.h" +#include + void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); @@ -23,6 +25,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path { unsigned idx = path - trans->paths; + EBUG_ON(idx >= trans->nr_paths); EBUG_ON(!test_bit(idx, trans->paths_allocated)); if (unlikely(path->ref == U8_MAX)) { bch2_dump_trans_paths_updates(trans); @@ -36,6 +39,7 @@ static inline void __btree_path_get(struct btree_trans *trans, struct btree_path static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) { + EBUG_ON(path - trans->paths >= trans->nr_paths); EBUG_ON(!test_bit(path - trans->paths, trans->paths_allocated)); EBUG_ON(!path->ref); EBUG_ON(!path->intent_ref && intent); @@ -234,12 +238,12 @@ int __must_check bch2_btree_path_traverse_one(struct btree_trans *, btree_path_idx_t, unsigned, unsigned long); -static inline void bch2_trans_verify_not_unlocked(struct btree_trans *); +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *); static inline int __must_check 
bch2_btree_path_traverse(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK) return 0; @@ -324,38 +328,33 @@ static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, bch2_trans_restart_error(trans, restart_count); } -void __noreturn bch2_trans_in_restart_error(struct btree_trans *); +void __noreturn bch2_trans_unlocked_or_in_restart_error(struct btree_trans *); -static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_trans *trans) { - if (trans->restarted) - bch2_trans_in_restart_error(trans); -} - -void __noreturn bch2_trans_unlocked_error(struct btree_trans *); - -static inline void bch2_trans_verify_not_unlocked(struct btree_trans *trans) -{ - if (!trans->locked) - bch2_trans_unlocked_error(trans); + if (trans->restarted || !trans->locked) + bch2_trans_unlocked_or_in_restart_error(trans); } __always_inline -static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; - trans->last_restarted_ip = _THIS_IP_; + trans->last_restarted_ip = ip; +#ifdef CONFIG_BCACHEFS_DEBUG + darray_exit(&trans->last_restarted_trace); + bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); +#endif return -err; } __always_inline static int btree_trans_restart(struct btree_trans *trans, int err) { - btree_trans_restart_nounlock(trans, err); - return -err; + return btree_trans_restart_ip(trans, err, _THIS_IP_); } bool bch2_btree_node_upgrade(struct btree_trans *, @@ -384,15 +383,21 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *); struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); struct btree *bch2_btree_iter_next_node(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) { - return bch2_btree_iter_peek_upto(iter, SPOS_MAX); + return bch2_btree_iter_peek_max(iter, SPOS_MAX); +} + +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); + +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_prev_min(iter, POS_MIN); } -struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); @@ -443,10 +448,17 @@ static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 sna void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned level, + unsigned flags) { + if (level || !btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_cached; + flags &= ~BTREE_ITER_with_key_cache; + } else if (!(flags & BTREE_ITER_cached)) + flags |= BTREE_ITER_with_key_cache; + if 
(!(flags & (BTREE_ITER_all_snapshots|BTREE_ITER_not_extents)) && btree_id_is_extents(btree_id)) flags |= BTREE_ITER_is_extents; @@ -465,19 +477,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, return flags; } -static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, - unsigned btree_id, - unsigned flags) -{ - if (!btree_id_cached(trans->c, btree_id)) { - flags &= ~BTREE_ITER_cached; - flags &= ~BTREE_ITER_with_key_cache; - } else if (!(flags & BTREE_ITER_cached)) - flags |= BTREE_ITER_with_key_cache; - - return __bch2_btree_iter_flags(trans, btree_id, flags); -} - static inline void bch2_trans_iter_init_common(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -514,7 +513,7 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, if (__builtin_constant_p(btree_id) && __builtin_constant_p(flags)) bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, - bch2_btree_iter_flags(trans, btree_id, flags), + bch2_btree_iter_flags(trans, btree_id, 0, flags), _THIS_IP_); else bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); @@ -593,13 +592,18 @@ static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ _btree_id, _pos, _flags, KEY_TYPE_##_type)) +static inline void __bkey_val_copy(void *dst_v, unsigned dst_size, struct bkey_s_c src_k) +{ + unsigned b = min_t(unsigned, dst_size, bkey_val_bytes(src_k.k)); + memcpy(dst_v, src_k.v, b); + if (unlikely(b < dst_size)) + memset(dst_v + b, 0, dst_size - b); +} + #define bkey_val_copy(_dst_v, _src_k) \ do { \ - unsigned b = min_t(unsigned, sizeof(*_dst_v), \ - bkey_val_bytes(_src_k.k)); \ - memcpy(_dst_v, _src_k.v, b); \ - if (b < sizeof(*_dst_v)) \ - memset((void *) (_dst_v) + b, 0, sizeof(*_dst_v) - b); \ + BUILD_BUG_ON(!__typecheck(*_dst_v, *_src_k.v)); \ + __bkey_val_copy(_dst_v, sizeof(*_dst_v), _src_k.s_c); \ } while (0) static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, @@ -608,17 +612,10 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, unsigned val_size, void *val) { struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); - ret = bkey_err(k); + struct bkey_s_c k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + int ret = bkey_err(k); if (!ret) { - unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); - - memcpy(val, k.v, b); - if (unlikely(b < sizeof(*val))) - memset((void *) val + b, 0, sizeof(*val) - b); + __bkey_val_copy(val, val_size, k); bch2_trans_iter_exit(trans, &iter); } @@ -677,12 +674,12 @@ static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, bch2_btree_iter_peek(iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, struct bpos end, unsigned flags) { if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_upto(iter, end); + return bch2_btree_iter_peek_max(iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; @@ -746,7 +743,7 @@ transaction_restart: \ _ret2 ?: trans_was_restarted(_trans, _restart_count); \ }) -#define for_each_btree_key_upto_continue(_trans, _iter, \ +#define for_each_btree_key_max_continue(_trans, _iter, \ _end, _flags, _k, _do) \ ({ \ struct bkey_s_c _k; \ @@ -754,7 +751,7 @@ transaction_restart: \ \ do { \ _ret3 = 
lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ @@ -768,9 +765,9 @@ transaction_restart: \ }) #define for_each_btree_key_continue(_trans, _iter, _flags, _k, _do) \ - for_each_btree_key_upto_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) + for_each_btree_key_max_continue(_trans, _iter, SPOS_MAX, _flags, _k, _do) -#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _do) \ ({ \ bch2_trans_begin(trans); \ @@ -779,12 +776,12 @@ transaction_restart: \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - for_each_btree_key_upto_continue(_trans, _iter, _end, _flags, _k, _do);\ + for_each_btree_key_max_continue(_trans, _iter, _end, _flags, _k, _do);\ }) #define for_each_btree_key(_trans, _iter, _btree_id, \ _start, _flags, _k, _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, \ SPOS_MAX, _flags, _k, _do) #define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ @@ -828,33 +825,33 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_commit(_trans, _iter, _btree_id, \ _start, _end, _iter_flags, _k, \ _disk_res, _journal_seq, _commit_flags,\ _do) \ - for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + for_each_btree_key_max(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ +#define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) -#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ +#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ for (; \ - (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ bch2_btree_iter_advance(&(_iter))) #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ - for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\ + for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) #define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ @@ -866,7 +863,7 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); bch2_btree_iter_rewind(&(_iter))) #define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) + for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) /* * This should not be used in a fastpath, without first trying _do in @@ -879,29 +876,33 @@ struct bkey_s_c 
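
For orientation, a hypothetical caller of the renamed no-restart iterator might look like this (illustrative function; the caller handles transaction_restart errors itself):

        static int count_keys_in_range(struct btree_trans *trans,
                                       enum btree_id btree,
                                       struct bpos start, struct bpos end,
                                       u64 *nr)
        {
                struct btree_iter iter;
                struct bkey_s_c k;
                int ret;

                *nr = 0;
                for_each_btree_key_max_norestart(trans, iter, btree,
                                                 start, end, 0, k, ret)
                        (*nr)++;
                bch2_trans_iter_exit(trans, &iter);
                return ret;
        }
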
bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); (_do) ?: bch2_trans_relock(_trans); \ }) -#define allocate_dropping_locks_errcode(_trans, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - int _ret = _do; \ - \ - if (bch2_err_matches(_ret, ENOMEM)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, _do); \ - } \ - _ret; \ +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ }) -#define allocate_dropping_locks(_trans, _ret, _do) \ -({ \ - gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ - typeof(_do) _p = _do; \ - \ - _ret = 0; \ - if (unlikely(!_p)) { \ - _gfp = GFP_KERNEL; \ - _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ - } \ - _p; \ +#define allocate_dropping_locks_errcode(_trans, _do) \ +({ \ + int _ret = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\ + \ + if (bch2_err_matches(_ret, ENOMEM)) { \ + _ret = drop_locks_do(_trans, _do); \ + } \ + _ret; \ +}) + +#define allocate_dropping_locks(_trans, _ret, _do) \ +({ \ + typeof(_do) _p = memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, _do);\ + \ + _ret = 0; \ + if (unlikely(!_p)) { \ + _ret = drop_locks_do(_trans, ((_p = _do), 0)); \ + } \ + _p; \ }) #define bch2_trans_run(_c, _do) \ diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index c1657182c2758..6d25e3f85ce89 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -16,6 +16,17 @@ * operations for the regular btree iter code to use: */ +static inline size_t pos_to_idx(struct journal_keys *keys, size_t pos) +{ + size_t gap_size = keys->size - keys->nr; + + BUG_ON(pos >= keys->gap && pos < keys->gap + gap_size); + + if (pos >= keys->gap) + pos -= gap_size; + return pos; +} + static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) { size_t gap_size = keys->size - keys->nr; @@ -61,7 +72,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys, } /* Returns first non-overwritten key >= search key: */ -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bpos pos, struct bpos end_pos, size_t *idx) { @@ -84,27 +95,92 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree } } + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + while ((k = *idx < keys->nr ? 
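
The reworked allocate_dropping_locks() above boils down to a two-phase pattern: a non-blocking attempt under PF_MEMALLOC_NORECLAIM while locks are held, then, only on failure, a blocking retry with locks dropped. A sketch with hypothetical trans_unlock()/trans_relock() helpers standing in for the real drop_locks_do() machinery:

        static void *alloc_two_phase(struct btree_trans *trans, size_t size)
        {
                /* phase 1: process flags forbid reclaim, so this cannot
                 * block while btree locks are held */
                unsigned saved = memalloc_flags_save(PF_MEMALLOC_NORECLAIM |
                                                     PF_MEMALLOC_NOWARN);
                void *p = kmalloc(size, GFP_KERNEL);
                memalloc_noreclaim_restore(saved);

                if (!p) {
                        /* phase 2: drop locks, retry with reclaim allowed */
                        trans_unlock(trans);
                        p = kmalloc(size, GFP_KERNEL);
                        trans_relock(trans);
                }
                return p;
        }
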
idx_to_key(keys, *idx) : NULL)) { if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) - return NULL; + break; if (k->overwritten) { - (*idx)++; + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->end; + else + *idx += 1; continue; } - if (__journal_key_cmp(btree_id, level, pos, k) <= 0) - return k->k; + if (__journal_key_cmp(btree_id, level, pos, k) <= 0) { + ret = k->k; + break; + } (*idx)++; iters++; if (iters == 10) { *idx = 0; + rcu_read_unlock(); goto search; } } - return NULL; + rcu_read_unlock(); + return ret; +} + +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; + + BUG_ON(*idx > keys->nr); +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while (*idx && + __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) { + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + struct bkey_i *ret = NULL; + rcu_read_lock(); /* for overwritten_ranges */ + + while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) > 0) + break; + + if (k->overwritten) { + if (k->overwritten_range) + *idx = rcu_dereference(k->overwritten_range)->start - 1; + else + *idx -= 1; + continue; + } + + if (__journal_key_cmp(btree_id, level, pos, k) >= 0) { + ret = k->k; + break; + } + + --(*idx); + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + rcu_read_unlock(); + return ret; } struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, @@ -112,11 +188,12 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree { size_t idx = 0; - return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); + return bch2_journal_keys_peek_max(c, btree_id, level, pos, pos, &idx); } static void journal_iter_verify(struct journal_iter *iter) { +#ifdef CONFIG_BCACHEFS_DEBUG struct journal_keys *keys = iter->keys; size_t gap_size = keys->size - keys->nr; @@ -126,10 +203,10 @@ static void journal_iter_verify(struct journal_iter *iter) if (iter->idx < keys->size) { struct journal_key *k = keys->data + iter->idx; - int cmp = cmp_int(k->btree_id, iter->btree_id) ?: - cmp_int(k->level, iter->level); - BUG_ON(cmp < 0); + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + BUG_ON(cmp > 0); } +#endif } static void journal_iters_fix(struct bch_fs *c) @@ -182,7 +259,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, * Ensure these keys are done last by journal replay, to unblock * journal reclaim: */ - .journal_seq = U32_MAX, + .journal_seq = U64_MAX, }; struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); @@ -290,6 +367,68 @@ bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree, bkey_deleted(&keys->data[idx].k->k)); } +static void __bch2_journal_key_overwritten(struct journal_keys *keys, size_t pos) +{ + struct journal_key *k = keys->data + pos; + size_t idx = pos_to_idx(keys, pos); + + k->overwritten = true; + + struct journal_key *prev = idx > 0 ? keys->data + idx_to_pos(keys, idx - 1) : NULL; + struct journal_key *next = idx + 1 < keys->nr ? 
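
Reduced to its core, the overwritten_range mechanism used by both peek functions above works like this: every key in a contiguous overwritten run shares one {start, end} record, so iteration skips the whole run in a single step instead of one key at a time. A simplified sketch without the gap buffer or RCU:

        struct okey {
                bool overwritten;
                struct { size_t start, end; } *range;  /* shared by the run */
        };

        static struct okey *first_live_key(struct okey *keys, size_t nr,
                                           size_t idx)
        {
                while (idx < nr) {
                        struct okey *k = &keys[idx];

                        if (!k->overwritten)
                                return k;
                        /* jump over the whole overwritten run at once */
                        idx = k->range ? k->range->end : idx + 1;
                }
                return NULL;
        }
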
keys->data + idx_to_pos(keys, idx + 1) : NULL; + + bool prev_overwritten = prev && prev->overwritten; + bool next_overwritten = next && next->overwritten; + + struct journal_key_range_overwritten *prev_range = + prev_overwritten ? prev->overwritten_range : NULL; + struct journal_key_range_overwritten *next_range = + next_overwritten ? next->overwritten_range : NULL; + + BUG_ON(prev_range && prev_range->end != idx); + BUG_ON(next_range && next_range->start != idx + 1); + + if (prev_range && next_range) { + prev_range->end = next_range->end; + + keys->data[pos].overwritten_range = prev_range; + for (size_t i = next_range->start; i < next_range->end; i++) { + struct journal_key *ip = keys->data + idx_to_pos(keys, i); + BUG_ON(ip->overwritten_range != next_range); + ip->overwritten_range = prev_range; + } + + kfree_rcu_mightsleep(next_range); + } else if (prev_range) { + prev_range->end++; + k->overwritten_range = prev_range; + if (next_overwritten) { + prev_range->end++; + next->overwritten_range = prev_range; + } + } else if (next_range) { + next_range->start--; + k->overwritten_range = next_range; + if (prev_overwritten) { + next_range->start--; + prev->overwritten_range = next_range; + } + } else if (prev_overwritten || next_overwritten) { + struct journal_key_range_overwritten *r = kmalloc(sizeof(*r), GFP_KERNEL); + if (!r) + return; + + r->start = idx - (size_t) prev_overwritten; + r->end = idx + 1 + (size_t) next_overwritten; + + rcu_assign_pointer(k->overwritten_range, r); + if (prev_overwritten) + prev->overwritten_range = r; + if (next_overwritten) + next->overwritten_range = r; + } +} + void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, unsigned level, struct bpos pos) { @@ -299,8 +438,12 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, if (idx < keys->size && keys->data[idx].btree_id == btree && keys->data[idx].level == level && - bpos_eq(keys->data[idx].k->k.p, pos)) - keys->data[idx].overwritten = true; + bpos_eq(keys->data[idx].k->k.p, pos) && + !keys->data[idx].overwritten) { + mutex_lock(&keys->overwrite_lock); + __bch2_journal_key_overwritten(keys, idx); + mutex_unlock(&keys->overwrite_lock); + } } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -314,24 +457,32 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { + struct bkey_s_c ret = bkey_s_c_null; + journal_iter_verify(iter); + rcu_read_lock(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; - int cmp = cmp_int(k->btree_id, iter->btree_id) ?: - cmp_int(k->level, iter->level); - if (cmp > 0) + int cmp = __journal_key_btree_cmp(iter->btree_id, iter->level, k); + if (cmp < 0) break; BUG_ON(cmp); - if (!k->overwritten) - return bkey_i_to_s_c(k->k); + if (!k->overwritten) { + ret = bkey_i_to_s_c(k->k); + break; + } - bch2_journal_iter_advance(iter); + if (k->overwritten_range) + iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); + else + bch2_journal_iter_advance(iter); } + rcu_read_unlock(); - return bkey_s_c_null; + return ret; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -382,6 +533,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter : (level > 1 ? 
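
Written out, the coalescing cases handled by __bch2_journal_key_overwritten() above are as follows, where idx is the key that just became overwritten (a lone overwritten key with no overwritten neighbours keeps a NULL range):

        /*
         * [prev range][idx][next range] -> extend prev over next, free next
         * [prev range][idx]             -> prev->end++, idx joins prev
         *             [idx][next range] -> next->start--, idx joins next
         * [prev key?] [idx] [next key?] -> allocate a fresh range spanning
         *                                  idx plus any overwritten neighbour
         */
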
1 : 16); iter.prefetch = false; + iter.fail_if_too_many_whiteouts = true; bch2_bkey_buf_init(&tmp); while (nr--) { @@ -400,6 +552,7 @@ static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret; + size_t iters = 0; if (iter->prefetch && iter->journal.level) btree_and_journal_iter_prefetch(iter); @@ -407,6 +560,11 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter * if (iter->at_end) return bkey_s_c_null; + iters++; + + if (iters > 20 && iter->fail_if_too_many_whiteouts) + return bkey_s_c_null; + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && bpos_lt(btree_k.k->p, iter->pos)) bch2_journal_iter_advance_btree(iter); @@ -481,16 +639,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, /* sort and dedup all keys in the journal: */ -void bch2_journal_entries_free(struct bch_fs *c) -{ - struct journal_replay **i; - struct genradix_iter iter; - - genradix_for_each(&c->journal_entries, iter, i) - kvfree(*i); - genradix_free(&c->journal_entries); -} - /* * When keys compare equal, oldest compares first: */ @@ -515,15 +663,26 @@ void bch2_journal_keys_put(struct bch_fs *c) move_gap(keys, keys->nr); - darray_for_each(*keys, i) + darray_for_each(*keys, i) { + if (i->overwritten_range && + (i == &darray_last(*keys) || + i->overwritten_range != i[1].overwritten_range)) + kfree(i->overwritten_range); + if (i->allocated) kfree(i->k); + } kvfree(keys->data); keys->data = NULL; keys->nr = keys->gap = keys->size = 0; - bch2_journal_entries_free(c); + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + kvfree(*i); + genradix_free(&c->journal_entries); } static void __journal_keys_sort(struct journal_keys *keys) @@ -628,8 +787,20 @@ void bch2_journal_keys_dump(struct bch_fs *c) darray_for_each(*keys, i) { printbuf_reset(&buf); + prt_printf(&buf, "btree="); + bch2_btree_id_to_text(&buf, i->btree_id); + prt_printf(&buf, " l=%u ", i->level); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(i->k)); - pr_err("%s l=%u %s", bch2_btree_id_str(i->btree_id), i->level, buf.buf); + pr_err("%s", buf.buf); } printbuf_exit(&buf); } + +void bch2_fs_journal_keys_init(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + + atomic_set(&keys->ref, 1); + keys->initial_ref_held = true; + mutex_init(&keys->overwrite_lock); +} diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 1653de9d609bc..2a3082919b8d3 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -26,16 +26,24 @@ struct btree_and_journal_iter { struct bpos pos; bool at_end; bool prefetch; + bool fail_if_too_many_whiteouts; }; +static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, + unsigned l_level, + const struct journal_key *r) +{ + return -cmp_int(l_level, r->level) ?: + cmp_int(l_btree_id, r->btree_id); +} + static inline int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, const struct journal_key *r) { - return (cmp_int(l_btree_id, r->btree_id) ?: - cmp_int(l_level, r->level) ?: - bpos_cmp(l_pos, r->k->k.p)); + return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: + bpos_cmp(l_pos, r->k->k.p); } static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) @@ -43,7 +51,9 @@ static inline int journal_key_cmp(const 
struct journal_key *l, const struct jour return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } -struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, +struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); @@ -79,8 +89,6 @@ static inline void bch2_journal_keys_put_initial(struct bch_fs *c) c->journal_keys.initial_ref_held = false; } -void bch2_journal_entries_free(struct bch_fs *); - int bch2_journal_keys_sort(struct bch_fs *); void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, @@ -89,4 +97,6 @@ void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, void bch2_journal_keys_dump(struct bch_fs *); +void bch2_fs_journal_keys_init(struct bch_fs *); + #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ diff --git a/fs/bcachefs/btree_journal_iter_types.h b/fs/bcachefs/btree_journal_iter_types.h new file mode 100644 index 0000000000000..8b773823704f2 --- /dev/null +++ b/fs/bcachefs/btree_journal_iter_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H +#define _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H + +struct journal_key_range_overwritten { + size_t start, end; +}; + +struct journal_key { + u64 journal_seq; + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; + bool overwritten; + struct journal_key_range_overwritten __rcu * + overwritten_range; + struct bkey_i *k; +}; + +struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; + struct journal_key *data; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). 
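
Concretely, the index mapping implied by that layout is (a sketch mirroring the in-tree idx_to_pos() helper):

        static inline size_t idx_to_pos_sketch(size_t idx, size_t gap,
                                               size_t gap_size)
        {
                // entries at or past the gap sit gap_size slots higher
                return idx < gap ? idx : idx + gap_size;
        }

Moving the gap to a new insertion point costs one memmove of the entries between the old and new gap positions, which is why n in-order insertions cost O(n) total rather than O(n^2).
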
+ */ + size_t gap; + atomic_t ref; + bool initial_ref_held; + struct mutex overwrite_lock; +}; + +#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_TYPES_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 244610b1d0b59..8905e73174656 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -116,14 +116,14 @@ static void bkey_cached_free(struct btree_key_cache *bc, this_cpu_inc(*bc->nr_pending); } -static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) +static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s) { - gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; + gfp_t gfp = GFP_KERNEL|__GFP_ACCOUNT|__GFP_RECLAIMABLE; struct bkey_cached *ck = kmem_cache_zalloc(bch2_key_cache, gfp); if (unlikely(!ck)) return NULL; - ck->k = kmalloc(key_u64s * sizeof(u64), gfp); + ck->k = kmalloc(key_u64s * sizeof(u64), GFP_KERNEL); if (unlikely(!ck->k)) { kmem_cache_free(bch2_key_cache, ck); return NULL; @@ -147,7 +147,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k goto lock; ck = allocate_dropping_locks(trans, ret, - __bkey_cached_alloc(key_u64s, _gfp)); + __bkey_cached_alloc(key_u64s)); if (ret) { if (ck) kfree(ck->k); @@ -197,7 +197,9 @@ bkey_cached_reuse(struct btree_key_cache *c) return ck; } -static int btree_key_cache_create(struct btree_trans *trans, struct btree_path *path, +static int btree_key_cache_create(struct btree_trans *trans, + struct btree_path *path, + struct btree_path *ck_path, struct bkey_s_c k) { struct bch_fs *c = trans->c; @@ -217,7 +219,7 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * key_u64s = min(256U, (key_u64s * 3) / 2); key_u64s = roundup_pow_of_two(key_u64s); - struct bkey_cached *ck = bkey_cached_alloc(trans, path, key_u64s); + struct bkey_cached *ck = bkey_cached_alloc(trans, ck_path, key_u64s); int ret = PTR_ERR_OR_ZERO(ck); if (ret) return ret; @@ -226,22 +228,22 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * ck = bkey_cached_reuse(bc); if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", - bch2_btree_id_str(path->btree_id)); + bch2_btree_id_str(ck_path->btree_id)); return -BCH_ERR_ENOMEM_btree_key_cache_create; } } ck->c.level = 0; - ck->c.btree_id = path->btree_id; - ck->key.btree_id = path->btree_id; - ck->key.pos = path->pos; + ck->c.btree_id = ck_path->btree_id; + ck->key.btree_id = ck_path->btree_id; + ck->key.pos = ck_path->pos; ck->flags = 1U << BKEY_CACHED_ACCESSED; if (unlikely(key_u64s > ck->u64s)) { - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); struct bkey_i *new_k = allocate_dropping_locks(trans, ret, - kmalloc(key_u64s * sizeof(u64), _gfp)); + kmalloc(key_u64s * sizeof(u64), GFP_KERNEL)); if (unlikely(!new_k)) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(ck->key.btree_id), key_u64s); @@ -258,22 +260,29 @@ static int btree_key_cache_create(struct btree_trans *trans, struct btree_path * bkey_reassemble(ck->k, k); + ret = bch2_btree_node_lock_write(trans, path, &path_l(path)->b->c); + if (unlikely(ret)) + goto err; + ret = rhashtable_lookup_insert_fast(&bc->table, &ck->hash, bch2_btree_key_cache_params); + + bch2_btree_node_unlock_write(trans, path, path_l(path)->b); + if (unlikely(ret)) /* raced with another fill? 
*/ goto err; atomic_long_inc(&bc->nr_keys); six_unlock_write(&ck->c.lock); - enum six_lock_type lock_want = __btree_lock_want(path, 0); + enum six_lock_type lock_want = __btree_lock_want(ck_path, 0); if (lock_want == SIX_LOCK_read) six_lock_downgrade(&ck->c.lock); - btree_path_cached_set(trans, path, ck, (enum btree_node_locked_type) lock_want); - path->uptodate = BTREE_ITER_UPTODATE; + btree_path_cached_set(trans, ck_path, ck, (enum btree_node_locked_type) lock_want); + ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: bkey_cached_free(bc, ck); - mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); + mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; } @@ -293,6 +302,7 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, int ret; bch2_trans_iter_init(trans, &iter, ck_path->btree_id, ck_path->pos, + BTREE_ITER_intent| BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; @@ -306,9 +316,19 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (unlikely(ret)) goto out; - ret = btree_key_cache_create(trans, ck_path, k); + ret = btree_key_cache_create(trans, btree_iter_path(trans, &iter), ck_path, k); if (ret) goto err; + + if (trace_key_cache_fill_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); + } out: /* We're not likely to need this iterator again: */ bch2_set_btree_iter_dontneed(&iter); @@ -424,8 +444,15 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - ret = bch2_btree_iter_traverse(&b_iter) ?: - bch2_trans_update(trans, &b_iter, ck->k, + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + ret = bkey_err(btree_k); + if (ret) + goto err; + + /* * Check that we're not violating cache coherency rules: */ + BUG_ON(bkey_deleted(btree_k.k)); + + ret = bch2_trans_update(trans, &b_iter, ck->k, BTREE_UPDATE_key_cache_reclaim| BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun) ?: @@ -433,7 +460,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| commit_flags); - +err: bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index efe2a007b4828..d343df9f0ad2e 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -782,7 +782,7 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) return bch2_trans_relock_fail(trans, path, &f, trace); } - trans_set_locked(trans); + trans_set_locked(trans, true); out: bch2_trans_verify_locks(trans); return 0; diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 7c07f9fa9add3..7474ab6ce0191 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -188,10 +188,10 @@ int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); /* lock: */ -static inline void trans_set_locked(struct btree_trans *trans) +static inline void trans_set_locked(struct btree_trans *trans, bool try) { if (!trans->locked) { - lock_acquire_exclusive(&trans->dep_map, 0, 0, NULL, _THIS_IP_); + lock_acquire_exclusive(&trans->dep_map, 0, try, NULL, 
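
The shape of the create path above, in outline: the new key cache entry is inserted into the hash table only while the corresponding btree node is write-locked, so a racing fill of the same position either observes the entry or loses the insert race cleanly. A sketch with hypothetical helper names:

        static int cache_create_outline(struct btree_trans *trans,
                                        struct btree_path *path,
                                        struct cached_entry *ck)
        {
                int ret = node_lock_write(trans, path); /* serialize fills */
                if (ret)
                        return ret;

                ret = hashtable_insert(ck);     /* -EEXIST: lost the race */
                node_unlock_write(trans, path);
                return ret;
        }
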
_THIS_IP_); trans->locked = true; trans->last_unlock_ip = 0; @@ -282,7 +282,7 @@ static inline int btree_node_lock(struct btree_trans *trans, int ret = 0; EBUG_ON(level >= BTREE_MAX_DEPTH); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(six_trylock_type(&b->lock, type)) || btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 30131c3bdd970..a7f06deee13c3 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -12,6 +12,7 @@ #include "recovery_passes.h" #include +#include #include struct find_btree_nodes_worker { @@ -22,17 +23,15 @@ struct find_btree_nodes_worker { static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n) { - prt_printf(out, "%s l=%u seq=%u journal_seq=%llu cookie=%llx ", - bch2_btree_id_str(n->btree_id), n->level, n->seq, - n->journal_seq, n->cookie); + bch2_btree_id_level_to_text(out, n->btree_id, n->level); + prt_printf(out, " seq=%u journal_seq=%llu cookie=%llx ", + n->seq, n->journal_seq, n->cookie); bch2_bpos_to_text(out, n->min_key); prt_str(out, "-"); bch2_bpos_to_text(out, n->max_key); if (n->range_updated) prt_str(out, " range updated"); - if (n->overwritten) - prt_str(out, " overwritten"); for (unsigned i = 0; i < n->nr_ptrs; i++) { prt_char(out, ' '); @@ -140,6 +139,24 @@ static int found_btree_node_cmp_pos(const void *_l, const void *_r) -found_btree_node_cmp_time(l, r); } +static inline bool found_btree_node_cmp_pos_less(const void *l, const void *r, void *arg) +{ + return found_btree_node_cmp_pos(l, r) < 0; +} + +static inline void found_btree_node_swap(void *_l, void *_r, void *arg) +{ + struct found_btree_node *l = _l; + struct found_btree_node *r = _r; + + swap(*l, *r); +} + +static const struct min_heap_callbacks found_btree_node_heap_cbs = { + .less = found_btree_node_cmp_pos_less, + .swp = found_btree_node_swap, +}; + static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, struct bio *bio, struct btree_node *bn, u64 offset) { @@ -159,6 +176,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, return; if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { + if (!c->chacha20) + return; + struct nonce nonce = btree_nonce(&bn->keys, 0); unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; @@ -292,55 +312,48 @@ static int read_btree_nodes(struct find_btree_nodes *f) return f->ret ?: ret; } -static void bubble_up(struct found_btree_node *n, struct found_btree_node *end) +static bool nodes_overlap(const struct found_btree_node *l, + const struct found_btree_node *r) { - while (n + 1 < end && - found_btree_node_cmp_pos(n, n + 1) > 0) { - swap(n[0], n[1]); - n++; - } + return (l->btree_id == r->btree_id && + l->level == r->level && + bpos_gt(l->max_key, r->min_key)); } static int handle_overwrites(struct bch_fs *c, - struct found_btree_node *start, - struct found_btree_node *end) + struct found_btree_node *l, + found_btree_nodes *nodes_heap) { - struct found_btree_node *n; -again: - for (n = start + 1; - n < end && - n->btree_id == start->btree_id && - n->level == start->level && - bpos_lt(n->min_key, start->max_key); - n++) { - int cmp = found_btree_node_cmp_time(start, n); + struct found_btree_node *r; + + while ((r = min_heap_peek(nodes_heap)) && + nodes_overlap(l, r)) { + int cmp = found_btree_node_cmp_time(l, r); if (cmp > 0) { - if 
(bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } else if (cmp < 0) { - BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0); + BUG_ON(bpos_eq(l->min_key, r->min_key)); - start->max_key = bpos_predecessor(n->min_key); - start->range_updated = true; - } else if (n->level) { - n->overwritten = true; + l->max_key = bpos_predecessor(r->min_key); + l->range_updated = true; + } else if (r->level) { + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); } else { - if (bpos_cmp(start->max_key, n->max_key) >= 0) - n->overwritten = true; + if (bpos_cmp(l->max_key, r->max_key) >= 0) + min_heap_pop(nodes_heap, &found_btree_node_heap_cbs, NULL); else { - n->range_updated = true; - n->min_key = bpos_successor(start->max_key); - n->range_updated = true; - bubble_up(n, end); - goto again; + r->range_updated = true; + r->min_key = bpos_successor(l->max_key); + r->range_updated = true; + min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } } @@ -352,6 +365,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) { struct find_btree_nodes *f = &c->found_btree_nodes; struct printbuf buf = PRINTBUF; + found_btree_nodes nodes_heap = {}; size_t dst; int ret = 0; @@ -406,29 +420,57 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) bch2_print_string_as_lines(KERN_INFO, buf.buf); } - dst = 0; - darray_for_each(f->nodes, i) { - if (i->overwritten) - continue; + swap(nodes_heap, f->nodes); + + { + /* darray must have same layout as a heap */ + min_heap_char real_heap; + BUILD_BUG_ON(sizeof(nodes_heap.nr) != sizeof(real_heap.nr)); + BUILD_BUG_ON(sizeof(nodes_heap.size) != sizeof(real_heap.size)); + BUILD_BUG_ON(offsetof(found_btree_nodes, nr) != offsetof(min_heap_char, nr)); + BUILD_BUG_ON(offsetof(found_btree_nodes, size) != offsetof(min_heap_char, size)); + } + + min_heapify_all(&nodes_heap, &found_btree_node_heap_cbs, NULL); - ret = handle_overwrites(c, i, &darray_top(f->nodes)); + if (nodes_heap.nr) { + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); if (ret) goto err; - BUG_ON(i->overwritten); - f->nodes.data[dst++] = *i; + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); } - f->nodes.nr = dst; - if (c->opts.verbose) { + while (true) { + ret = handle_overwrites(c, &darray_last(f->nodes), &nodes_heap); + if (ret) + goto err; + + if (!nodes_heap.nr) + break; + + ret = darray_push(&f->nodes, *min_heap_peek(&nodes_heap)); + if (ret) + goto err; + + min_heap_pop(&nodes_heap, &found_btree_node_heap_cbs, NULL); + } + + for (struct found_btree_node *n = f->nodes.data; n < &darray_last(f->nodes); n++) + BUG_ON(nodes_overlap(n, n + 1)); + + if (0 && c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); bch2_print_string_as_lines(KERN_INFO, buf.buf); + } else { + bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); err: + darray_exit(&nodes_heap); printbuf_exit(&buf); return ret; } @@ -499,7 +541,9 @@ int 
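
The heap-based rewrite above replaces the old bubble_up() pass: candidate nodes live in a min-heap ordered by position, the best node so far is compared against the heap top, and losers are either popped or have their range clamped and re-sifted. Schematically, with hypothetical helpers:

        while (heap_not_empty()) {
                node = heap_peek();
                if (!overlaps(best, node))
                        break;
                if (prefer(best, node))
                        heap_pop();             /* node fully superseded */
                else {
                        clamp_min_key(node);    /* shrink to the remainder */
                        heap_sift_down();       /* restore heap order */
                }
        }

This is an outline only; as in the real loop, the incumbent node's max_key may be clamped instead when the heap top wins the time comparison.
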
bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, if (c->opts.verbose) { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level); + prt_str(&buf, "recovery "); + bch2_btree_id_level_to_text(&buf, btree, level); + prt_str(&buf, " "); bch2_bpos_to_text(&buf, node_min); prt_str(&buf, " - "); bch2_bpos_to_text(&buf, node_max); @@ -533,7 +577,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); printbuf_exit(&buf); - BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0)); + BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), + (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = level + 1, + .btree = btree, + })); ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k); if (ret) diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h index b6c36c45d0be1..2811b6857c970 100644 --- a/fs/bcachefs/btree_node_scan_types.h +++ b/fs/bcachefs/btree_node_scan_types.h @@ -6,7 +6,6 @@ struct found_btree_node { bool range_updated:1; - bool overwritten:1; u8 btree_id; u8 level; unsigned sectors_written; diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 9bf471fa43614..c3a3bfd11e8ce 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -479,8 +479,7 @@ static int run_one_mem_trigger(struct btree_trans *trans, old, flags); } -static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, - bool overwrite) +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i) { verify_update_old_key(trans, i); @@ -507,10 +506,10 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k), BTREE_TRIGGER_insert| BTREE_TRIGGER_overwrite|flags) ?: 1; - } else if (overwrite && !i->overwrite_trigger_run) { + } else if (!i->overwrite_trigger_run) { i->overwrite_trigger_run = true; return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1; - } else if (!overwrite && !i->insert_trigger_run) { + } else if (!i->insert_trigger_run) { i->insert_trigger_run = true; return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1; } else { @@ -519,39 +518,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned btree_id_start) + unsigned *btree_id_updates_start) { - for (int overwrite = 1; overwrite >= 0; --overwrite) { - bool trans_trigger_run; + bool trans_trigger_run; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; - - for (unsigned i = btree_id_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id != btree_id) - continue; + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; - int ret = run_one_trans_trigger(trans, trans->updates + i, overwrite); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; + for (unsigned i = *btree_id_updates_start; + i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; + i++) { + if (trans->updates[i].btree_id < btree_id) { + 
*btree_id_updates_start = i; + continue; } - } while (trans_trigger_run); - } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && + i->btree_id == btree_id && + btree_node_type_has_trans_triggers(i->bkey_type) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); return 0; } static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - unsigned btree_id = 0, btree_id_start = 0; + unsigned btree_id = 0, btree_id_updates_start = 0; int ret = 0; /* @@ -565,27 +570,15 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans) if (btree_id == BTREE_ID_alloc) continue; - while (btree_id_start < trans->nr_updates && - trans->updates[btree_id_start].btree_id < btree_id) - btree_id_start++; - - ret = run_btree_triggers(trans, btree_id, btree_id_start); + ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); if (ret) return ret; } - for (unsigned idx = 0; idx < trans->nr_updates; idx++) { - struct btree_insert_entry *i = trans->updates + idx; - - if (i->btree_id > BTREE_ID_alloc) - break; - if (i->btree_id == BTREE_ID_alloc) { - ret = run_btree_triggers(trans, BTREE_ID_alloc, idx); - if (ret) - return ret; - break; - } - } + btree_id_updates_start = 0; + ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); + if (ret) + return ret; #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) @@ -609,14 +602,6 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) return 0; } -static struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) -{ - return (struct bversion) { - .hi = res->seq >> 32, - .lo = (res->seq << 32) | (res->offset + offset), - }; -} - static inline int bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct btree_insert_entry **stopped_at, @@ -627,12 +612,11 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, unsigned u64s = 0; int ret = 0; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (race_fault()) { trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); - return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_fault_inject); } /* @@ -701,25 +685,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, struct jset_entry *entry = trans->journal_entries; percpu_down_read(&c->mark_lock); - for (entry = trans->journal_entries; entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); entry = vstruct_next(entry)) if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && entry->start->k.type == KEY_TYPE_accounting) { - BUG_ON(!trans->journal_res.ref); - - struct bkey_i_accounting *a = bkey_i_to_accounting(entry->start); - - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) entry - (u64 *) trans->journal_entries); - BUG_ON(bversion_zero(a->k.bversion)); - - if (likely(!(flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { - ret = bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal); - if (ret) - goto revert_fs_usage; - } + ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); + if (ret) + goto 
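
The trigger loop above follows a fixpoint pattern: triggers may append further updates to the very list being walked, so passes repeat until one completes without doing any work. In outline:

        bool again;
        do {
                again = false;
                for (i = start; i < nr_updates; i++) {
                        int ret = run_one_trigger(&updates[i]);
                        if (ret < 0)
                                return ret;
                        if (ret)        /* did work; list may have grown */
                                again = true;
                }
        } while (again);
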
revert_fs_usage; } percpu_up_read(&c->mark_lock); @@ -739,33 +712,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, goto fatal_err; } - trans_for_each_update(trans, i) { - enum bch_validate_flags invalid_flags = 0; + struct bkey_validate_context validate_context = { .from = BKEY_VALIDATE_commit }; - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - - ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags); - if (unlikely(ret)){ - bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", - trans->fn, (void *) i->ip_allocated); - goto fatal_err; - } - btree_insert_entry_checks(trans, i); - } + if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) + validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; for (struct jset_entry *i = trans->journal_entries; i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); i = vstruct_next(i)) { - enum bch_validate_flags invalid_flags = 0; - - if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) - invalid_flags |= BCH_VALIDATE_write|BCH_VALIDATE_commit; - ret = bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, - CPU_BIG_ENDIAN, invalid_flags); + CPU_BIG_ENDIAN, validate_context); if (unlikely(ret)) { bch2_trans_inconsistent(trans, "invalid journal entry on insert from %s\n", trans->fn); @@ -773,6 +730,19 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } } + trans_for_each_update(trans, i) { + validate_context.level = i->level; + validate_context.btree = i->btree_id; + + ret = bch2_bkey_validate(c, bkey_i_to_s_c(i->k), validate_context); + if (unlikely(ret)){ + bch2_trans_inconsistent(trans, "invalid bkey on insert from %s -> %ps\n", + trans->fn, (void *) i->ip_allocated); + goto fatal_err; + } + btree_insert_entry_checks(trans, i); + } + if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) { struct journal *j = &c->journal; struct jset_entry *entry; @@ -833,13 +803,9 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, entry2 != entry; entry2 = vstruct_next(entry2)) if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && - entry2->start->k.type == KEY_TYPE_accounting) { - struct bkey_s_accounting a = bkey_i_to_s_accounting(entry2->start); - - bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); - bch2_accounting_neg(a); - } + entry2->start->k.type == KEY_TYPE_accounting) + bch2_accounting_trans_commit_revert(trans, + bkey_i_to_accounting(entry2->start), flags); percpu_up_read(&c->mark_lock); return ret; } @@ -994,24 +960,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, return ret; } -static noinline int -bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -{ - struct bch_fs *c = trans->c; - int ret; - - if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || - test_bit(BCH_FS_started, &c->flags)) - return -BCH_ERR_erofs_trans_commit; - - ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); - if (ret) - return ret; - - bch2_write_ref_get(c, BCH_WRITE_REF_trans); - return 0; -} - /* * This is for updates done in the early part of fsck - btree_gc - before we've * gone RW. 
we only add the new key to the list of keys for journal replay to @@ -1022,6 +970,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) { struct bch_fs *c = trans->c; + BUG_ON(current != c->recovery_task); + trans_for_each_update(trans, i) { int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); if (ret) @@ -1047,8 +997,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) struct bch_fs *c = trans->c; int ret = 0; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (!trans->nr_updates && !trans->journal_entries_u64s) @@ -1058,16 +1007,13 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - ret = do_bch2_trans_commit_to_journal_replay(trans); - goto out_reset; - } - if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { - ret = bch2_trans_commit_get_rw_cold(trans, flags); - if (ret) - goto out_reset; + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) + ret = do_bch2_trans_commit_to_journal_replay(trans); + else + ret = -BCH_ERR_erofs_trans_commit; + goto out_reset; } EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); @@ -1112,8 +1058,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) } retry: errored_at = NULL; - bch2_trans_verify_not_unlocked(trans); - bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) memset(&trans->journal_res, 0, sizeof(trans->journal_res)); memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta)); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4568a41fefaf6..f5b41ad83b8c2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -446,7 +446,7 @@ struct btree_insert_entry { /* Number of btree paths we preallocate, usually enough */ #define BTREE_ITER_INITIAL 64 /* - * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code + * Limit for btree_trans_too_many_iters(); this is enough that almost all code * paths should run inside this limit, and if they don't it usually indicates a * bug (leaking/duplicated btree paths). 
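
In outline, the commit-path dispatch this section establishes (a sketch; the error code matches the patch):

        if (!write_ref_tryget(c)) {
                /* before BCH_FS_may_go_rw, updates are diverted into the
                 * journal-replay key list instead of the btree */
                if (!may_go_rw)
                        return commit_to_journal_replay(trans);
                return -BCH_ERR_erofs_trans_commit;
        }
        /* ... normal commit path ... */
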
* @@ -513,6 +513,9 @@ struct btree_trans { u64 last_begin_time; unsigned long last_begin_ip; unsigned long last_restarted_ip; +#ifdef CONFIG_BCACHEFS_DEBUG + bch_stacktrace last_restarted_trace; +#endif unsigned long last_unlock_ip; unsigned long srcu_lock_time; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 5d809e8bd1708..13d794f201a56 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -144,7 +144,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) @@ -296,7 +296,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_with_updates| BTREE_ITER_not_extents); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -323,7 +323,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, goto out; next: bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -588,12 +588,9 @@ struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsi int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { - struct bkey_s_c k; - int ret = 0; - - bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_intent); - k = bch2_btree_iter_prev(iter); - ret = bkey_err(k); + bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); + struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); + int ret = bkey_err(k); if (ret) goto err; @@ -672,27 +669,19 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k, bch2_btree_insert_trans(trans, id, k, iter_flags)); } -int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, - unsigned len, unsigned update_flags) +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) { - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; bkey_init(&k->k); k->k.p = iter->pos; - bch2_key_resize(&k->k, len); return bch2_trans_update(trans, iter, k, update_flags); } -int bch2_btree_delete_at(struct btree_trans *trans, - struct btree_iter *iter, unsigned update_flags) -{ - return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -} - int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, unsigned update_flags) @@ -721,7 +710,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -794,8 +783,7 @@ int bch2_btree_delete_range(struct bch_fs 
*c, enum btree_id id, return ret; } -int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, - struct bpos pos, bool set) +int bch2_btree_bit_mod_iter(struct btree_trans *trans, struct btree_iter *iter, bool set) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); int ret = PTR_ERR_OR_ZERO(k); @@ -804,13 +792,21 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, bkey_init(&k->k); k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; + k->k.p = iter->pos; + if (iter->flags & BTREE_ITER_is_extents) + bch2_key_resize(&k->k, 1); + return bch2_trans_update(trans, iter, k, 0); +} + +int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ struct btree_iter iter; bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update(trans, &iter, k, 0); + int ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_bit_mod_iter(trans, &iter, set); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -827,10 +823,17 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s) +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) { + unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); + prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); + + int ret = buf->allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + return ret; + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - int ret = PTR_ERR_OR_ZERO(e); + ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; @@ -865,9 +868,8 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, memcpy(l->d, buf.buf, buf.pos); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { - ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_lazy_rw|commit_flags, - __bch2_trans_log_msg(trans, &buf, u64s)); + ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, + bch2_trans_log_msg(trans, &buf)); } err: printbuf_exit(&buf); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 70b3c989fac23..96d208edafdb3 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -24,7 +24,6 @@ void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, #define BCH_TRANS_COMMIT_FLAGS() \ x(no_enospc, "don't check for enospc") \ x(no_check_rw, "don't attempt to take a ref on c->writes") \ - x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \ x(no_journal_res, "don't take a journal reservation, instead " \ "pin journal entry referred to by trans->journal_res.seq") \ x(journal_reclaim, "operation required for journal reclaim; may return error" \ @@ -47,8 +46,6 @@ enum bch_trans_commit_flags { void bch2_trans_commit_flags_to_text(struct printbuf *, enum bch_trans_commit_flags); -int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, - unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); @@ -66,6 +63,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_bit_mod_iter(struct btree_trans *, struct btree_iter 
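
The padding arithmetic in bch2_trans_log_msg() above, in isolation: journal entries are u64-granular, so the message is NUL-padded up to the next 8-byte boundary before the jset entry is allocated.

        unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64));    /* round up */
        prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos);    /* NUL fill */
        /* then allocate a jset entry of jset_u64s(u64s) and copy buf in */
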
*, bool); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); @@ -82,7 +80,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, * For use when splitting extents in existing snapshots: * * If @old_pos is an interior snapshot node, iterate over descendent snapshot - * nodes: for every descendent snapshot in whiche @old_pos is overwritten and + * nodes: for every descendent snapshot in which @old_pos is overwritten and * not visible, emit a whiteout at @new_pos. */ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, @@ -161,6 +159,7 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); @@ -244,7 +243,8 @@ static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *tra KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags, + struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags, unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); @@ -261,8 +261,9 @@ static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, str return mut; } -static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c *k, unsigned flags) +static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c *k, + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); } @@ -274,7 +275,8 @@ static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struc static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags|BTREE_ITER_intent, type); @@ -289,7 +291,7 @@ static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *tr static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); } @@ -297,7 +299,8 @@ static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *tran static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned type, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned min_bytes) { struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags|BTREE_ITER_intent, type, min_bytes); @@ -318,7 +321,8 @@ static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, static inline struct bkey_i 
*bch2_bkey_get_mut_minsize(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags, unsigned min_bytes) + enum btree_iter_update_trigger_flags flags, + unsigned min_bytes) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); } @@ -326,7 +330,7 @@ static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, - unsigned flags) + enum btree_iter_update_trigger_flags flags) { return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); } @@ -337,7 +341,8 @@ static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, - unsigned flags, unsigned type, unsigned val_size) + enum btree_iter_update_trigger_flags flags, + unsigned type, unsigned val_size) { struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); int ret; diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index d596ef93239f8..03a6eba7403dd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -58,11 +58,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); + bch2_bkey_buf_init(&prev); + bkey_init(&prev.k->k); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->min_key); - need_fsck_err(trans, btree_root_bad_min_key, + log_fsck_err(trans, btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf); goto topology_repair; } @@ -70,18 +74,14 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (!bpos_eq(b->data->max_key, SPOS_MAX)) { printbuf_reset(&buf); bch2_bpos_to_text(&buf, b->data->max_key); - need_fsck_err(trans, btree_root_bad_max_key, + log_fsck_err(trans, btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf); goto topology_repair; } } if (!b->c.level) - return 0; - - bch2_bkey_buf_init(&prev); - bkey_init(&prev.k->k); - bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + goto out; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { if (k.k->type != KEY_TYPE_btree_ptr_v2) @@ -97,16 +97,16 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "end of prev node doesn't match start of next node\n"), - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "end of prev node doesn't match start of next node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n prev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); prt_str(&buf, "\n next "); bch2_bkey_val_to_text(&buf, c, k); - need_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); goto topology_repair; } @@ -118,25 +118,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) 
bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "empty interior node\n"); - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "empty interior node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - need_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); goto topology_repair; } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { bch2_topology_error(c); printbuf_reset(&buf); - prt_str(&buf, "last child node doesn't end at end of parent node\n"); - prt_printf(&buf, " in btree %s level %u node ", - bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_str(&buf, "last child node doesn't end at end of parent node\n in "); + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); prt_str(&buf, "\n last key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - need_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); goto topology_repair; } out: @@ -146,13 +146,7 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) printbuf_exit(&buf); return ret; topology_repair: - if ((c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) { - bch2_inconsistent_error(c); - ret = -BCH_ERR_btree_need_topology_repair; - } else { - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); - } + ret = bch2_topology_error(c); goto out; } @@ -1366,9 +1360,14 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, if (unlikely(!test_bit(JOURNAL_replay_done, &c->journal.flags))) bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); - if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), - btree_node_type(b), BCH_VALIDATE_write) ?: - bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), BCH_VALIDATE_write)) { + struct bkey_validate_context from = (struct bkey_validate_context) { + .from = BKEY_VALIDATE_btree_node, + .level = b->c.level, + .btree = b->c.btree_id, + .flags = BCH_VALIDATE_commit, + }; + if (bch2_bkey_validate(c, bkey_i_to_s_c(insert), from) ?: + bch2_bkey_in_btree_node(c, b, bkey_i_to_s_c(insert), from)) { bch2_fs_inconsistent(c, "%s: inserting invalid bkey", __func__); dump_stack(); } @@ -1418,15 +1417,26 @@ bch2_btree_insert_keys_interior(struct btree_update *as, (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) ; - while (!bch2_keylist_empty(keys)) { - insert = bch2_keylist_front(keys); + for (; + insert != keys->top && bpos_le(insert->k.p, b->key.k.p); + insert = bkey_next(insert)) + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - if (bpos_gt(insert->k.p, b->key.k.p)) - break; + if (bch2_btree_node_check_topology(trans, b)) { + struct printbuf buf = PRINTBUF; - bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - bch2_keylist_pop_front(keys); + for (struct bkey_i *k = keys->keys; + k != insert; + k = bkey_next(k)) { + bch2_bkey_val_to_text(&buf, trans->c, bkey_i_to_s_c(k)); + prt_newline(&buf); + } + + panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); } + + memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); + keys->top_p -= 
insert->_data - keys->keys_p; } static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) @@ -1575,8 +1585,6 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); - - BUG_ON(bch2_btree_node_check_topology(trans, b)); } } @@ -1599,8 +1607,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) return ret; - bch2_btree_interior_update_will_free_node(as, b); - if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { struct btree *n[2]; @@ -1699,6 +1705,8 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + if (n3) { bch2_btree_update_get_open_buckets(as, n3); bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); @@ -1827,8 +1835,6 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t btree_update_updated_node(as, b); bch2_btree_node_unlock_write(trans, path, b); - - BUG_ON(bch2_btree_node_check_topology(trans, b)); return 0; split: /* @@ -1953,8 +1959,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, u64 start_time = local_clock(); int ret = 0; - bch2_trans_verify_not_in_restart(trans); - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); BUG_ON(!trans->paths[path].should_be_locked); BUG_ON(!btree_node_locked(&trans->paths[path], level)); @@ -2058,9 +2063,6 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, trace_and_count(c, btree_node_merge, trans, b); - bch2_btree_interior_update_will_free_node(as, b); - bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(as, trans, b->c.level); SET_BTREE_NODE_SEQ(n->data, @@ -2096,6 +2098,9 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, if (ret) goto err_free_update; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + bch2_trans_verify_paths(trans); bch2_btree_update_get_open_buckets(as, n); @@ -2150,8 +2155,6 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto out; - bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(as, trans, b); bch2_btree_build_aux_trees(n); @@ -2175,6 +2178,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (ret) goto err; + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_update_get_open_buckets(as, n); bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); @@ -2201,42 +2206,50 @@ struct async_btree_rewrite { struct list_head list; enum btree_id btree_id; unsigned level; - struct bpos pos; - __le64 seq; + struct bkey_buf key; }; static int async_btree_node_rewrite_trans(struct btree_trans *trans, struct async_btree_rewrite *a) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct btree *b; - int ret; - - bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + bch2_trans_node_iter_init(trans, &iter, + a->btree_id, a->key.k->k.p, BTREE_MAX_DEPTH, a->level, 0); - b = bch2_btree_iter_peek_node(&iter); - ret = PTR_ERR_OR_ZERO(b); + struct btree *b = bch2_btree_iter_peek_node(&iter); + int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; - if (!b || b->data->keys.seq != a->seq) { + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); + ret = found + ? bch2_btree_node_rewrite(trans, &iter, b, 0) + : -ENOENT; + +#if 0 + /* Tracepoint... 
*/ + if (!ret || ret == -ENOENT) { + struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null"); - bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", - __func__, a->seq, buf.buf); + if (!ret) { + prt_printf(&buf, "rewrite node:\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + } else { + prt_printf(&buf, "node to rewrite not found:\n want: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); + prt_printf(&buf, "\n got: "); + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null)"); + } + bch_info(c, "%s", buf.buf); printbuf_exit(&buf); - goto out; } - - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +#endif out: bch2_trans_iter_exit(trans, &iter); - return ret; } @@ -2247,81 +2260,97 @@ static void async_btree_node_rewrite_work(struct work_struct *work) struct bch_fs *c = a->c; int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); - bch_err_fn_ratelimited(c, ret); + if (ret != -ENOENT) + bch_err_fn_ratelimited(c, ret); + + spin_lock(&c->btree_node_rewrites_lock); + list_del(&a->list); + spin_unlock(&c->btree_node_rewrites_lock); + + closure_wake_up(&c->btree_node_rewrites_wait); + + bch2_bkey_buf_exit(&a->key, c); bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); kfree(a); } void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) { - struct async_btree_rewrite *a; - int ret; - - a = kmalloc(sizeof(*a), GFP_NOFS); - if (!a) { - bch_err(c, "%s: error allocating memory", __func__); + struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS); + if (!a) return; - } a->c = c; a->btree_id = b->c.btree_id; a->level = b->c.level; - a->pos = b->key.k.p; - a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { - mutex_lock(&c->pending_node_rewrites_lock); - list_add(&a->list, &c->pending_node_rewrites); - mutex_unlock(&c->pending_node_rewrites_lock); - return; - } + bch2_bkey_buf_init(&a->key); + bch2_bkey_buf_copy(&a->key, c, &b->key); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_started, &c->flags)) { - bch_err(c, "%s: error getting c->writes ref", __func__); - kfree(a); - return; - } + bool now = false, pending = false; - ret = bch2_fs_read_write_early(c); - bch_err_msg(c, ret, "going read-write"); - if (ret) { - kfree(a); - return; - } + spin_lock(&c->btree_node_rewrites_lock); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && + bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + list_add(&a->list, &c->btree_node_rewrites); + now = true; + } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { + list_add(&a->list, &c->btree_node_rewrites_pending); + pending = true; + } + spin_unlock(&c->btree_node_rewrites_lock); - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + if (now) { + queue_work(c->btree_node_rewrite_worker, &a->work); + } else if (pending) { + /* bch2_do_pending_node_rewrites will execute */ + } else { + bch2_bkey_buf_exit(&a->key, c); + kfree(a); } +} - queue_work(c->btree_node_rewrite_worker, &a->work); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *c) +{ + closure_wait_event(&c->btree_node_rewrites_wait, + list_empty(&c->btree_node_rewrites)); } void bch2_do_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; - - mutex_lock(&c->pending_node_rewrites_lock); - 
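/*
 * Queueing in bch2_btree_node_rewrite_async() above now resolves to one of
 * three outcomes, decided while holding btree_node_rewrites_lock; a
 * condensed sketch (not the verbatim code):
 *
 *	spin_lock(&c->btree_node_rewrites_lock);
 *	if (past_journal_replay &&
 *	    bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite))
 *		now = true;	// run: queue_work() right away
 *	else if (!test_bit(BCH_FS_may_go_rw, &c->flags))
 *		pending = true;	// defer until bch2_do_pending_node_rewrites()
 *	spin_unlock(&c->btree_node_rewrites_lock);
 *	if (!now && !pending) {	// e.g. going read-only: drop the request
 *		bch2_bkey_buf_exit(&a->key, c);
 *		kfree(a);
 *	}
 *
 * Identifying the node by its full key (btree_ptr_hash_val()) instead of by
 * pos + sequence number also lets the worker treat "node no longer exists"
 * as a silent -ENOENT rather than a logged error.
 */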
list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - list_del(&a->list); + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + if (a) + list_add(&a->list, &c->btree_node_rewrites); + spin_unlock(&c->btree_node_rewrites_lock); + + if (!a) + break; bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } - mutex_unlock(&c->pending_node_rewrites_lock); } void bch2_free_pending_node_rewrites(struct bch_fs *c) { - struct async_btree_rewrite *a, *n; + while (1) { + spin_lock(&c->btree_node_rewrites_lock); + struct async_btree_rewrite *a = + list_pop_entry(&c->btree_node_rewrites_pending, + struct async_btree_rewrite, list); + spin_unlock(&c->btree_node_rewrites_lock); - mutex_lock(&c->pending_node_rewrites_lock); - list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { - list_del(&a->list); + if (!a) + break; + bch2_bkey_buf_exit(&a->key, c); kfree(a); } - mutex_unlock(&c->pending_node_rewrites_lock); } static int __bch2_btree_node_update_key(struct btree_trans *trans, @@ -2575,8 +2604,9 @@ static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update prt_printf(out, "%ps: ", (void *) as->ip_started); bch2_trans_commit_flags_to_text(out, as->flags); - prt_printf(out, " btree=%s l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", - bch2_btree_id_str(as->btree_id), + prt_str(out, " "); + bch2_btree_id_to_text(out, as->btree_id); + prt_printf(out, " l=%u-%u mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n", as->update_level_start, as->update_level_end, bch2_btree_update_modes[as->mode], @@ -2677,6 +2707,9 @@ void bch2_btree_reserve_cache_to_text(struct printbuf *out, struct bch_fs *c) void bch2_fs_btree_interior_update_exit(struct bch_fs *c) { + WARN_ON(!list_empty(&c->btree_node_rewrites)); + WARN_ON(!list_empty(&c->btree_node_rewrites_pending)); + if (c->btree_node_rewrite_worker) destroy_workqueue(c->btree_node_rewrite_worker); if (c->btree_interior_update_worker) @@ -2692,8 +2725,9 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) mutex_init(&c->btree_interior_update_lock); INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); - INIT_LIST_HEAD(&c->pending_node_rewrites); - mutex_init(&c->pending_node_rewrites_lock); + INIT_LIST_HEAD(&c->btree_node_rewrites); + INIT_LIST_HEAD(&c->btree_node_rewrites_pending); + spin_lock_init(&c->btree_node_rewrites_lock); } int bch2_fs_btree_interior_update_init(struct bch_fs *c) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 10f400957f21a..ffc6a8c1f0125 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -116,7 +116,7 @@ struct btree_update { struct keylist parent_keys; /* * Enough room for btree_split's keys without realloc - btree node - * pointers never have crc/compression info, so we only need to acount + * pointers never have crc/compression info, so we only need to account * for the pointers for three keys */ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; @@ -159,7 +159,7 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, unsigned level, unsigned flags) { - bch2_trans_verify_not_unlocked(trans); + bch2_trans_verify_not_unlocked_or_in_restart(trans); return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, btree_prev_sib) ?: @@ -334,6 +334,7 @@ void 
bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, struct jset_entry *, unsigned long); +void bch2_async_btree_node_rewrites_flush(struct bch_fs *); void bch2_do_pending_node_rewrites(struct bch_fs *); void bch2_free_pending_node_rewrites(struct bch_fs *); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index 1639c60dffa0e..723ffbe6efaad 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -19,8 +19,6 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); -static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); - static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) { return (cmp_int(l->hi, r->hi) ?: @@ -455,7 +453,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) * journal replay has to split/rewrite nodes to make room for * its updates. * - * And for those new acounting updates, updates to the same + * And for those new accounting updates, updates to the same * counters get accumulated as they're flushed from the journal * to the write buffer - see the patch for eytzingcer tree * accumulated. So we could only overflow if the number of @@ -481,21 +479,55 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) return ret; } -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) +static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) +{ + struct journal_keys_to_wb dst; + int ret = 0; + + bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); + + for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { + jset_entry_for_each_key(entry, k) { + ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); + if (ret) + goto out; + } + + entry->type = BCH_JSET_ENTRY_btree_keys; + } +out: + ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; + return ret; +} + +static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 max_seq) { struct journal *j = &c->journal; struct journal_buf *buf; + bool blocked; int ret = 0; - while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) { + while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, max_seq, &blocked))) { ret = bch2_journal_keys_to_write_buffer(c, buf); + + if (!blocked && !ret) { + spin_lock(&j->lock); + buf->need_flush_to_write_buffer = false; + spin_unlock(&j->lock); + } + mutex_unlock(&j->buf_lock); + + if (blocked) { + bch2_journal_unblock(j); + break; + } } return ret; } -static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, +static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 max_seq, bool *did_work) { struct bch_fs *c = trans->c; @@ -505,7 +537,7 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, do { bch2_trans_unlock(trans); - fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); + fetch_from_journal_err = fetch_wb_keys_from_journal(c, max_seq); *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr; @@ -518,8 +550,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq, mutex_unlock(&wb->flushing.lock); } while (!ret && (fetch_from_journal_err || - (wb->inc.pin.seq && wb->inc.pin.seq <= seq) || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq))); + (wb->inc.pin.seq && wb->inc.pin.seq <= 
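/*
 * The btree_write_buffer_flush_seq() loop here keeps iterating until
 * neither the incoming nor the flushing write-buffer pin holds a journal
 * sequence at or before @max_seq, re-fetching keys from the journal each
 * pass. A hypothetical caller wanting everything up to the current journal
 * sequence flushed to the btree (sketch only, assuming journal_cur_seq()
 * for the current sequence; external callers go through exported wrappers
 * such as bch2_btree_write_buffer_flush_sync()):
 *
 *	bool did_work = false;
 *	int ret = bch2_trans_run(c,
 *		btree_write_buffer_flush_seq(trans,
 *				journal_cur_seq(&c->journal), &did_work));
 */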
max_seq) || + (wb->flushing.pin.seq && wb->flushing.pin.seq <= max_seq))); return ret; } @@ -600,6 +632,14 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, bch2_bkey_buf_init(&tmp); if (!bkey_and_val_eq(referring_k, bkey_i_to_s_c(last_flushed->k))) { + if (trace_write_buffer_maybe_flush_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, referring_k); + trace_write_buffer_maybe_flush(trans, _RET_IP_, buf.buf); + printbuf_exit(&buf); + } + bch2_bkey_buf_reassemble(&tmp, c, referring_k); if (bkey_is_btree_ptr(referring_k.k)) { @@ -771,31 +811,6 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ return ret; } -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } - - spin_lock(&c->journal.lock); - buf->need_flush_to_write_buffer = false; - spin_unlock(&c->journal.lock); -out: - ret = bch2_journal_keys_to_write_buffer_end(c, &dst) ?: ret; - return ret; -} - static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) { if (wb->keys.size >= new_size) diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index ec7d9a59bea9c..56d3e3800a890 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -18,7 +18,9 @@ #include "error.h" #include "inode.h" #include "movinggc.h" +#include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "reflink.h" #include "replicas.h" #include "subvolume.h" @@ -260,8 +262,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - percpu_down_read(&c->mark_lock); - bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) @@ -362,7 +362,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, bch_info(c, "new key %s", buf.buf); } - percpu_up_read(&c->mark_lock); struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); @@ -371,8 +370,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); bch2_trans_iter_exit(trans, &iter); - percpu_down_read(&c->mark_lock); - if (ret) goto err; @@ -380,7 +377,6 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); } err: - percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; } @@ -401,8 +397,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, BUG_ON(!sectors); if (gen_after(ptr->gen, b_gen)) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_gen_newer_than_bucket_gen, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -415,8 +411,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_too_stale, 
+ bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_too_stale, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -435,8 +431,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (b_gen != ptr->gen) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - stale_dirty_ptr, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, stale_dirty_ptr, "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -451,8 +447,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - ptr_bucket_data_type_mismatch, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, ptr_bucket_data_type_mismatch, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -466,8 +462,8 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, } if ((u64) *bucket_sectors + sectors > U32_MAX) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - bucket_sector_count_overflow, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_sector_count_overflow, "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" "while marking %s", ptr->dev, bucket_nr, b_gen, @@ -485,7 +481,9 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, printbuf_exit(&buf); return ret; err: +fsck_err: bch2_dump_trans_updates(trans); + bch2_inconsistent_error(c); ret = -BCH_ERR_bucket_ref_update; goto out; } @@ -543,7 +541,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct extent_ptr_decoded *p, s64 sectors, enum bch_data_type ptr_data_type, - struct bch_alloc_v4 *a) + struct bch_alloc_v4 *a, + bool insert) { u32 *dst_sectors = p->has_ec ? &a->stripe_sectors : !p->ptr.cached ? &a->dirty_sectors : @@ -553,8 +552,8 @@ static int __mark_pointer(struct btree_trans *trans, struct bch_dev *ca, if (ret) return ret; - - alloc_data_type_set(a, ptr_data_type); + if (insert) + alloc_data_type_set(a, ptr_data_type); return 0; } @@ -570,8 +569,10 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - u64 abs_sectors = ptr_disk_sectors(level ? btree_sectors(c) : k.k->size, p); - *sectors = insert ? abs_sectors : -abs_sectors; + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp); + + *sectors = insert ? 
bp.v.bucket_len : -(s64) bp.v.bucket_len; struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { @@ -580,41 +581,36 @@ static int bch2_trigger_pointer(struct btree_trans *trans, goto err; } - struct bpos bucket; - struct bch_backpointer bp; - __bch2_extent_ptr_to_bp(trans->c, ca, btree_id, level, k, p, entry, &bucket, &bp, abs_sectors); + struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); ret = PTR_ERR_OR_ZERO(a) ?: - __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &a->v); + __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &a->v, insert); if (ret) goto err; if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, ca, bucket, bp, k, insert); + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); if (ret) goto err; } } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = -BCH_ERR_trigger_pointer; - goto err_unlock; + goto err; } bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g), new = old; - ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.data_type, &new); + ret = __mark_pointer(trans, ca, k, &p, *sectors, bp.v.data_type, &new, insert); alloc_to_bucket(g, new); bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); @@ -951,6 +947,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, enum bch_data_type type, unsigned sectors) { + struct bch_fs *c = trans->c; struct btree_iter iter; int ret = 0; @@ -960,8 +957,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_fsck_err(trans, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - bucket_metadata_type_mismatch, + bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); + log_fsck_err(trans, bucket_metadata_type_mismatch, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" "while marking %s", iter.pos.inode, iter.pos.offset, a->v.gen, @@ -979,6 +976,7 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: +fsck_err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -990,11 +988,10 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * struct bch_fs *c = trans->c; int ret = 0; - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, b); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u when marking metadata type %s", ca->dev_idx, bch2_data_type_str(data_type))) - goto err_unlock; + goto err; bucket_lock(g); struct bch_alloc_v4 old = bucket_m_to_alloc(*g); @@ -1004,26 +1001,24 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * "different types of data in same bucket: %s, %s", bch2_data_type_str(g->data_type), bch2_data_type_str(data_type))) - goto err; + goto err_unlock; if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, "bucket %u:%llu gen %u data type %s sector count overflow: %u + %u > bucket size", ca->dev_idx, b, g->gen, bch2_data_type_str(g->data_type ?: data_type), g->dirty_sectors, sectors)) - goto err; + 
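/*
 * Error-label ordering in bch2_mark_metadata_bucket() is flipped just below
 * so that failures detected while the bucket is locked jump to err_unlock,
 * which drops bucket_lock(g) and falls through to err:
 *
 *	err_unlock:
 *		bucket_unlock(g);
 *	err:
 *		return -BCH_ERR_metadata_bucket_inconsistency;
 *
 * Previously the labels were reversed because the function also had to drop
 * c->mark_lock, which this patch removes from these marking paths.
 */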
goto err_unlock; g->data_type = data_type; g->dirty_sectors += sectors; struct bch_alloc_v4 new = bucket_m_to_alloc(*g); bucket_unlock(g); - percpu_up_read(&c->mark_lock); ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); return ret; -err: - bucket_unlock(g); err_unlock: - percpu_up_read(&c->mark_lock); + bucket_unlock(g); +err: return -BCH_ERR_metadata_bucket_inconsistency; } @@ -1155,6 +1150,31 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c) return bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_transactional); } +bool bch2_is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + for (i = 0; i < ca->journal.nr; i++) + if (b == ca->journal.buckets[i]) + return true; + + return false; +} + /* Disk reservations: */ #define SECTORS_CACHE 1024 @@ -1264,10 +1284,15 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - BUG_ON(resize && ca->buckets_nouse); + if (resize) + lockdep_assert_held(&c->state_lock); + + if (resize && ca->buckets_nouse) + return -BCH_ERR_no_resize_with_buckets_nouse; - if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + bucket_gens = kvmalloc(struct_size(bucket_gens, b, nbuckets), + GFP_KERNEL|__GFP_ZERO); + if (!bucket_gens) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } @@ -1277,19 +1302,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bucket_gens->nbuckets_minus_first = bucket_gens->nbuckets - bucket_gens->first_bucket; - if (resize) { - down_write(&ca->bucket_lock); - percpu_down_write(&c->mark_lock); - } - old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); - + bucket_gens->nbuckets = min(bucket_gens->nbuckets, + old_bucket_gens->nbuckets); + bucket_gens->nbuckets_minus_first = + bucket_gens->nbuckets - bucket_gens->first_bucket; memcpy(bucket_gens->b, old_bucket_gens->b, - n); + bucket_gens->nbuckets); } rcu_assign_pointer(ca->bucket_gens, bucket_gens); @@ -1297,11 +1319,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) nbuckets = ca->mi.nbuckets; - if (resize) { - percpu_up_write(&c->mark_lock); - up_write(&ca->bucket_lock); - } - ret = 0; err: if (bucket_gens) diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index ccc78bfe2fd4e..a9acdd6c0c865 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -82,16 +82,15 @@ static inline void bucket_lock(struct bucket *b) static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) { - return genradix_ptr(&ca->buckets_gc, b); + return bucket_valid(ca, b) + ? 
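/*
 * The superblock test in bch2_is_superblock_bucket() above is the standard
 * half-open interval overlap check: [offset, end) intersects
 * [b_offset, b_end) iff !(offset >= b_end || end <= b_offset). An
 * equivalent positive form, as a self-contained illustration:
 *
 *	static bool ranges_overlap(u64 a_start, u64 a_end,
 *				   u64 b_start, u64 b_end)
 *	{
 *		return a_start < b_end && b_start < a_end;
 *	}
 *
 * The function now also treats the device's journal buckets as reserved,
 * which is presumably why it moved out of line and gained the
 * ca->journal.nr loop.
 */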
genradix_ptr(&ca->buckets_gc, b) + : NULL; } static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) { return rcu_dereference_check(ca->bucket_gens, - !ca->fs || - percpu_rwsem_is_held(&ca->fs->mark_lock) || - lockdep_is_held(&ca->fs->state_lock) || - lockdep_is_held(&ca->bucket_lock)); + lockdep_is_held(&ca->fs->state_lock)); } static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) @@ -308,26 +307,7 @@ int bch2_trans_mark_dev_sbs_flags(struct bch_fs *, enum btree_iter_update_trigger_flags); int bch2_trans_mark_dev_sbs(struct bch_fs *); -static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -{ - struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; - u64 b_offset = bucket_to_sector(ca, b); - u64 b_end = bucket_to_sector(ca, b + 1); - unsigned i; - - if (!b) - return true; - - for (i = 0; i < layout->nr_superblocks; i++) { - u64 offset = le64_to_cpu(layout->sb_offset[i]); - u64 end = offset + (1 << layout->sb_max_size_bits); - - if (!(offset >= b_end || end <= b_offset)) - return true; - } - - return false; -} +bool bch2_is_superblock_bucket(struct bch_dev *, u64); static inline const char *bch2_data_type_str(enum bch_data_type type) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 28bd09a253c83..7174047b8e920 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -24,7 +24,7 @@ struct bucket_gens { u16 first_bucket; size_t nbuckets; size_t nbuckets_minus_first; - u8 b[]; + u8 b[] __counted_by(nbuckets); }; struct bch_dev_usage { diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 2182b555c112c..46e9e32105a9f 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -6,11 +6,11 @@ #include "buckets.h" #include "chardev.h" #include "disk_accounting.h" +#include "fsck.h" #include "journal.h" #include "move.h" #include "recovery_passes.h" #include "replicas.h" -#include "super.h" #include "super-io.h" #include "thread_with_file.h" @@ -127,130 +127,6 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg } #endif -struct fsck_thread { - struct thread_with_stdio thr; - struct bch_fs *c; - struct bch_opts opts; -}; - -static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) -{ - struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); - kfree(thr); -} - -static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - int ret = PTR_ERR_OR_ZERO(c); - if (ret) - return ret; - - ret = bch2_fs_start(thr->c); - if (ret) - goto err; - - if (test_bit(BCH_FS_errors_fixed, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); - ret |= 1; - } - if (test_bit(BCH_FS_error, &c->flags)) { - bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); - ret |= 4; - } -err: - bch2_fs_stop(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_offline_thread_fn, -}; - -static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) -{ - struct bch_ioctl_fsck_offline arg; - struct fsck_thread *thr = NULL; - darray_str(devs) = {}; - long ret = 0; - - if (copy_from_user(&arg, user_arg, sizeof(arg))) - return -EFAULT; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - for (size_t i = 0; i < arg.nr_devs; i++) { - u64 dev_u64; - ret = 
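/*
 * With the buckets.h change above, gc_bucket() returns NULL for a bucket
 * index outside the device's range instead of handing back an out-of-range
 * genradix slot. Callers are expected to turn NULL into a consistency
 * error, as bch2_trigger_pointer() does in the buckets.c hunks earlier
 * (sketch, error handling abbreviated):
 *
 *	struct bucket *g = gc_bucket(ca, bucket.offset);
 *	if (bch2_fs_inconsistent_on(!g, c,
 *			"reference to invalid bucket on device %u",
 *			p.ptr.dev))
 *		return -BCH_ERR_trigger_pointer;
 *	bucket_lock(g);
 *	... update the in-memory bucket ...
 *	bucket_unlock(g);
 */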
copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); - if (ret) - goto err; - - char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); - ret = PTR_ERR_OR_ZERO(dev_str); - if (ret) - goto err; - - ret = darray_push(&devs, dev_str); - if (ret) { - kfree(dev_str); - goto err; - } - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - opt_set(thr->opts, read_only, 1); - opt_set(thr->opts, ratelimit_errors, 0); - - /* We need request_key() to be called before we punt to kthread: */ - opt_set(thr->opts, nostart, true); - - bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - - thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); - - if (!IS_ERR(thr->c) && - thr->c->opts.errors == BCH_ON_ERROR_panic) - thr->c->opts.errors = BCH_ON_ERROR_ro; - - ret = __bch2_run_thread_with_stdio(&thr->thr); -out: - darray_for_each(devs, i) - kfree(*i); - darray_exit(&devs); - return ret; -err: - if (thr) - bch2_fsck_thread_exit(&thr->thr); - pr_err("ret %s", bch2_err_str(ret)); - goto out; -} - static long bch2_global_ioctl(unsigned cmd, void __user *arg) { long ret; @@ -775,99 +651,6 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) -{ - struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); - struct bch_fs *c = thr->c; - - c->stdio_filter = current; - c->stdio = &thr->thr.stdio; - - /* - * XXX: can we figure out a way to do this without mucking with c->opts? 
- */ - unsigned old_fix_errors = c->opts.fix_errors; - if (opt_defined(thr->opts, fix_errors)) - c->opts.fix_errors = thr->opts.fix_errors; - else - c->opts.fix_errors = FSCK_FIX_ask; - - c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); - - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); - - clear_bit(BCH_FS_fsck_running, &c->flags); - bch_err_fn(c, ret); - - c->stdio = NULL; - c->stdio_filter = NULL; - c->opts.fix_errors = old_fix_errors; - - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - return ret; -} - -static const struct thread_with_stdio_ops bch2_online_fsck_ops = { - .exit = bch2_fsck_thread_exit, - .fn = bch2_fsck_online_thread_fn, -}; - -static long bch2_ioctl_fsck_online(struct bch_fs *c, - struct bch_ioctl_fsck_online arg) -{ - struct fsck_thread *thr = NULL; - long ret = 0; - - if (arg.flags) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!bch2_ro_ref_tryget(c)) - return -EROFS; - - if (down_trylock(&c->online_fsck_mutex)) { - bch2_ro_ref_put(c); - return -EAGAIN; - } - - thr = kzalloc(sizeof(*thr), GFP_KERNEL); - if (!thr) { - ret = -ENOMEM; - goto err; - } - - thr->c = c; - thr->opts = bch2_opts_empty(); - - if (arg.opts) { - char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); - - ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); - if (!IS_ERR(optstr)) - kfree(optstr); - - if (ret) - goto err; - } - - ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); -err: - if (ret < 0) { - bch_err_fn(c, ret); - if (thr) - bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); - bch2_ro_ref_put(c); - } - return ret; -} - #define BCH_IOCTL(_name, _argtype) \ do { \ _argtype i; \ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index ce8fc677bef90..3d4b5ead0afdb 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "checksum.h" #include "errcode.h" +#include "error.h" #include "super.h" #include "super-io.h" @@ -22,7 +23,7 @@ /* * bch2_checksum state is an abstraction of the checksum state calculated over different pages. * it features page merging without having the checksum algorithm lose its state. - * for native checksum aglorithms (like crc), a default seed value will do. + * for native checksum algorithms (like crc), a default seed value will do. 
* for hash-like algorithms, a state needs to be stored */ @@ -252,6 +253,10 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (!bch2_csum_type_is_encryption(type)) return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; + return do_encrypt(c->chacha20, nonce, data, len); } @@ -337,8 +342,9 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, size_t sgl_len = 0; int ret = 0; - if (!bch2_csum_type_is_encryption(type)) - return 0; + if (bch2_fs_inconsistent_on(!c->chacha20, + c, "attempting to encrypt without encryption key")) + return -BCH_ERR_no_encryption_key; darray_init(&sgl); diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index e40499fde9a40..43b9d71f2f2b4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -109,7 +109,7 @@ int bch2_enable_encryption(struct bch_fs *, bool); void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opt type, bool data) { switch (type) { diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 1410365a88915..f99ff18195979 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -2,13 +2,33 @@ #include "bcachefs.h" #include "checksum.h" #include "compress.h" +#include "error.h" #include "extents.h" +#include "opts.h" #include "super-io.h" #include #include #include +static inline enum bch_compression_opts bch2_compression_type_to_opt(enum bch_compression_type type) +{ + switch (type) { + case BCH_COMPRESSION_TYPE_none: + case BCH_COMPRESSION_TYPE_incompressible: + return BCH_COMPRESSION_OPT_none; + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: + return BCH_COMPRESSION_OPT_lz4; + case BCH_COMPRESSION_TYPE_gzip: + return BCH_COMPRESSION_OPT_gzip; + case BCH_COMPRESSION_TYPE_zstd: + return BCH_COMPRESSION_OPT_zstd; + default: + BUG(); + } +} + /* Bounce buffer: */ struct bbuf { void *b; @@ -158,6 +178,19 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, void *workspace; int ret; + enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); + mempool_t *workspace_pool = &c->compress_workspace[opt]; + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_type_not_marked_in_sb, + "compression type %s set but not marked in superblock", + __bch2_compression_types[crc.compression_type])) + ret = bch2_check_set_has_compressed_data(c, opt); + else + ret = -BCH_ERR_compression_workspace_not_initialized; + if (ret) + goto out; + } + src_data = bio_map_or_bounce(c, src, READ); switch (crc.compression_type) { @@ -176,13 +209,13 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, .avail_out = dst_len, }; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); ret = zlib_inflate(&strm, Z_FINISH); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != Z_STREAM_END) goto err; @@ -195,14 +228,14 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, if (real_src_len > src_len - 4) goto err; - workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, 
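/*
 * In the compress.c hunks here, decompression picks its workspace mempool
 * from the extent's on-disk compression type via the new
 * bch2_compression_type_to_opt() mapping, so each compression option owns a
 * single pool sized for both directions (sketch of the idiom):
 *
 *	enum bch_compression_opts opt =
 *		bch2_compression_type_to_opt(crc.compression_type);
 *	mempool_t *pool = &c->compress_workspace[opt];
 *	void *workspace = mempool_alloc(pool, GFP_NOFS);
 *	... inflate or zstd-decompress using the workspace ...
 *	mempool_free(workspace, pool);
 *
 * If the pool was never initialized (feature bit missing from the
 * superblock), the fsck_err path offers to repair the superblock via
 * bch2_check_set_has_compressed_data() instead of oopsing.
 */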
zstd_dctx_workspace_bound()); ret = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); - mempool_free(workspace, &c->decompress_workspace); + mempool_free(workspace, workspace_pool); if (ret != dst_len) goto err; @@ -212,6 +245,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, BUG(); } ret = 0; +fsck_err: out: bio_unmap_or_unbounce(c, src_data); return ret; @@ -394,8 +428,21 @@ static unsigned __bio_compress(struct bch_fs *c, unsigned pad; int ret = 0; - BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); - BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + /* bch2_compression_decode catches unknown compression types: */ + BUG_ON(compression.type >= BCH_COMPRESSION_OPT_NR); + + mempool_t *workspace_pool = &c->compress_workspace[compression.type]; + if (unlikely(!mempool_initialized(workspace_pool))) { + if (fsck_err(c, compression_opt_not_marked_in_sb, + "compression opt %s set but not marked in superblock", + bch2_compression_opts[compression.type])) { + ret = bch2_check_set_has_compressed_data(c, compression.type); + if (ret) /* memory allocation failure, don't compress */ + return 0; + } else { + return 0; + } + } /* If it's only one block, don't bother trying to compress: */ if (src->bi_iter.bi_size <= c->opts.block_size) @@ -404,7 +451,7 @@ static unsigned __bio_compress(struct bch_fs *c, dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); + workspace = mempool_alloc(workspace_pool, GFP_NOFS); *src_len = src->bi_iter.bi_size; *dst_len = dst->bi_iter.bi_size; @@ -447,7 +494,7 @@ static unsigned __bio_compress(struct bch_fs *c, *src_len = round_down(*src_len, block_bytes(c)); } - mempool_free(workspace, &c->compress_workspace[compression_type]); + mempool_free(workspace, workspace_pool); if (ret) goto err; @@ -477,6 +524,9 @@ static unsigned __bio_compress(struct bch_fs *c, err: ret = BCH_COMPRESSION_TYPE_incompressible; goto out; +fsck_err: + ret = 0; + goto out; } unsigned bch2_bio_compress(struct bch_fs *c, @@ -559,7 +609,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) { unsigned i; - mempool_exit(&c->decompress_workspace); for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) mempool_exit(&c->compress_workspace[i]); mempool_exit(&c->compression_bounce[WRITE]); @@ -568,7 +617,6 @@ void bch2_fs_compress_exit(struct bch_fs *c) static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) { - size_t decompress_workspace_size = 0; ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), c->opts.encoded_extent_max); @@ -576,19 +624,17 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) struct { unsigned feature; - enum bch_compression_type type; + enum bch_compression_opts type; size_t compress_workspace; - size_t decompress_workspace; } compression_types[] = { - { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, - max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS), - 0 }, - { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, - zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), - zlib_inflate_workspacesize(), }, - { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, - c->zstd_workspace_size, - zstd_dctx_workspace_bound() }, + { BCH_FEATURE_lz4, BCH_COMPRESSION_OPT_lz4, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, + { BCH_FEATURE_gzip, BCH_COMPRESSION_OPT_gzip, + max(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize()) }, + { BCH_FEATURE_zstd, 
BCH_COMPRESSION_OPT_zstd, + max(c->zstd_workspace_size, + zstd_dctx_workspace_bound()) }, }, *i; bool have_compressed = false; @@ -613,9 +659,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); i++) { - decompress_workspace_size = - max(decompress_workspace_size, i->decompress_workspace); - if (!(features & (1 << i->feature))) continue; @@ -628,11 +671,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return -BCH_ERR_ENOMEM_compression_workspace_init; } - if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) - return -BCH_ERR_ENOMEM_decompression_workspace_init; - return 0; } diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 8f4c3f0665c45..c6151495985fd 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -83,7 +83,7 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each_reverse(_d, _i) \ - for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i) + for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) #define darray_init(_d) \ do { \ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 8e75a852b3584..9cf1bce45baf9 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -110,11 +110,8 @@ static void trace_move_extent_fail2(struct data_update *m, { struct bch_fs *c = m->op.c; struct bkey_s_c old = bkey_i_to_s_c(m->k.k); - const union bch_extent_entry *entry; - struct bch_extent_ptr *ptr; - struct extent_ptr_decoded p; struct printbuf buf = PRINTBUF; - unsigned i, rewrites_found = 0; + unsigned rewrites_found = 0; if (!trace_move_extent_fail_enabled()) return; @@ -122,27 +119,25 @@ static void trace_move_extent_fail2(struct data_update *m, prt_str(&buf, msg); if (insert) { - i = 0; + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) - rewrites_found |= 1U << i; - i++; + rewrites_found |= ptr_bit; + ptr_bit <<= 1; } } - prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", - (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 2)) != 0, - (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); + prt_str(&buf, "rewrites found:\t"); + bch2_prt_u64_base2(&buf, rewrites_found); + prt_newline(&buf); - prt_printf(&buf, "\nrewrites found: %u%u%u%u", - (rewrites_found & (1 << 0)) != 0, - (rewrites_found & (1 << 1)) != 0, - (rewrites_found & (1 << 2)) != 0, - (rewrites_found & (1 << 3)) != 0); + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); prt_str(&buf, "\nold: "); bch2_bkey_val_to_text(&buf, c, old); @@ -194,7 +189,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, struct bpos next_pos; bool should_check_enospc; s64 i_sectors_delta = 0, disk_sectors_delta = 0; - unsigned rewrites_found = 0, durability, i; + unsigned rewrites_found = 0, durability, ptr_bit; bch2_trans_begin(trans); @@ -229,18 +224,18 @@ static int 
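/*
 * The data_update.c conversions that follow replace an index plus
 * "1U << i" with a bit mask that walks alongside the pointer iterator;
 * rewrite_ptrs and kill_ptrs are small per-pointer bitmasks, now printed
 * in base 2 via bch2_prt_u64_base2(). The iteration idiom:
 *
 *	unsigned ptr_bit = 1;
 *	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 *		if (ptr_bit & m->data_opts.rewrite_ptrs)
 *			...;	// this pointer is flagged for rewrite
 *		ptr_bit <<= 1;
 *	}
 */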
__bch2_data_update_index_update(struct btree_trans *trans, * other updates * @new: extent with new pointers that we'll be adding to @insert * - * Fist, drop rewrite_ptrs from @new: + * First, drop rewrite_ptrs from @new: */ - i = 0; + ptr_bit = 1; bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { - if (((1U << i) & m->data_opts.rewrite_ptrs) && + if ((ptr_bit & m->data_opts.rewrite_ptrs) && (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && !ptr->cached) { bch2_extent_ptr_set_cached(c, &m->op.opts, bkey_i_to_s(insert), ptr); - rewrites_found |= 1U << i; + rewrites_found |= ptr_bit; } - i++; + ptr_bit <<= 1; } if (m->data_opts.rewrite_ptrs && @@ -323,8 +318,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, * it's been hard to reproduce, so this should give us some more * information when it does occur: */ - int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), __btree_node_type(0, m->btree_id), - BCH_VALIDATE_commit); + int invalid = bch2_bkey_validate(c, bkey_i_to_s_c(insert), + (struct bkey_validate_context) { + .btree = m->btree_id, + .flags = BCH_VALIDATE_commit, + }); if (invalid) { struct printbuf buf = PRINTBUF; @@ -362,7 +360,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: - bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?: + bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: bch2_trans_update(trans, &iter, insert, BTREE_UPDATE_internal_snapshot_node) ?: bch2_trans_commit(trans, &op->res, @@ -540,7 +538,7 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_newline(out); prt_str(out, "compression:\t"); - bch2_compression_opt_to_text(out, background_compression(*io_opts)); + bch2_compression_opt_to_text(out, io_opts->background_compression); prt_newline(out); prt_str(out, "opts.replicas:\t"); @@ -614,7 +612,7 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; - unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; /* @@ -622,7 +620,7 @@ int bch2_data_update_init(struct btree_trans *trans, * and we have to check for this because we go rw before repairing the * snapshots table - just skip it, we can move it later. 
*/ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot))) + if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) return -BCH_ERR_data_update_done; if (!bkey_get_dev_refs(c, k)) @@ -652,22 +650,22 @@ int bch2_data_update_init(struct btree_trans *trans, BCH_WRITE_DATA_ENCODED| BCH_WRITE_MOVE| m->data_opts.write_flags; - m->op.compression_opt = background_compression(io_opts); + m->op.compression_opt = io_opts.background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; unsigned durability_have = 0, durability_removing = 0; - i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached) { rcu_read_lock(); - if (BIT(i) & m->data_opts.rewrite_ptrs) { + if (ptr_bit & m->data_opts.rewrite_ptrs) { if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); durability_removing += bch2_extent_ptr_desired_durability(c, &p); - } else if (!(BIT(i) & m->data_opts.kill_ptrs)) { + } else if (!(ptr_bit & m->data_opts.kill_ptrs)) { bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } @@ -687,7 +685,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; - i++; + ptr_bit <<= 1; } unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); @@ -705,7 +703,7 @@ int bch2_data_update_init(struct btree_trans *trans, /* * If device(s) were set to durability=0 after data was written to them - * we can end up with a duribilty=0 extent, and the normal algorithm + * we can end up with a durability=0 extent, and the normal algorithm * that tries not to increase durability doesn't work: */ if (!(durability_have + durability_removing)) @@ -750,14 +748,14 @@ int bch2_data_update_init(struct btree_trans *trans, void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { - if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { - opts->kill_ptrs |= 1U << i; - opts->rewrite_ptrs ^= 1U << i; + if ((opts->rewrite_ptrs & ptr_bit) && ptr->cached) { + opts->kill_ptrs |= ptr_bit; + opts->rewrite_ptrs ^= ptr_bit; } - i++; + ptr_bit <<= 1; } } diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 45aec1afdb0e3..b5de52a50d10b 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -472,7 +472,9 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); - prt_printf(out, "%px btree=%s l=%u\n", b, bch2_btree_id_str(b->c.btree_id), b->c.level); + prt_printf(out, "%px ", b); + bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); + prt_printf(out, "\n"); printbuf_indent_add(out, 2); diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index faffc98d56053..600eee936f136 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -101,7 +101,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = { }; int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); @@ -120,7 +120,7 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, * Check new keys don't 
exceed the max length * (older keys may be larger.) */ - bkey_fsck_err_on((flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && d_name.len > BCH_NAME_MAX, c, dirent_name_too_long, "dirent name too big (%u > %u)", d_name.len, BCH_NAME_MAX); @@ -266,7 +266,7 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } else { target->subvol = le32_to_cpu(d.v->d_child_subvol); - ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, target->subvol, true, &s); target->inum = le64_to_cpu(s.inode); } @@ -500,7 +500,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_dirents, SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { @@ -549,7 +549,7 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) bch2_bkey_buf_init(&sk); int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_dirents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_dirents, POS(inum.inum, ctx->pos), POS(inum.inum, U64_MAX), inum.subvol, 0, k, ({ diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 53ad996660226..362b3b2f2f2e3 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -4,10 +4,10 @@ #include "str_hash.h" -enum bch_validate_flags; extern const struct bch_hash_desc bch2_dirent_hash_desc; -int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_dirent_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_dirent ((struct bkey_ops) { \ diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index 07eb8fa1b0266..78ba98147f54e 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -25,7 +25,7 @@ * expensive, so we also have * * - In memory accounting, where accounting is stored as an array of percpu - * counters, indexed by an eytzinger array of disk acounting keys/bpos (which + * counters, indexed by an eytzinger array of disk accounting keys/bpos (which * are the same thing, excepting byte swabbing on big endian). * * Cheap to read, but non persistent. @@ -79,6 +79,8 @@ static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_ memcpy_u64s_small(acc->v.d, d, nr); } +static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); + int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) @@ -96,9 +98,16 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, accounting_key_init(&k_i.k, k, d, nr); - return likely(!gc) - ? 
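/*
 * The rewritten bch2_disk_accounting_mod() in this hunk keeps two paths:
 * the normal transactional path buffers the delta through the btree write
 * buffer, while the gc path applies it straight to the in-memory percpu
 * counters, now retrying once after adding a missing replicas entry to the
 * superblock. A hypothetical caller adjusting the per-device data-type
 * counters (modeled on existing users; the counter layout of
 * buckets/sectors/fragmented is an assumption here):
 *
 *	struct disk_accounting_pos acc = {
 *		.type				= BCH_DISK_ACCOUNTING_dev_data_type,
 *		.dev_data_type.dev		= ca->dev_idx,
 *		.dev_data_type.data_type	= BCH_DATA_user,
 *	};
 *	s64 d[3] = { bucket_delta, sector_delta, fragmented_delta };
 *	int ret = bch2_disk_accounting_mod(trans, &acc, d, ARRAY_SIZE(d), gc);
 */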
bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k) - : bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (unlikely(gc)) { + int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + if (ret == -BCH_ERR_btree_insert_need_mark_replicas) + ret = drop_locks_do(trans, + bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: + bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); + return ret; + } else { + return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); + } } int bch2_mod_dev_cached_sectors(struct btree_trans *trans, @@ -127,14 +136,15 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, k.k->p); void *end = &acc_k + 1; int ret = 0; - bkey_fsck_err_on(bversion_zero(k.k->bversion), + bkey_fsck_err_on((from.flags & BCH_VALIDATE_commit) && + bversion_zero(k.k->bversion), c, accounting_key_version_0, "accounting key with version=0"); @@ -217,7 +227,8 @@ void bch2_accounting_key_to_text(struct printbuf *out, struct disk_accounting_po prt_printf(out, "id=%u", k->snapshot.id); break; case BCH_DISK_ACCOUNTING_btree: - prt_printf(out, "btree=%s", bch2_btree_id_str(k->btree.id)); + prt_str(out, "btree="); + bch2_btree_id_to_text(out, k->btree.id); break; } } @@ -243,10 +254,10 @@ void bch2_accounting_swab(struct bkey_s k) } static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r, - struct disk_accounting_pos acc) + struct disk_accounting_pos *acc) { - unsafe_memcpy(r, &acc.replicas, - replicas_entry_bytes(&acc.replicas), + unsafe_memcpy(r, &acc->replicas, + replicas_entry_bytes(&acc->replicas), "variable length struct"); } @@ -257,7 +268,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc switch (acc_k.type) { case BCH_DISK_ACCOUNTING_replicas: - __accounting_to_replicas(r, acc_k); + __accounting_to_replicas(r, &acc_k); return true; default: return false; @@ -322,6 +333,14 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, NULL); + + if (trace_accounting_mem_insert_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_accounting_to_text(&buf, c, a.s_c); + trace_accounting_mem_insert(c, buf.buf); + printbuf_exit(&buf); + } return 0; err: free_percpu(n.v[1]); @@ -383,7 +402,7 @@ void bch2_accounting_mem_gc(struct bch_fs *c) * Read out accounting keys for replicas entries, as an array of * bch_replicas_usage entries. * - * Note: this may be deprecated/removed at smoe point in the future and replaced + * Note: this may be deprecated/removed at some point in the future and replaced * with something more general, it exists to support the ioctl used by the * 'bcachefs fs usage' command. 
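 *
 * As a rough sketch of what reading one in-memory accounting entry looks like
 * (simplified stand-in types, not the real interface): each entry keeps percpu
 * counters, and a read just sums the per-CPU values:
 *
 *	struct mem_acct_entry {
 *		struct bpos	pos;		// accounting position, as a bpos
 *		unsigned	nr_counters;
 *		u64 __percpu	*v;		// nr_counters u64s per CPU
 *	};
 *
 *	static void mem_acct_read(struct mem_acct_entry *e, u64 *out, unsigned nr)
 *	{
 *		int cpu;
 *
 *		memset(out, 0, sizeof(*out) * nr);
 *		for_each_possible_cpu(cpu)
 *			for (unsigned j = 0; j < min(nr, e->nr_counters); j++)
 *				out[j] += per_cpu_ptr(e->v, cpu)[j];
 *	}
 *
 * Writers touch only their local CPU's counters; readers pay an O(nr_cpus)
 * summation - the cheap-to-update, non-persistent tradeoff described above.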
*/ @@ -461,32 +480,6 @@ int bch2_fs_accounting_read(struct bch_fs *c, darray_char *out_buf, unsigned acc return ret; } -void bch2_fs_accounting_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_accounting_mem *acc = &c->accounting; - - percpu_down_read(&c->mark_lock); - out->atomic++; - - eytzinger0_for_each(i, acc->k.nr) { - struct disk_accounting_pos acc_k; - bpos_to_disk_accounting_pos(&acc_k, acc->k.data[i].pos); - - bch2_accounting_key_to_text(out, &acc_k); - - u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false); - - prt_str(out, ":"); - for (unsigned j = 0; j < acc->k.data[i].nr_counters; j++) - prt_printf(out, " %llu", v[j]); - prt_newline(out); - } - - --out->atomic; - percpu_up_read(&c->mark_lock); -} - static void bch2_accounting_free_counters(struct bch_accounting_mem *acc, bool gc) { darray_for_each(acc->k, e) { @@ -625,7 +618,7 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, switch (acc.type) { case BCH_DISK_ACCOUNTING_replicas: { struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, acc); + __accounting_to_replicas(&r.e, &acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && @@ -699,11 +692,45 @@ int bch2_accounting_read(struct bch_fs *c) struct btree_trans *trans = bch2_trans_get(c); struct printbuf buf = PRINTBUF; - int ret = for_each_btree_key(trans, iter, - BTREE_ID_accounting, POS_MIN, + /* + * We might run more than once if we rewind to start topology repair or + * btree node scan - and those might cause us to get different results, + * so we can't just skip if we've already run. + * + * Instead, zero out any accounting we have: + */ + percpu_down_write(&c->mark_lock); + darray_for_each(acc->k, e) + percpu_memset(e->v[0], 0, sizeof(u64) * e->nr_counters); + for_each_member_device(c, ca) + percpu_memset(ca->usage, 0, sizeof(*ca->usage)); + percpu_memset(c->usage, 0, sizeof(*c->usage)); + percpu_up_write(&c->mark_lock); + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_accounting, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + iter.flags &= ~BTREE_ITER_with_journal; + int ret = for_each_btree_key_continue(trans, iter, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, ({ struct bkey u; struct bkey_s_c k = bch2_btree_path_peek_slot_exact(btree_iter_path(trans, &iter), &u); + + if (k.k->type != KEY_TYPE_accounting) + continue; + + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, k.k->p); + + if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) + break; + + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + continue; + } + accounting_read_key(trans, k); })); if (ret) @@ -715,6 +742,12 @@ int bch2_accounting_read(struct bch_fs *c) darray_for_each(*keys, i) { if (i->k->k.type == KEY_TYPE_accounting) { + struct disk_accounting_pos acc_k; + bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); + + if (!bch2_accounting_is_mem(acc_k)) + continue; + struct bkey_s_c k = bkey_i_to_s_c(i->k); unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), @@ -748,15 +781,16 @@ int bch2_accounting_read(struct bch_fs *c) keys->gap = keys->nr = dst - keys->data; percpu_down_write(&c->mark_lock); - unsigned i = 0; - while (i < acc->k.nr) { - unsigned idx = inorder_to_eytzinger0(i, acc->k.nr); + darray_for_each_reverse(acc->k, i) { struct disk_accounting_pos acc_k; - 
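The accounting-read rewrite here also changes how stale entries are dropped:
instead of walking the eytzinger inorder index and re-sorting after every
removal, it walks the backing darray in reverse and re-sorts once at the end.
A minimal standalone sketch of that pattern (hypothetical entry type;
swap-remove shown for brevity, an order-preserving remove works the same way;
resort_search_index() is a stand-in for eytzinger0_sort()):

	struct entry { u64 v[4]; };

	void resort_search_index(struct entry *, size_t);

	static void prune_zeroed_entries(struct entry *data, size_t *nr)
	{
		/* reverse order: the element swapped into slot i was already visited */
		for (size_t i = *nr; i-- > 0;)
			if (!data[i].v[0] && !data[i].v[1] &&
			    !data[i].v[2] && !data[i].v[3])
				data[i] = data[--*nr];

		/* one O(n log n) re-sort instead of one per removal: */
		resort_search_index(data, *nr);
	}
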
bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos); + bpos_to_disk_accounting_pos(&acc_k, i->pos); u64 v[BCH_ACCOUNTING_MAX_COUNTERS]; - bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false); + memset(v, 0, sizeof(v)); + + for (unsigned j = 0; j < i->nr_counters; j++) + v[j] = percpu_u64_get(i->v[0] + j); /* * If the entry counters are zeroed, it should be treated as @@ -765,26 +799,25 @@ int bch2_accounting_read(struct bch_fs *c) * Remove it, so that if it's re-added it gets re-marked in the * superblock: */ - ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters) + ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, - v, acc->k.data[idx].nr_counters); + : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { - free_percpu(acc->k.data[idx].v[0]); - free_percpu(acc->k.data[idx].v[1]); - darray_remove_item(&acc->k, &acc->k.data[idx]); - eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), - accounting_pos_cmp, NULL); + free_percpu(i->v[0]); + free_percpu(i->v[1]); + darray_remove_item(&acc->k, i); ret = 0; continue; } if (ret) goto fsck_err; - i++; } + eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), + accounting_pos_cmp, NULL); + preempt_disable(); struct bch_fs_usage_base *usage = this_cpu_ptr(c->usage); @@ -804,7 +837,7 @@ int bch2_accounting_read(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; percpu_u64_set(&d->buckets, v[0]); @@ -881,10 +914,13 @@ void bch2_verify_accounting_clean(struct bch_fs *c) bpos_to_disk_accounting_pos(&acc_k, k.k->p); if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) - continue; + break; - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) { + struct disk_accounting_pos next = { .type = acc_k.type + 1 }; + bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); continue; + } bch2_accounting_mem_read(c, k.k->p, v, nr); @@ -910,7 +946,7 @@ void bch2_verify_accounting_clean(struct bch_fs *c) break; case BCH_DISK_ACCOUNTING_dev_data_type: { rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (!ca) { rcu_read_unlock(); continue; diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 4ea6c8a092bc1..5360cbb3ec298 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_DISK_ACCOUNTING_H #define _BCACHEFS_DISK_ACCOUNTING_H +#include "btree_update.h" #include "eytzinger.h" #include "sb-members.h" @@ -62,27 +63,32 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage static inline void bpos_to_disk_accounting_pos(struct disk_accounting_pos *acc, struct bpos p) { - acc->_pad = p; + BUILD_BUG_ON(sizeof(*acc) != sizeof(p)); + #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - bch2_bpos_swab(&acc->_pad); + acc->_pad = p; +#else + memcpy_swab(acc, &p, sizeof(p)); #endif } -static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *k) +static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos *acc) { - struct bpos 
ret = k->_pad; - + struct bpos p; #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - bch2_bpos_swab(&ret); + p = acc->_pad; +#else + memcpy_swab(&p, acc, sizeof(p)); #endif - return ret; + return p; } int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); -int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_accounting_key_to_text(struct printbuf *, struct disk_accounting_pos *); void bch2_accounting_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_accounting_swab(struct bkey_s); @@ -112,6 +118,12 @@ enum bch_accounting_mode { int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +{ + return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc.type != BCH_DISK_ACCOUNTING_inum; +} + /* * Update in memory counters so they match the btree update we're doing; called * from transaction commit path @@ -126,9 +138,10 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, bpos_to_disk_accounting_pos(&acc_k, a.k->p); bool gc = mode == BCH_ACCOUNTING_gc; - EBUG_ON(gc && !acc->gc_running); + if (gc && !acc->gc_running) + return 0; - if (acc_k.type == BCH_DISK_ACCOUNTING_inum) + if (!bch2_accounting_is_mem(acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { @@ -141,7 +154,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, break; case BCH_DISK_ACCOUNTING_dev_data_type: rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu(c, acc_k.dev_data_type.dev); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); @@ -204,9 +217,45 @@ static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, bch2_accounting_mem_read_counters(acc, idx, v, nr, false); } +static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) +{ + EBUG_ON(!res->ref); + + return (struct bversion) { + .hi = res->seq >> 32, + .lo = (res->seq << 32) | (res->offset + offset), + }; +} + +static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, + struct bkey_i_accounting *a, + unsigned commit_flags) +{ + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, + (u64 *) a - (u64 *) trans->journal_entries); + + EBUG_ON(bversion_zero(a->k.bversion)); + + return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) + ? 
bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) + : 0; +} + +static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans, + struct bkey_i_accounting *a_i, + unsigned commit_flags) +{ + if (likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply))) { + struct bkey_s_accounting a = accounting_i_to_s(a_i); + + bch2_accounting_neg(a); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); + bch2_accounting_neg(a); + } +} + int bch2_fs_replicas_usage_read(struct bch_fs *, darray_char *); int bch2_fs_accounting_read(struct bch_fs *, darray_char *, unsigned); -void bch2_fs_accounting_to_text(struct printbuf *, struct bch_fs *); int bch2_gc_accounting_start(struct bch_fs *); int bch2_gc_accounting_done(struct bch_fs *); diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 7b6e6c97e6aa6..6588f69e98d7e 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -10,7 +10,7 @@ * Here, the key has considerably more structure than a typical key (bpos); an * accounting key is 'struct disk_accounting_pos', which is a union of bpos. * - * More specifically: a key is just a muliword integer (where word endianness + * More specifically: a key is just a multiword integer (where word endianness * matches native byte order), so we're treating bpos as an opaque 20 byte * integer and mapping bch_accounting_key to that. * diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 749dcf368841d..859db4f2160b9 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -26,6 +26,7 @@ #include "util.h" #include <linux/sort.h> +#include <linux/string_choices.h> #ifdef __KERNEL__ @@ -109,7 +110,7 @@ struct ec_bio { /* Stripes btree keys: */ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; int ret = 0; @@ -129,7 +130,7 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k, "invalid csum granularity (%u >= 64)", s->csum_granularity_bits); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -304,13 +305,12 @@ static int mark_stripe_bucket(struct btree_trans *trans, } if (flags & BTREE_TRIGGER_gc) { - percpu_down_read(&c->mark_lock); struct bucket *g = gc_bucket(ca, bucket.offset); if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -BCH_ERR_mark_stripe; - goto err_unlock; + goto err; } bucket_lock(g); @@ -318,8 +318,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, ret = __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &new, flags); alloc_to_bucket(g, new); bucket_unlock(g); -err_unlock: - percpu_up_read(&c->mark_lock); + if (!ret) ret = bch2_alloc_key_to_dev_counters(trans, ca, &old, &new, flags); } @@ -732,7 +731,7 @@ static void ec_block_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "erasure coding %s error: %s", - bio_data_dir(bio) ?
"write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); @@ -909,19 +908,19 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, bch2_bkey_val_to_text(&msgbuf, c, orig_k); bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); - printbuf_exit(&msgbuf);; + printbuf_exit(&msgbuf); ret = -BCH_ERR_stripe_reconstruct; goto out; } /* stripe bucket accounting: */ -static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx) { ec_stripes_heap n, *h = &c->ec_stripes_heap; if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), GFP_KERNEL)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; mutex_lock(&c->ec_stripes_heap_lock); @@ -935,11 +934,11 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) free_heap(&n); } - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) + if (!genradix_ptr_alloc(&c->stripes, idx, GFP_KERNEL)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; if (c->gc_pos.phase != GC_PHASE_not_running && - !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) + !genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; return 0; @@ -949,7 +948,7 @@ static int ec_stripe_mem_alloc(struct btree_trans *trans, struct btree_iter *iter) { return allocate_dropping_locks_errcode(trans, - __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); + __ec_stripe_mem_alloc(trans->c, iter->pos.offset)); } /* @@ -1275,11 +1274,11 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket, u8 gen, struct ec_stripe_buf *s, - struct bpos *bp_pos) + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) { struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_fs *c = trans->c; - struct bch_backpointer bp; struct btree_iter iter; struct bkey_s_c k; const struct bch_extent_ptr *ptr_c; @@ -1288,33 +1287,26 @@ static int ec_stripe_update_extent(struct btree_trans *trans, struct bkey_i *n; int ret, dev, block; - ret = bch2_get_next_backpointer(trans, ca, bucket, gen, - bp_pos, &bp, BTREE_ITER_cached); - if (ret) - return ret; - if (bpos_eq(*bp_pos, SPOS_MAX)) - return 0; - - if (bp.level) { + if (bp.v->level) { struct printbuf buf = PRINTBUF; struct btree_iter node_iter; struct btree *b; - b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &node_iter, last_flushed); bch2_trans_iter_exit(trans, &node_iter); if (!b) return 0; prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); - bch2_backpointer_to_text(&buf, &bp); + bch2_bkey_val_to_text(&buf, c, bp.s_c); bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); return -EIO; } - k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_intent); + k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); ret = bkey_err(k); if (ret) return ret; @@ -1373,7 +1365,6 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_fs *c = trans->c; struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; struct bch_extent_ptr ptr = v->ptrs[block]; - struct bpos bp_pos = POS_MIN; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); @@ -1382,19 +1373,27 @@ static int ec_stripe_update_bucket(struct btree_trans 
*trans, struct ec_stripe_b struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); - while (1) { - ret = commit_do(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc, - ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, &bp_pos)); - if (ret) - break; - if (bkey_eq(bp_pos, POS_MAX)) + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket_pos), + bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc, ({ + if (bkey_ge(bp_k.k->p, bucket_pos_to_bp(ca, bpos_nosnap_successor(bucket_pos), 0))) break; - bp_pos = bpos_nosnap_successor(bp_pos); - } + if (bp_k.k->type != KEY_TYPE_backpointer) + continue; + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, + bkey_s_c_to_backpointer(bp_k), &last_flushed); + })); + + bch2_bkey_buf_exit(&last_flushed, c); bch2_dev_put(ca); return ret; } @@ -1716,7 +1715,7 @@ static void ec_stripe_key_init(struct bch_fs *c, set_bkey_val_u64s(&s->k, u64s); } -static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; @@ -1724,7 +1723,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) s = kzalloc(sizeof(*s), GFP_KERNEL); if (!s) - return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + return NULL; mutex_init(&s->lock); closure_init(&s->iodone, NULL); @@ -1739,10 +1738,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, s->nr_parity, h->blocksize, h->disk_label); - - h->s = s; - h->nr_created++; - return 0; + return s; } static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) @@ -1887,25 +1883,26 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, return h; } -static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, +static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct ec_stripe_head *h, struct ec_stripe_new *s, enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; struct bch_devs_mask devs = h->devs; struct open_bucket *ob; struct open_buckets buckets; - struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; + struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; bool have_cache = true; int ret = 0; - BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); - BUG_ON(v->nr_redundant != h->s->nr_parity); + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); + BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); - for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { + for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* * Note: we don't yet repair invalid blocks (failed/removed * devices) when reusing stripes - we still need a codepath to @@ -1915,21 +1912,21 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) __clear_bit(v->ptrs[i].dev, devs.d); - if (i < h->s->nr_data) + if (i < s->nr_data) nr_have_data++; else nr_have_parity++; } - BUG_ON(nr_have_data > h->s->nr_data); - 
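Earlier in this hunk, ec_stripe_update_bucket() stops fetching backpointers one
at a time via bch2_get_next_backpointer() and instead walks the bucket's
backpointer range as ordinary btree keys. Un-diffed, the shape of the new loop
is roughly the following (identifiers as in the patch; the extra bkey_ge()
check in the patch additionally bounds the walk at the bucket's end):

	struct bkey_buf last_flushed;
	bch2_bkey_buf_init(&last_flushed);
	bkey_init(&last_flushed.k->k);

	ret = for_each_btree_key_max_commit(trans, bp_iter, BTREE_ID_backpointers,
			bucket_pos_to_bp_start(ca, bucket_pos),
			bucket_pos_to_bp_end(ca, bucket_pos), 0, bp_k,
			NULL, NULL,
			BCH_TRANS_COMMIT_no_check_rw|
			BCH_TRANS_COMMIT_no_enospc, ({
		/* the range may contain keys that aren't live backpointers: */
		if (bp_k.k->type != KEY_TYPE_backpointer)
			continue;

		ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s,
					bkey_s_c_to_backpointer(bp_k),
					&last_flushed);
	}));

	bch2_bkey_buf_exit(&last_flushed, c);
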
BUG_ON(nr_have_parity > h->s->nr_parity); + BUG_ON(nr_have_data > s->nr_data); + BUG_ON(nr_have_parity > s->nr_parity); buckets.nr = 0; - if (nr_have_parity < h->s->nr_parity) { + if (nr_have_parity < s->nr_parity) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->parity_stripe, &devs, - h->s->nr_parity, + s->nr_parity, &nr_have_parity, &have_cache, 0, BCH_DATA_parity, @@ -1937,14 +1934,14 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data + h->s->nr_parity, - h->s->nr_data); - BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data + s->nr_parity, + s->nr_data); + BUG_ON(j >= s->nr_data + s->nr_parity); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -1952,11 +1949,11 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ } buckets.nr = 0; - if (nr_have_data < h->s->nr_data) { + if (nr_have_data < s->nr_data) { ret = bch2_bucket_alloc_set_trans(trans, &buckets, &h->block_stripe, &devs, - h->s->nr_data, + s->nr_data, &nr_have_data, &have_cache, 0, BCH_DATA_user, @@ -1964,13 +1961,13 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_ cl); open_bucket_for_each(c, &buckets, ob, i) { - j = find_next_zero_bit(h->s->blocks_gotten, - h->s->nr_data, 0); - BUG_ON(j >= h->s->nr_data); + j = find_next_zero_bit(s->blocks_gotten, + s->nr_data, 0); + BUG_ON(j >= s->nr_data); - h->s->blocks[j] = buckets.v[i]; + s->blocks[j] = buckets.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); - __set_bit(j, h->s->blocks_gotten); + __set_bit(j, s->blocks_gotten); } if (ret) @@ -2016,73 +2013,78 @@ static s64 get_existing_stripe(struct bch_fs *c, return ret; } -static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new *s) { - struct bch_fs *c = trans->c; - struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; - struct bch_stripe *existing_v; + struct bch_stripe *new_v = &bkey_i_to_stripe(&s->new_stripe.key)->v; + struct bch_stripe *existing_v = &bkey_i_to_stripe(&s->existing_stripe.key)->v; unsigned i; - s64 idx; - int ret; - - /* - * If we can't allocate a new stripe, and there's no stripes with empty - * blocks for us to reuse, that means we have to wait on copygc: - */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; - - ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, h->s); - return ret; - } - existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; - - BUG_ON(existing_v->nr_redundant != h->s->nr_parity); - h->s->nr_data = existing_v->nr_blocks - + BUG_ON(existing_v->nr_redundant != s->nr_parity); + s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { - bch2_stripe_close(c, h->s); + bch2_stripe_close(c, s); return ret; } - BUG_ON(h->s->existing_stripe.size != h->blocksize); - 
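The find_next_zero_bit() bookkeeping used above, reduced to a self-contained
sketch: parity slots live after the data slots in the block array, so parity
allocation searches the bitmap starting at nr_data (sizes here are
hypothetical, not the real stripe layout):

	#define EC_MAX_BLOCKS	32

	static unsigned long blocks_gotten[BITS_TO_LONGS(EC_MAX_BLOCKS)];

	/* returns the claimed block index, or -ENOSPC if all slots are taken */
	static int claim_parity_slot(unsigned nr_data, unsigned nr_total)
	{
		unsigned long j = find_next_zero_bit(blocks_gotten, nr_total, nr_data);

		if (j >= nr_total)
			return -ENOSPC;
		__set_bit(j, blocks_gotten);
		return j;
	}
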
BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); + BUG_ON(s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); /* * Free buckets we initially allocated - they might conflict with * blocks from the stripe we're reusing: */ - for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { - bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); - h->s->blocks[i] = 0; + for_each_set_bit(i, s->blocks_gotten, new_v->nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + s->blocks[i]); + s->blocks[i] = 0; } - memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); - memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + memset(s->blocks_gotten, 0, sizeof(s->blocks_gotten)); + memset(s->blocks_allocated, 0, sizeof(s->blocks_allocated)); - for (i = 0; i < existing_v->nr_blocks; i++) { + for (unsigned i = 0; i < existing_v->nr_blocks; i++) { if (stripe_blockcount_get(existing_v, i)) { - __set_bit(i, h->s->blocks_gotten); - __set_bit(i, h->s->blocks_allocated); + __set_bit(i, s->blocks_gotten); + __set_bit(i, s->blocks_allocated); } - ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + ec_block_io(c, &s->existing_stripe, READ, i, &s->iodone); } - bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); - h->s->have_existing_stripe = true; + bkey_copy(&s->new_stripe.key, &s->existing_stripe.key); + s->have_existing_stripe = true; return 0; } -static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) +{ + struct bch_fs *c = trans->c; + s64 idx; + int ret; + + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ + idx = get_existing_stripe(c, h); + if (idx < 0) + return -BCH_ERR_stripe_alloc_blocked; + + ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); + bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, + "reading stripe key: %s", bch2_err_str(ret)); + if (ret) { + bch2_stripe_close(c, s); + return ret; + } + + return init_new_stripe_from_existing(c, s); +} + +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h, + struct ec_stripe_new *s) { struct bch_fs *c = trans->c; struct btree_iter iter; @@ -2091,15 +2093,19 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); int ret; - if (!h->s->res.sectors) { - ret = bch2_disk_reservation_get(c, &h->s->res, + if (!s->res.sectors) { + ret = bch2_disk_reservation_get(c, &s->res, h->blocksize, - h->s->nr_parity, + s->nr_parity, BCH_DISK_RESERVATION_NOFAIL); if (ret) return ret; } + /* + * Allocate stripe slot + * XXX: we're going to need a bitrange btree of free stripes + */ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, BTREE_ITER_slots|BTREE_ITER_intent, k, ret) { if (bkey_gt(k.k->p, POS(0, U32_MAX))) { @@ -2114,7 +2120,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st } if (bkey_deleted(k.k) && - bch2_try_open_stripe(c, h->s, k.k->p.offset)) + bch2_try_open_stripe(c, s, k.k->p.offset)) break; } @@ -2125,16 +2131,16 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st ret = ec_stripe_mem_alloc(trans, &iter); if (ret) { - bch2_stripe_close(c, h->s); + 
bch2_stripe_close(c, s); goto err; } - h->s->new_stripe.key.k.p = iter.pos; + s->new_stripe.key.k.p = iter.pos; out: bch2_trans_iter_exit(trans, &iter); return ret; err: - bch2_disk_reservation_put(c, &h->s->res); + bch2_disk_reservation_put(c, &s->res); goto out; } @@ -2165,22 +2171,27 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, return h; if (!h->s) { - ret = ec_new_stripe_alloc(c, h); - if (ret) { + h->s = ec_new_stripe_alloc(c, h); + if (!h->s) { + ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; bch_err(c, "failed to allocate new stripe"); goto err; } + + h->nr_created++; } - if (h->s->allocated) + struct ec_stripe_new *s = h->s; + + if (s->allocated) goto allocated; - if (h->s->have_existing_stripe) + if (s->have_existing_stripe) goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2192,15 +2203,15 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * existing stripe: */ while (1) { - ret = __bch2_ec_stripe_head_reuse(trans, h); + ret = __bch2_ec_stripe_head_reuse(trans, h, s); if (!ret) break; if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; if (watermark == BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: - __bch2_ec_stripe_head_reserve(trans, h); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; goto allocate_buf; @@ -2218,19 +2229,19 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, watermark, cl); + ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); if (ret) goto err; allocate_buf: - ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); if (ret) goto err; - h->s->allocated = true; + s->allocated = true; allocated: - BUG_ON(!h->s->idx); - BUG_ON(!h->s->new_stripe.data[0]); + BUG_ON(!s->idx); + BUG_ON(!s->new_stripe.data[0]); BUG_ON(trans->restarted); return h; err: @@ -2295,7 +2306,7 @@ static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_ int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) { return bch2_trans_run(c, - for_each_btree_key_upto_commit(trans, iter, + for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ @@ -2371,7 +2382,7 @@ int bch2_stripes_read(struct bch_fs *c) if (k.k->type != KEY_TYPE_stripe) continue; - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + ret = __ec_stripe_mem_alloc(c, k.k->p.offset); if (ret) break; @@ -2458,11 +2469,9 @@ void bch2_fs_ec_exit(struct bch_fs *c) while (1) { mutex_lock(&c->ec_stripe_head_lock); - h = list_first_entry_or_null(&c->ec_stripe_head_list, - struct ec_stripe_head, list); - if (h) - list_del(&h->list); + h = list_pop_entry(&c->ec_stripe_head_list, struct ec_stripe_head, list); mutex_unlock(&c->ec_stripe_head_lock); + if (!h) break; diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 43326370b410a..583ca6a226dae 100644 --- a/fs/bcachefs/ec.h 
+++ b/fs/bcachefs/ec.h @@ -6,9 +6,8 @@ #include "buckets_types.h" #include "extents_types.h" -enum bch_validate_flags; - -int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_stripe_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned, diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 9c4fe5cdbfb77..099999443d4d2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -54,7 +54,8 @@ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ - x(ENOMEM, ENOMEM_decompression_workspace_init) \ + x(ENOMEM, ENOMEM_backpointer_mismatches_bitmap) \ + x(EIO, compression_workspace_not_initialized) \ x(ENOMEM, ENOMEM_bucket_gens) \ x(ENOMEM, ENOMEM_buckets_nouse) \ x(ENOMEM, ENOMEM_usage_init) \ @@ -116,6 +117,7 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ @@ -148,6 +150,7 @@ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(BCH_ERR_transaction_restart, transaction_restart_commit) \ x(0, no_btree_node) \ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ @@ -164,7 +167,6 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ x(0, backpointer_to_overwritten_btree_node) \ - x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ x(BCH_ERR_fsck, fsck_fix) \ @@ -173,7 +175,9 @@ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(0, restart_recovery) \ + x(EINVAL, restart_recovery) \ + x(EINVAL, not_in_recovery) \ + x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ @@ -192,7 +196,9 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ - x(EINVAL, btree_iter_with_journal_not_supported) \ + x(EINVAL, no_resize_with_buckets_nouse) \ + x(EINVAL, inode_unpack_error) \ + x(EINVAL, varint_decode_error) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -241,7 +247,10 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + x(EIO, journal_shutdown) \ + x(EIO, journal_flush_err) \ x(EIO, btree_node_read_err) \ + x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ x(EIO, btree_node_write_all_failed) \ x(EIO, btree_node_read_error) \ @@ -257,6 +266,8 @@ x(EIO, no_device_to_read_from) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ + x(EIO, no_encryption_key) \ + x(EIO, insufficient_journal_devices) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, 
btree_node_read_err_must_retry) \ @@ -305,6 +316,7 @@ static inline long bch2_err_class(long err) #define BLK_STS_REMOVED ((__force blk_status_t)128) +#include <linux/blk_types.h> const char *bch2_blk_status_to_str(blk_status_t); -#endif /* _BCACHFES_ERRCODE_H */ +#endif /* _BCACHEFS_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index b679def8fb98c..6c03a04c82a3c 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_iter.h" #include "error.h" +#include "fs-common.h" #include "journal.h" #include "recovery_passes.h" #include "super.h" @@ -33,7 +35,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + if (!test_bit(BCH_FS_recovery_running, &c->flags)) { bch2_inconsistent_error(c); return -BCH_ERR_btree_need_topology_repair; } else { @@ -218,6 +220,30 @@ static const u8 fsck_flags_extra[] = { #undef x }; +static int do_fsck_ask_yn(struct bch_fs *c, + struct btree_trans *trans, + struct printbuf *question, + const char *action) +{ + prt_str(question, ", "); + prt_str(question, action); + + if (bch2_fs_stdio_redirect(c)) + bch2_print(c, "%s", question->buf); + else + bch2_print_string_as_lines(KERN_ERR, question->buf); + + int ask = bch2_fsck_ask_yn(c, trans); + + if (trans) { + int ret = bch2_trans_relock(trans); + if (ret) + return ret; + } + + return ask; +} + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, @@ -226,7 +252,7 @@ int __bch2_fsck_err(struct bch_fs *c, { struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false, inconsistent = false; + bool print = true, suppressing = false, inconsistent = false, exiting = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; @@ -256,9 +282,10 @@ int __bch2_fsck_err(struct bch_fs *c, !trans && bch2_current_has_btree_trans(c)); - if ((flags & FSCK_CAN_FIX) && - test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_fix; + if (test_bit(err, c->sb.errors_silent)) + return flags & FSCK_CAN_FIX + ?
-BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; bch2_sb_error_count(c, err); @@ -284,21 +311,19 @@ int __bch2_fsck_err(struct bch_fs *c, if (s) { /* * We may be called multiple times for the same error on - * transaction restart - this memorizes instead of asking the user + * transaction restart - this memoizes instead of asking the user * multiple times for the same error: */ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { ret = s->ret; - mutex_unlock(&c->fsck_error_msgs_lock); - goto err; + goto err_unlock; } kfree(s->last_msg); s->last_msg = kstrdup(buf.buf, GFP_KERNEL); if (!s->last_msg) { - mutex_unlock(&c->fsck_error_msgs_lock); ret = -ENOMEM; - goto err; + goto err_unlock; } if (c->opts.ratelimit_errors && @@ -318,13 +343,19 @@ int __bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if ((flags & FSCK_CAN_FIX) && - (flags & FSCK_AUTOFIX) && + if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) { prt_str(out, ", "); - prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + if (flags & FSCK_CAN_FIX) { + prt_actioning(out, action); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + + goto print; } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { @@ -348,31 +379,18 @@ int __bch2_fsck_err(struct bch_fs *c, : c->opts.fix_errors; if (fix == FSCK_FIX_ask) { - prt_str(out, ", "); - prt_str(out, action); - - if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s", out->buf); - else - bch2_print_string_as_lines(KERN_ERR, out->buf); print = false; - int ask = bch2_fsck_ask_yn(c, trans); - - if (trans) { - ret = bch2_trans_relock(trans); - if (ret) { - mutex_unlock(&c->fsck_error_msgs_lock); - goto err; - } - } + ret = do_fsck_ask_yn(c, trans, out, action); + if (ret < 0) + goto err_unlock; - if (ask >= YN_ALLNO && s) - s->fix = ask == YN_ALLNO + if (ret >= YN_ALLNO && s) + s->fix = ret == YN_ALLNO ? FSCK_FIX_no : FSCK_FIX_yes; - ret = ask & 1 ?
-BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; } else if (fix == FSCK_FIX_yes || @@ -385,9 +403,7 @@ int __bch2_fsck_err(struct bch_fs *c, prt_str(out, ", not "); prt_actioning(out, action); } - } else if (flags & FSCK_NEED_FSCK) { - prt_str(out, " (run fsck to correct)"); - } else { + } else if (!(flags & FSCK_CAN_IGNORE)) { prt_str(out, " (repair unimplemented)"); } @@ -396,14 +412,13 @@ int __bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - bool exiting = - test_bit(BCH_FS_fsck_running, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore); - - if (exiting) + if (test_bit(BCH_FS_fsck_running, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) { + exiting = true; print = true; - + } +print: if (print) { if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s\n", out->buf); @@ -419,17 +434,24 @@ int __bch2_fsck_err(struct bch_fs *c, if (s) s->ret = ret; - mutex_unlock(&c->fsck_error_msgs_lock); - if (inconsistent) bch2_inconsistent_error(c); - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); + /* + * We don't yet track whether the filesystem currently has errors, for + * log_fsck_err()s: that would require us to track for every error type + * which recovery pass corrects it, to get the fsck exit status correct: + */ + if (flags & FSCK_CAN_FIX) { + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_errors_fixed, &c->flags); + } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); + } } +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); err: if (action != action_orig) kfree(action); @@ -437,28 +459,52 @@ int __bch2_fsck_err(struct bch_fs *c, return ret; } +static const char * const bch2_bkey_validate_contexts[] = { +#define x(n) #n, + BKEY_VALIDATE_CONTEXTS() +#undef x + NULL +}; + int __bch2_bkey_fsck_err(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags validate_flags, + struct bkey_validate_context from, enum bch_sb_error_id err, const char *fmt, ...) 
{ - if (validate_flags & BCH_VALIDATE_silent) + if (from.flags & BCH_VALIDATE_silent) return -BCH_ERR_fsck_delete_bkey; unsigned fsck_flags = 0; - if (!(validate_flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) + if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { + if (test_bit(err, c->sb.errors_silent)) + return -BCH_ERR_fsck_delete_bkey; + fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; + } + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + fsck_flags |= fsck_flags_extra[err]; struct printbuf buf = PRINTBUF; - va_list args; + prt_printf(&buf, "invalid bkey in %s", + bch2_bkey_validate_contexts[from.from]); + + if (from.from == BKEY_VALIDATE_journal) + prt_printf(&buf, " journal seq=%llu offset=%u", + from.journal_seq, from.journal_offset); + + prt_str(&buf, " btree="); + bch2_btree_id_to_text(&buf, from.btree); + prt_printf(&buf, " level=%u: ", from.level); - prt_str(&buf, "invalid bkey "); bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, "\n "); + + va_list args; va_start(args, fmt); prt_vprintf(&buf, fmt, args); va_end(args); + prt_str(&buf, ": delete?"); int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); @@ -483,3 +529,36 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } + +int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + /* XXX: we don't yet attempt to print paths when we don't know the subvol */ + if (inum.subvol) + ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (!inum.subvol || ret) + prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + + return trans_was_restarted(trans, restart_count); +} + +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + int ret = bch2_inum_err_msg_trans(trans, out, inum); + prt_printf(out, " offset %llu: ", offset); + return ret; +} + +void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +{ + bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); +} + +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) +{ + bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 6551ada926b60..7acf2a27ca281 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -45,32 +45,11 @@ int bch2_topology_error(struct bch_fs *); bch2_inconsistent_error(c); \ }) -#define bch2_fs_inconsistent_on(cond, c, ...) \ +#define bch2_fs_inconsistent_on(cond, ...) \ ({ \ bool _ret = unlikely(!!(cond)); \ - \ - if (_ret) \ - bch2_fs_inconsistent(c, __VA_ARGS__); \ - _ret; \ -}) - -/* - * Later we might want to mark only the particular device inconsistent, not the - * entire filesystem: - */ - -#define bch2_dev_inconsistent(ca, ...) \ -do { \ - bch_err(ca, __VA_ARGS__); \ - bch2_inconsistent_error((ca)->fs); \ -} while (0) - -#define bch2_dev_inconsistent_on(cond, ca, ...) \ -({ \ - bool _ret = unlikely(!!(cond)); \ - \ if (_ret) \ - bch2_dev_inconsistent(ca, __VA_ARGS__); \ + bch2_fs_inconsistent(__VA_ARGS__); \ _ret; \ }) @@ -123,9 +102,9 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, void bch2_flush_fsck_errs(struct bch_fs *); -#define __fsck_err(c, _flags, _err_type, ...) 
\ +#define fsck_err_wrap(_do) \ ({ \ - int _ret = bch2_fsck_err(c, _flags, _err_type, __VA_ARGS__); \ + int _ret = _do; \ if (_ret != -BCH_ERR_fsck_fix && \ _ret != -BCH_ERR_fsck_ignore) { \ ret = _ret; \ @@ -135,6 +114,8 @@ void bch2_flush_fsck_errs(struct bch_fs *); _ret == -BCH_ERR_fsck_fix; \ }) +#define __fsck_err(...) fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) + /* These macros return true if error should be fixed: */ /* XXX: mark in superblock that filesystem contains errors, if we ignore: */ @@ -149,12 +130,6 @@ void bch2_flush_fsck_errs(struct bch_fs *); (unlikely(cond) ? __fsck_err(c, _flags, _err_type, __VA_ARGS__) : false);\ }) -#define need_fsck_err_on(cond, c, _err_type, ...) \ - __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) - -#define need_fsck_err(c, _err_type, ...) \ - __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, _err_type, __VA_ARGS__) - #define mustfix_fsck_err(c, _err_type, ...) \ __fsck_err(c, FSCK_CAN_FIX, _err_type, __VA_ARGS__) @@ -167,11 +142,22 @@ void bch2_flush_fsck_errs(struct bch_fs *); #define fsck_err_on(cond, c, _err_type, ...) \ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) +#define log_fsck_err(c, _err_type, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE, _err_type, __VA_ARGS__) + +#define log_fsck_err_on(cond, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + if (_ret) \ + log_fsck_err(__VA_ARGS__); \ + _ret; \ +}) + enum bch_validate_flags; __printf(5, 6) int __bch2_bkey_fsck_err(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags, + struct bkey_validate_context from, enum bch_sb_error_id, const char *, ...); @@ -181,7 +167,7 @@ int __bch2_bkey_fsck_err(struct bch_fs *, */ #define bkey_fsck_err(c, _err_type, _err_msg, ...) \ do { \ - int _ret = __bch2_bkey_fsck_err(c, k, flags, \ + int _ret = __bch2_bkey_fsck_err(c, k, from, \ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ if (_ret != -BCH_ERR_fsck_fix && \ @@ -252,4 +238,10 @@ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); _ret; \ }) +int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); +int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); + +void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); +void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 5f4fecb358da0..6aac579a692ae 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -64,7 +64,7 @@ static int count_iters_for_insert(struct btree_trans *trans, break; case KEY_TYPE_reflink_p: { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - u64 idx = le64_to_cpu(p.v->idx); + u64 idx = REFLINK_P_IDX(p.v); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); struct btree_iter iter; @@ -128,7 +128,7 @@ int bch2_extent_atomic_end(struct btree_trans *trans, bch2_trans_copy_iter(©, iter); - for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { + for_each_btree_key_max_continue_norestart(copy, insert->k.p, 0, k, ret) { unsigned offset = 0; if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 37e3d69bec06a..05d5f71a7ca9f 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -21,6 +21,7 @@ #include "extents.h" #include "inode.h" #include "journal.h" +#include "rebalance.h" #include 
"replicas.h" #include "super.h" #include "super-io.h" @@ -88,6 +89,14 @@ static inline bool ptr_better(struct bch_fs *c, u64 l1 = dev_latency(c, p1.ptr.dev); u64 l2 = dev_latency(c, p2.ptr.dev); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + l1 *= l1; + l2 *= l2; + /* Pick at random, biased in favor of the faster device: */ return bch2_rand_range(l1 + l2) > l1; @@ -169,7 +178,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -177,7 +186,7 @@ int bch2_btree_ptr_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_val_too_big, "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -189,7 +198,7 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); int ret = 0; @@ -203,12 +212,13 @@ int bch2_btree_ptr_v2_validate(struct bch_fs *c, struct bkey_s_c k, c, btree_ptr_v2_min_key_bad, "min_key > key"); - if (flags & BCH_VALIDATE_write) + if ((from.flags & BCH_VALIDATE_write) && + c->sb.version_min >= bcachefs_metadata_version_btree_ptr_sectors_written) bkey_fsck_err_on(!bp.v->sectors_written, c, btree_ptr_v2_written_0, "sectors_written == 0"); - ret = bch2_bkey_ptrs_validate(c, k, flags); + ret = bch2_bkey_ptrs_validate(c, k, from); fsck_err: return ret; } @@ -395,7 +405,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); int ret = 0; @@ -1120,6 +1130,57 @@ void bch2_extent_crc_unpacked_to_text(struct printbuf *out, struct bch_extent_cr bch2_prt_compression_type(out, crc->compression_type); } +static void bch2_extent_rebalance_to_text(struct printbuf *out, struct bch_fs *c, + const struct bch_extent_rebalance *r) +{ + prt_str(out, "rebalance:"); + + prt_printf(out, " replicas=%u", r->data_replicas); + if (r->data_replicas_from_inode) + prt_str(out, " (inode)"); + + prt_str(out, " checksum="); + bch2_prt_csum_opt(out, r->data_checksum); + if (r->data_checksum_from_inode) + prt_str(out, " (inode)"); + + if (r->background_compression || r->background_compression_from_inode) { + prt_str(out, " background_compression="); + bch2_compression_opt_to_text(out, r->background_compression); + + if (r->background_compression_from_inode) + prt_str(out, " (inode)"); + } + + if (r->background_target || r->background_target_from_inode) { + prt_str(out, " background_target="); + if (c) + bch2_target_to_text(out, c, r->background_target); + else + prt_printf(out, "%u", r->background_target); + + if (r->background_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->promote_target || r->promote_target_from_inode) { + prt_str(out, " promote_target="); + if (c) + bch2_target_to_text(out, c, r->promote_target); + else + prt_printf(out, "%u", 
r->promote_target); + + if (r->promote_target_from_inode) + prt_str(out, " (inode)"); + } + + if (r->erasure_code || r->erasure_code_from_inode) { + prt_printf(out, " ec=%u", r->erasure_code); + if (r->erasure_code_from_inode) + prt_str(out, " (inode)"); + } +} + void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { @@ -1155,18 +1216,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, (u64) ec->idx, ec->block); break; } - case BCH_EXTENT_ENTRY_rebalance: { - const struct bch_extent_rebalance *r = &entry->rebalance; - - prt_str(out, "rebalance: target "); - if (c) - bch2_target_to_text(out, c, r->target); - else - prt_printf(out, "%u", r->target); - prt_str(out, " compression "); - bch2_compression_opt_to_text(out, r->compression); + case BCH_EXTENT_ENTRY_rebalance: + bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; - } + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1178,13 +1231,19 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, static int extent_ptr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags, + struct bkey_validate_context from, const struct bch_extent_ptr *ptr, unsigned size_ondisk, bool metadata) { int ret = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr2) + bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, + c, ptr_to_duplicate_device, + "multiple pointers to same device (%u)", ptr->dev); + /* bad pointers are repaired by check_fix_ptrs(): */ rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); @@ -1199,13 +1258,6 @@ static int extent_ptr_validate(struct bch_fs *c, unsigned bucket_size = ca->mi.bucket_size; rcu_read_unlock(); - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - bkey_for_each_ptr(ptrs, ptr2) - bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, - c, ptr_to_duplicate_device, - "multiple pointers to same device (%u)", ptr->dev); - - bkey_fsck_err_on(bucket >= nbuckets, c, ptr_after_last_bucket, "pointer past last bucket (%llu > %llu)", bucket, nbuckets); @@ -1221,7 +1273,7 @@ static int extent_ptr_validate(struct bch_fs *c, } int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1248,7 +1300,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, switch (extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: - ret = extent_ptr_validate(c, k, flags, &entry->ptr, size_ondisk, false); + ret = extent_ptr_validate(c, k, from, &entry->ptr, size_ondisk, false); if (ret) return ret; @@ -1270,9 +1322,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); - bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, - c, ptr_crc_uncompressed_size_too_small, - "checksum offset + key size > uncompressed size"); bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, ptr_crc_csum_type_unknown, "invalid checksum type"); @@ -1280,6 +1329,19 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, c, ptr_crc_compression_type_unknown, "invalid compression type"); + bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, + c, ptr_crc_uncompressed_size_too_small, + "checksum offset + key size > uncompressed size"); + 
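The ptr_better() change near the top of this file squares the two devices'
measured latencies before doing a weighted random pick, so the slower device
still sees an occasional read and its latency estimate stays fresh. As a
standalone sketch (get_random_u64() standing in for bch2_rand_range(), and
assuming latencies are small enough that squaring fits in a u64):

	#include <linux/random.h>

	/* returns true if the first device should service this read */
	static bool pick_first_device(u64 l1, u64 l2)
	{
		/* squaring sharpens the bias toward the lower-latency device: */
		l1 *= l1;
		l2 *= l2;

		if (!(l1 + l2))
			return true;

		/* first device wins with probability l2 / (l1 + l2): */
		return get_random_u64() % (l1 + l2) >= l1;
	}
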
bkey_fsck_err_on(crc_is_encoded(crc) && + (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && + (from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), + c, ptr_crc_uncompressed_size_too_big, + "too large encoded extent"); + bkey_fsck_err_on(!crc_is_compressed(crc) && + crc.compressed_size != crc.uncompressed_size, + c, ptr_crc_uncompressed_size_mismatch, + "not compressed but compressed != uncompressed size"); + if (bch2_csum_type_is_encryption(crc.csum_type)) { if (nonce == UINT_MAX) nonce = crc.offset + crc.nonce; @@ -1293,12 +1355,6 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, "redundant crc entry"); crc_since_last_ptr = true; - bkey_fsck_err_on(crc_is_encoded(crc) && - (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) && - (flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit)), - c, ptr_crc_uncompressed_size_too_big, - "too large encoded extent"); - size_ondisk = crc.compressed_size; break; case BCH_EXTENT_ENTRY_stripe_ptr: @@ -1391,166 +1447,6 @@ void bch2_ptr_swab(struct bkey_s k) } } -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - - bkey_extent_entry_for_each(ptrs, entry) - if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) - return &entry->rebalance; - - return NULL; -} - -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k, - unsigned target, unsigned compression) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned rewrite_ptrs = 0; - - if (compression) { - unsigned compression_type = bch2_compression_opt_to_type(compression); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned i = 0; - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - rewrite_ptrs = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - rewrite_ptrs |= 1U << i; - i++; - } - } -incompressible: - if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { - unsigned i = 0; - - bkey_for_each_ptr(ptrs, ptr) { - if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target)) - rewrite_ptrs |= 1U << i; - i++; - } - } - - return rewrite_ptrs; -} - -bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - /* - * If it's an indirect extent, we don't delete the rebalance entry when - * done so that we know what options were applied - check if it still - * needs work done: - */ - if (r && - k.k->type == KEY_TYPE_reflink_v && - !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression)) - r = NULL; - - return r != NULL; -} - -static u64 __bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k, - unsigned target, unsigned compression) -{ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - u64 sectors = 0; - - if (compression) { - unsigned compression_type = bch2_compression_opt_to_type(compression); - - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || - p.ptr.unwritten) { - sectors = 0; - goto incompressible; - } - - if (!p.ptr.cached && p.crc.compression_type != compression_type) - sectors += p.crc.compressed_size; - } - } -incompressible: - if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) { 
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, target)) - sectors += p.crc.compressed_size; - } - - return sectors; -} - -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) -{ - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - return r ? __bch2_bkey_sectors_need_rebalance(c, k, r->target, r->compression) : 0; -} - -int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k, - struct bch_io_opts *opts) -{ - struct bkey_s k = bkey_i_to_s(_k); - struct bch_extent_rebalance *r; - unsigned target = opts->background_target; - unsigned compression = background_compression(*opts); - bool needs_rebalance; - - if (!bkey_extent_is_direct_data(k.k)) - return 0; - - /* get existing rebalance entry: */ - r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); - if (r) { - if (k.k->type == KEY_TYPE_reflink_v) { - /* - * indirect extents: existing options take precedence, - * so that we don't move extents back and forth if - * they're referenced by different inodes with different - * options: - */ - if (r->target) - target = r->target; - if (r->compression) - compression = r->compression; - } - - r->target = target; - r->compression = compression; - } - - needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression); - - if (needs_rebalance && !r) { - union bch_extent_entry *new = bkey_val_end(k); - - new->rebalance.type = 1U << BCH_EXTENT_ENTRY_rebalance; - new->rebalance.compression = compression; - new->rebalance.target = target; - new->rebalance.unused = 0; - k.k->u64s += extent_entry_u64s(new); - } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) { - /* - * For indirect extents, don't delete the rebalance entry when - * we're finished so that we know we specifically moved it or - * compressed it to its current location/compression type - */ - extent_entry_drop(k, (union bch_extent_entry *) r); - } - - return 0; -} - /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -1610,7 +1506,7 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) case KEY_TYPE_reflink_p: { struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); - le64_add_cpu(&p.v->idx, sub); + SET_REFLINK_P_IDX(p.v, REFLINK_P_IDX(p.v) + sub); break; } case KEY_TYPE_inline_data: diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index bcffcf60aaaf8..620b284aa34f0 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -8,7 +8,6 @@ struct bch_fs; struct btree_trans; -enum bch_validate_flags; /* extent entries: */ @@ -410,12 +409,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, /* KEY_TYPE_btree_ptr: */ int bch2_btree_ptr_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_btree_ptr_v2_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, int, struct bkey_s); @@ -452,7 +451,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); /* KEY_TYPE_reservation: */ int bch2_reservation_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct 
bkey_s_c); bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -696,7 +695,7 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, struct bch_extent_ptr ptr2) @@ -710,15 +709,6 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, void bch2_ptr_swab(struct bkey_s); -const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c); -unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c, - unsigned, unsigned); -bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c); -u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); - -int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *, - struct bch_io_opts *); - /* Generic extent code: */ enum bch_extent_overlap { diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index 3bd2fdbb08174..c198dfc376d6c 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -201,19 +201,8 @@ struct bch_extent_stripe_ptr { #endif }; -struct bch_extent_rebalance { -#if defined(__LITTLE_ENDIAN_BITFIELD) - __u64 type:6, - unused:34, - compression:8, /* enum bch_compression_opt */ - target:16; -#elif defined (__BIG_ENDIAN_BITFIELD) - __u64 target:16, - compression:8, - unused:34, - type:6; -#endif -}; +/* bch_extent_rebalance: */ +#include "rebalance_format.h" union bch_extent_entry { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 0541192d7bc02..13ce67a1c05af 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -12,7 +12,7 @@ #endif /* - * Traversal for trees in eytzinger layout - a full binary tree layed out in an + * Traversal for trees in eytzinger layout - a full binary tree laid out in an * array. 
* * Consider using an eytzinger tree any time you would otherwise be doing binary diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 7e10a9ddcfd96..7d279f2113122 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -69,9 +69,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!snapshot_src.inum) { /* Inode wasn't specified, just snapshot: */ struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, - BTREE_ITER_cached, &s); + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, &s); if (ret) goto err; @@ -172,6 +170,10 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_dir_offset = dir_offset; } + if (S_ISDIR(mode) && + !new_inode->bi_subvol) + new_inode->bi_depth = dir_u->bi_depth + 1; + inode_iter.flags &= ~BTREE_ITER_all_snapshots; bch2_btree_iter_set_snapshot(&inode_iter, snapshot); @@ -512,6 +514,15 @@ int bch2_rename_trans(struct btree_trans *trans, dst_dir_u->bi_nlink++; } + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; @@ -548,3 +559,84 @@ int bch2_rename_trans(struct btree_trans *trans, bch2_trans_iter_exit(trans, &src_dir_iter); return ret; } + +static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) +{ + bch2_printbuf_make_room(out, n); + + unsigned can_print = min(n, printbuf_remaining(out)); + + b += n; + + for (unsigned i = 0; i < can_print; i++) + out->buf[out->pos++] = *((char *) --b); + + printbuf_nul_terminate(out); +} + +static inline void reverse_bytes(void *b, size_t n) +{ + char *e = b + n, *s = b; + + while (s < e) { + --e; + swap(*s, *e); + s++; + } +} + +/* XXX: we don't yet attempt to print paths when we don't know the subvol */ +int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +{ + unsigned orig_pos = path->pos; + int ret = 0; + + while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && + inum.inum == BCACHEFS_ROOT_INO)) { + struct bch_inode_unpacked inode; + ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + if (ret) + goto err; + + if (!inode.bi_dir && !inode.bi_dir_offset) { + ret = -BCH_ERR_ENOENT_inode_no_backpointer; + goto err; + } + + u32 snapshot; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + struct btree_iter d_iter; + struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, + BTREE_ID_dirents, SPOS(inode.bi_dir, inode.bi_dir_offset, snapshot), + 0, dirent); + ret = bkey_err(d.s_c); + if (ret) + goto err; + + struct qstr dirent_name = bch2_dirent_get_name(d); + prt_bytes_reversed(path, dirent_name.name, dirent_name.len); + + prt_char(path, '/'); + + if (d.v->d_type == DT_SUBVOL) + inum.subvol = le32_to_cpu(d.v->d_parent_subvol); + inum.inum = d.k->p.inode; + + bch2_trans_iter_exit(trans, &d_iter); + } + + if (orig_pos == path->pos) + prt_char(path, '/'); + + ret = path->allocation_failure ? 
-ENOMEM : 0; + if (ret) + goto err; + + reverse_bytes(path->buf + orig_pos, path->pos - orig_pos); + return 0; +err: + return ret; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index c934e807b380f..2b59210bb5e8a 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -42,4 +42,6 @@ int bch2_rename_trans(struct btree_trans *, bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); +int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); + #endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 95972809e76d7..ab1d5db2fa56a 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -164,7 +164,8 @@ static void bchfs_read(struct btree_trans *trans, BTREE_ITER_slots); while (1) { struct bkey_s_c k; - unsigned bytes, sectors, offset_into_extent; + unsigned bytes, sectors; + s64 offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -197,7 +198,7 @@ static void bchfs_read(struct btree_trans *trans, k = bkey_i_to_s_c(sk.k); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); if (readpages_iter) { ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, @@ -230,10 +231,12 @@ static void bchfs_read(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); if (ret) { - bch_err_inum_offset_ratelimited(c, - iter.pos.inode, - iter.pos.offset << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bio_endio(&rbio->bio); } @@ -248,6 +251,7 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_io_opts opts; struct folio *folio; struct readpages_iter readpages_iter; + struct blk_plug plug; bch2_inode_opts_get(&opts, c, &inode->ei_inode); @@ -255,6 +259,16 @@ void bch2_readahead(struct readahead_control *ractl) if (ret) return; + /* + * Besides being a general performance optimization, plugging helps with + * avoiding btree transaction srcu warnings - submitting a bio can + * block, and we don't want to do that with the transaction locked. + * + * However, plugged bios are submitted when we schedule; we ideally + * would have our own scheduler hook to call unlock_long() before + * scheduling. 
+ */ + blk_start_plug(&plug); bch2_pagecache_add_get(inode); struct btree_trans *trans = bch2_trans_get(c); @@ -281,7 +295,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_put(trans); bch2_pagecache_add_put(inode); - + blk_finish_plug(&plug); darray_exit(&readpages_iter.folios); } @@ -296,9 +310,13 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_read_bio *rbio; struct bch_io_opts opts; + struct blk_plug plug; int ret; DECLARE_COMPLETION_ONSTACK(done); + BUG_ON(folio_test_uptodate(folio)); + BUG_ON(folio_test_dirty(folio)); + if (!bch2_folio_create(folio, GFP_KERNEL)) return -ENOMEM; @@ -313,7 +331,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + blk_start_plug(&plug); bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0)); + blk_finish_plug(&plug); wait_for_completion(&done); ret = blk_status_to_errno(rbio->bio.bi_status); @@ -605,15 +625,6 @@ static int __bch2_writepage(struct folio *folio, BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, sectors << 9, offset << 9)); - /* Check for writing past i_size: */ - WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > - round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_emergency_ro, &c->flags), - "writing past i_size: %llu > %llu (unrounded %llu)\n", - bio_end_sector(&w->io->op.wbio.bio) << 9, - round_up(i_size, block_bytes(c)), - i_size); - w->io->op.res.sectors += reserved_sectors; w->io->op.i_sectors_delta -= dirty_sectors; w->io->op.new_i_size = i_size; @@ -669,7 +680,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, FGP_WRITEBEGIN | fgf_set_order(len), mapping_gfp_mask(mapping)); - if (IS_ERR_OR_NULL(folio)) + if (IS_ERR(folio)) goto err_unlock; offset = pos - folio_pos(folio); diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 6d3a05ae5da84..2089c36b5866d 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -70,6 +70,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct bch_io_opts opts; struct dio_read *dio; struct bio *bio; + struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); size_t shorten; @@ -128,6 +129,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) */ dio->should_dirty = iter_is_iovec(iter); + blk_start_plug(&plug); + goto start; while (iter->count) { bio = bio_alloc_bioset(NULL, @@ -160,6 +163,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); } + blk_finish_plug(&plug); + iter->count += shorten; if (sync) { diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index 1d4910ea0f1d6..e072900e6a5b9 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -29,7 +29,7 @@ int bch2_filemap_get_contig_folios_d(struct address_space *mapping, break; f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); - if (IS_ERR_OR_NULL(f)) + if (IS_ERR(f)) break; BUG_ON(fs->nr && folio_pos(f) != pos); @@ -199,7 +199,7 @@ int bch2_folio_set(struct bch_fs *c, subvol_inum inum, unsigned folio_idx = 0; return bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + 
for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inum.inum, offset), POS(inum.inum, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 2456c41b215ee..92acf598f0573 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -167,6 +167,34 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, /* fsync: */ +static int bch2_get_inode_journal_seq_trans(struct btree_trans *trans, subvol_inum inum, + u64 *seq) +{ + struct printbuf buf = PRINTBUF; + struct bch_inode_unpacked u; + struct btree_iter iter; + int ret = bch2_inode_peek(trans, &iter, &u, inum, 0); + if (ret) + return ret; + + u64 cur_seq = journal_cur_seq(&trans->c->journal); + *seq = min(cur_seq, u.bi_journal_seq); + + if (fsck_err_on(u.bi_journal_seq > cur_seq, + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + cur_seq, + (bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = cur_seq; + ret = bch2_inode_write(trans, &iter, &u); + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + /* * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an * insert trigger: look up the btree inode instead @@ -180,9 +208,10 @@ static int bch2_flush_inode(struct bch_fs *c, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) return -EROFS; - struct bch_inode_unpacked u; - int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?: - bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?: + u64 seq; + int ret = bch2_trans_commit_do(c, NULL, NULL, 0, + bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: + bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); bch2_write_ref_put(c, BCH_WRITE_REF_fsync); return ret; @@ -222,7 +251,7 @@ static inline int range_has_data(struct bch_fs *c, u32 subvol, struct bpos end) { return bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, start, end, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, start, end, subvol, 0, k, ({ bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k); }))); @@ -256,7 +285,7 @@ static int __bch2_truncate_folio(struct bch_inode_info *inode, folio = __filemap_get_folio(mapping, index, FGP_LOCK|FGP_CREAT, GFP_KERNEL); - if (IS_ERR_OR_NULL(folio)) { + if (IS_ERR(folio)) { ret = -ENOMEM; goto out; } @@ -806,7 +835,7 @@ static int quota_reserve_range(struct bch_inode_info *inode, u64 sectors = end - start; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, start), POS(inode->v.i_ino, end - 1), @@ -877,16 +906,23 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, bch2_mark_pagecache_unallocated(src, pos_src >> 9, (pos_src + aligned_len) >> 9); + /* + * XXX: we'd like to be telling bch2_remap_range() if we have + * permission to write to the source file, and thus if io path option + * changes should be propagated through the copy, but we need mnt_idmap + * from the pathwalk, awkward + */ ret = bch2_remap_range(c, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, - pos_dst + len, &i_sectors_delta); + pos_dst + len, &i_sectors_delta, + false); if (ret < 0) goto err; /* - * due to alignment, we might have remapped slightly more than requsted + * due to alignment, we 
might have remapped slightly more than requested */ ret = min((u64) ret << 9, (u64) len); @@ -922,7 +958,7 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, 0, k, ({ @@ -958,7 +994,7 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_extents, POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 405cf08bda347..15725b4ce393c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -406,7 +406,7 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, sync_inodes_sb(c->vfs_sb); up_read(&c->vfs_sb->s_umount); } -retry: + if (arg.src_ptr) { error = user_path_at(arg.dirfd, (const char __user *)(unsigned long)arg.src_ptr, @@ -486,11 +486,6 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, err2: if (arg.src_ptr) path_put(&src_path); - - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } err1: return error; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a41d0d8a2f7be..55b38bc7f67b8 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -23,6 +23,7 @@ #include "journal.h" #include "keylist.h" #include "quota.h" +#include "rebalance.h" #include "snapshot.h" #include "super.h" #include "xattr.h" @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -89,10 +91,25 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(trans); - ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), - BTREE_ITER_intent) ?: - (set ? set(trans, inode, &inode_u, p) : 0) ?: - bch2_inode_write(trans, &iter, &inode_u) ?: + ret = bch2_inode_peek(trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_intent); + if (ret) + goto err; + + struct bch_extent_rebalance old_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + ret = (set ? 
set(trans, inode, &inode_u, p) : 0); + if (ret) + goto err; + + struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + + if (memcmp(&old_r, &new_r, sizeof(new_r))) { + ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); + if (ret) + goto err; + } + + ret = bch2_inode_write(trans, &iter, &inode_u) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); /* @@ -101,7 +118,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, */ if (!ret) bch2_inode_update_after_write(trans, inode, &inode_u, fields); - +err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -160,8 +177,9 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b) static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; + siphash_key_t k = { .key[0] = seed }; - return jhash(&inum->inum, sizeof(inum->inum), seed); + return siphash_2u64(inum->subvol, inum->inum, &k); } static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed) @@ -190,11 +208,18 @@ static const struct rhashtable_params bch2_vfs_inodes_params = { .automatic_shrinking = true, }; +static const struct rhashtable_params bch2_vfs_inodes_by_inum_params = { + .head_offset = offsetof(struct bch_inode_info, by_inum_hash), + .key_offset = offsetof(struct bch_inode_info, ei_inum.inum), + .key_len = sizeof(u64), + .automatic_shrinking = true, +}; + int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { struct bch_fs *c = trans->c; - struct rhashtable *ht = &c->vfs_inodes_table; - subvol_inum inum = (subvol_inum) { .inum = p.offset }; + struct rhltable *ht = &c->vfs_inodes_by_inum_table; + u64 inum = p.offset; DARRAY(u32) subvols; int ret = 0; @@ -219,15 +244,15 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) struct rhash_lock_head __rcu *const *bkt; struct rhash_head *he; unsigned int hash; - struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht); + struct bucket_table *tbl = rht_dereference_rcu(ht->ht.tbl, &ht->ht); restart: - hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params); + hash = rht_key_hashfn(&ht->ht, tbl, &inum, bch2_vfs_inodes_by_inum_params); bkt = rht_bucket(tbl, hash); do { struct bch_inode_info *inode; rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) { - if (inode->ei_inum.inum == inum.inum) { + if (inode->ei_inum.inum == inum) { ret = darray_push_gfp(&subvols, inode->ei_inum.subvol, GFP_NOWAIT|__GFP_NOWARN); if (ret) { @@ -248,7 +273,7 @@ int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) /* Ensure we see any new tables. 
*/ smp_rmb(); - tbl = rht_dereference_rcu(tbl->future_tbl, ht); + tbl = rht_dereference_rcu(tbl->future_tbl, &ht->ht); if (unlikely(tbl)) goto restart; rcu_read_unlock(); @@ -327,7 +352,11 @@ static void bch2_inode_hash_remove(struct bch_fs *c, struct bch_inode_info *inod spin_unlock(&inode->v.i_lock); if (remove) { - int ret = rhashtable_remove_fast(&c->vfs_inodes_table, + int ret = rhltable_remove(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + + ret = rhashtable_remove_fast(&c->vfs_inodes_table, &inode->hash, bch2_vfs_inodes_params); BUG_ON(ret); inode->v.i_hash.pprev = NULL; @@ -372,6 +401,11 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, discard_new_inode(&inode->v); return old; } else { + int ret = rhltable_insert(&c->vfs_inodes_by_inum_table, + &inode->by_inum_hash, + bch2_vfs_inodes_by_inum_params); + BUG_ON(ret); + inode_fake_hash(&inode->v); inode_sb_list_add(&inode->v); @@ -383,14 +417,6 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c, } } -#define memalloc_flags_do(_flags, _do) \ -({ \ - unsigned _saved_flags = memalloc_flags_save(_flags); \ - typeof(_do) _ret = _do; \ - memalloc_noreclaim_restore(_saved_flags); \ - _ret; \ -}) - static struct inode *bch2_alloc_inode(struct super_block *sb) { BUG(); @@ -465,7 +491,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_unpacked inode_u; struct bch_subvolume subvol; int ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); bch2_trans_put(trans); @@ -535,8 +561,7 @@ __bch2_create(struct mnt_idmap *idmap, inum.subvol = inode_u.bi_subvol ?: dir->ei_inum.subvol; inum.inum = inode_u.bi_inum; - ret = bch2_subvolume_get(trans, inum.subvol, true, - BTREE_ITER_with_updates, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_trans_commit(trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, @@ -617,7 +642,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; - ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); @@ -628,7 +653,7 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, goto err; /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_bp(&inode_u) && + if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), c, "dirent points to inode that does not point back:\n %s", @@ -1245,7 +1270,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter iter; struct bkey_s_c k; struct bkey_buf cur, prev; - unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1278,7 +1302,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, end); + k = 
bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) continue; @@ -1292,9 +1316,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, continue; } - offset_into_extent = iter.pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&cur, c, k); @@ -1306,7 +1329,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, k = bkey_i_to_s_c(cur.k); bch2_bkey_buf_realloc(&prev, c, k.k->u64s); - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + @@ -1736,7 +1759,6 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; - inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; @@ -2200,7 +2222,8 @@ static int bch2_fs_get_tree(struct fs_context *fc) sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); - sb->s_uuid = c->sb.user_uuid; + super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); + super_set_sysfs_name_uuid(sb); sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -2345,13 +2368,16 @@ static int bch2_init_fs_context(struct fs_context *fc) void bch2_fs_vfs_exit(struct bch_fs *c) { + if (c->vfs_inodes_by_inum_table.ht.tbl) + rhltable_destroy(&c->vfs_inodes_by_inum_table); if (c->vfs_inodes_table.tbl) rhashtable_destroy(&c->vfs_inodes_table); } int bch2_fs_vfs_init(struct bch_fs *c) { - return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params); + return rhashtable_init(&c->vfs_inodes_table, &bch2_vfs_inodes_params) ?: + rhltable_init(&c->vfs_inodes_by_inum_table, &bch2_vfs_inodes_by_inum_params); } static struct file_system_type bcache_fs_type = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 59f9f7ae728d2..7a3dba8fefa0f 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -14,6 +14,7 @@ struct bch_inode_info { struct inode v; struct rhash_head hash; + struct rhlist_head by_inum_hash; subvol_inum ei_inum; struct list_head ei_vfs_inode_list; @@ -33,7 +34,7 @@ struct bch_inode_info { * * XXX: a device may have had a flush issued by some other codepath. It * would be better to keep for each device a sequence number that's - * incremented when we isusue a cache flush, and track here the sequence + * incremented when we issue a cache flush, and track here the sequence * number that needs flushing. 
*/ struct bch_devs_mask ei_devs_need_flush; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 75c8a97a6954c..f50670cd4ce72 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bcachefs_ioctl.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" @@ -16,6 +17,7 @@ #include "recovery_passes.h" #include "snapshot.h" #include "super.h" +#include "thread_with_file.h" #include "xattr.h" #include @@ -73,7 +75,7 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, { u64 sectors = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -90,7 +92,7 @@ static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, { u64 subdirs = 0; - int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_dirents, SPOS(inum, 0, snapshot), POS(inum, U64_MAX), 0, k, ({ @@ -107,7 +109,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, u32 *snapshot, u64 *inum) { struct bch_subvolume s; - int ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + int ret = bch2_subvolume_get(trans, subvol, false, &s); *snapshot = le32_to_cpu(s.snapshot); *inum = le64_to_cpu(s.inode); @@ -170,7 +172,7 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, if (ret) return ret; - struct bkey_s_c_dirent d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); *target = le64_to_cpu(d.v->d_inum); *type = d.v->d_type; bch2_trans_iter_exit(trans, &iter); @@ -210,6 +212,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); + struct btree_iter lostfound_iter = { NULL }; u64 inum = 0; unsigned d_type = 0; int ret; @@ -223,8 +226,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) }; struct bch_subvolume subvol; - ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol), false, &subvol); bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u", le32_to_cpu(st.master_subvol), snapshot); if (ret) @@ -288,11 +290,16 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * XXX: we could have a nicer log message here if we had a nice way to * walk backpointers to print a path */ - bch_notice(c, "creating lost+found in subvol %llu snapshot %u", - root_inum.subvol, le32_to_cpu(st.root_snapshot)); + struct printbuf path = PRINTBUF; + ret = bch2_inum_to_path(trans, root_inum, &path); + if (ret) + goto err; + + bch_notice(c, "creating %s/lost+found in subvol %llu snapshot %u", + path.buf, root_inum.subvol, snapshot); + printbuf_exit(&path); u64 now = bch2_current_time(c); - struct btree_iter lostfound_iter = { NULL }; u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); @@ -451,7 +458,9 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * continue; struct bch_inode_unpacked child_inode; - bch2_inode_unpack(k, &child_inode); + ret = bch2_inode_unpack(k, &child_inode); + if (ret) + break; if (!inode_should_reattach(&child_inode)) { ret = maybe_delete_dirent(trans, @@ -482,6 +491,13 @@ 
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * return ret; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + static int remove_backpointer(struct btree_trans *trans, struct bch_inode_unpacked *inode) { @@ -490,13 +506,11 @@ static int remove_backpointer(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c_dirent d = - bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, - SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot), 0, - dirent); - int ret = bkey_err(d) ?: - dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + struct bkey_s_c_dirent d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); + int ret = bkey_err(d) ?: + dirent_points_to_inode(c, d, inode) ?: + __remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -613,7 +627,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct btree_iter iter = {}; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); bch2_trans_iter_exit(trans, &iter); int ret = bkey_err(k); if (ret) @@ -714,7 +728,7 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see /* * We know that @id is a descendant of @ancestor, we're checking if * we've seen a key that overwrote @ancestor - i.e. also a descendent of - * @ascestor and with @id as a descendent. + * @ancestor and with @id as a descendent. 
* * But we already know that we're scanning IDs between @id and @ancestor * numerically, since snapshot ID lists are kept sorted, so if we find @@ -797,9 +811,8 @@ static int add_inode(struct bch_fs *c, struct inode_walker *w, { struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(inode, &u)); - - return darray_push(&w->inodes, ((struct inode_walker_entry) { + return bch2_inode_unpack(inode, &u) ?: + darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = inode.k->p.snapshot, })); @@ -929,69 +942,16 @@ static int get_visible_inodes(struct btree_trans *trans, return ret; } -static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) -{ - if (d.v->d_type == DT_SUBVOL) { - u32 snap; - u64 inum; - int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; - return !ret; - } else { - struct btree_iter iter; - struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); - int ret = bkey_err(k); - if (ret) - return ret; - - ret = bkey_is_inode(k.k); - bch2_trans_iter_exit(trans, &iter); - return ret; - } -} - /* * Prefer to delete the first one, since that will be the one at the wrong * offset: * return value: 0 -> delete k1, 1 -> delete k2 */ -static int hash_pick_winner(struct btree_trans *trans, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c k1, - struct bkey_s_c k2) -{ - if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && - !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) - return 0; - - switch (desc.btree_id) { - case BTREE_ID_dirents: { - int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1)); - if (ret < 0) - return ret; - if (!ret) - return 0; - - ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2)); - if (ret < 0) - return ret; - if (!ret) - return 1; - return 2; - } - default: - return 0; - } -} - -static int fsck_update_backpointers(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_i *new) +int bch2_fsck_update_backpointers(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_i *new) { if (new->k.type != KEY_TYPE_dirent) return 0; @@ -1019,160 +979,6 @@ static int fsck_update_backpointers(struct btree_trans *trans, return ret; } -static int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) -{ - struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); - int ret = PTR_ERR_OR_ZERO(new); - if (ret) - return ret; - - bkey_dirent_init(&new->k_i); - dirent_copy_target(new, old); - new->k.p = old.k->p; - - for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); - - if (u64s > U8_MAX) - return -EINVAL; - - new->k.u64s = u64s; - - ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - (subvol_inum) { 0, old.k->p.inode }, - old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) - break; - } - - if (ret) - return ret; - - return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); -} - 
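/*
 * Editor's sketch, not part of the patch: hash_check_key() below and
 * fsck_rename_dirent()/hash_pick_winner() above are being removed from
 * fsck.c because this repair logic now lives behind
 * bch2_str_hash_check_key() (see the check_dirent() and check_xattr()
 * hunks further down). For orientation: dirents and xattrs are stored
 * at a btree offset derived from hashing the name, with collisions
 * resolved by linear probing - so a key found *before* its hash value
 * is always at the wrong offset, while one found after it may simply
 * have been displaced by a collision. A self-contained toy model of
 * that lookup, with invented toy_* names:
 */
#include <stdint.h>
#include <string.h>

#define NR_SLOTS 128

static const char *slots[NR_SLOTS];	/* NULL = empty slot */

static uint64_t toy_hash(const char *name)
{
	uint64_t h = 5381;

	while (*name)
		h = h * 33 + (unsigned char) *name++;
	return h % NR_SLOTS;
}

static int toy_lookup(const char *name)
{
	for (uint64_t i = toy_hash(name); i < NR_SLOTS; i++) {
		if (!slots[i])
			return -1;	/* hole: name can't exist past it */
		if (!strcmp(slots[i], name))
			return i;	/* may sit at i > toy_hash(name) */
	}
	return -1;
}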
-static int hash_check_key(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; - struct printbuf buf = PRINTBUF; - struct bkey_s_c k; - u64 hash; - int ret = 0; - - if (hash_k.k->type != desc.key_type) - return 0; - - hash = desc.hash_bkey(hash_info, hash_k); - - if (likely(hash == hash_k.k->p.offset)) - return 0; - - if (hash_k.k->p.offset < hash) - goto bad_hash; - - for_each_btree_key_norestart(trans, iter, desc.btree_id, - SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { - if (bkey_eq(k.k->p, hash_k.k->p)) - break; - - if (k.k->type == desc.key_type && - !desc.cmp_bkey(k, hash_k)) - goto duplicate_entries; - - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); - goto bad_hash; - } - } -out: - bch2_trans_iter_exit(trans, &iter); - printbuf_exit(&buf); - return ret; -bad_hash: - if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", - bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - fsck_update_backpointers(trans, s, desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; - goto out; -} - -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -1260,7 +1066,7 @@ static int get_snapshot_root_inode(struct btree_trans *trans, goto err; BUG(); found_root: - BUG_ON(bch2_inode_unpack(k, root)); + ret = bch2_inode_unpack(k, root); err: bch2_trans_iter_exit(trans, &iter); return ret; @@ -1291,7 +1097,9 @@ static int check_inode(struct btree_trans *trans, if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + goto err; if (snapshot_root->bi_inum != u.bi_inum) { ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); @@ -1302,7 +1110,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), trans, inode_snapshot_mismatch, - "inodes in different snapshots don't match")) { + "inode hash info in different snapshots don't match")) { u.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); do_update = true; @@ -1392,7 +1200,7 @@ static int check_inode(struct btree_trans *trans, if (fsck_err_on(!ret, trans, inode_unlinked_and_not_open, - "inode %llu%u unlinked and not open", + "inode %llu:%u unlinked and not open", u.bi_inum, u.bi_snapshot)) { ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck deleting inode"); @@ -1415,7 +1223,7 @@ static int check_inode(struct btree_trans *trans, if (u.bi_subvol) { struct bch_subvolume s; - ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + ret = bch2_subvolume_get(trans, u.bi_subvol, false, &s); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -1441,6 +1249,17 @@ static int check_inode(struct btree_trans *trans, do_update = true; } } + + if (fsck_err_on(u.bi_journal_seq > journal_cur_seq(&c->journal), + trans, inode_journal_seq_in_future, + "inode journal seq in future (currently at %llu)\n%s", + journal_cur_seq(&c->journal), + (printbuf_reset(&buf), + bch2_inode_unpacked_to_text(&buf, &u), + buf.buf))) { + u.bi_journal_seq = journal_cur_seq(&c->journal); + do_update = true; + } do_update: if (do_update) { ret = __bch2_fsck_write_inode(trans, &u); @@ -1502,7 +1321,9 @@ static int find_oldest_inode_needs_reattach(struct btree_trans *trans, break; struct bch_inode_unpacked parent_inode; - bch2_inode_unpack(k, &parent_inode); + ret = bch2_inode_unpack(k, &parent_inode); + if (ret) + break; if (!inode_should_reattach(&parent_inode)) break; @@ -1525,7 +1346,9 @@ static int check_unreachable_inode(struct btree_trans *trans, return 0; struct 
bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + ret = bch2_inode_unpack(k, &inode); + if (ret) + return ret; if (!inode_should_reattach(&inode)) return 0; @@ -1649,7 +1472,7 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", w->last_pos.inode, i->snapshot, i->count, count2); - return -BCH_ERR_internal_fsck_err; + i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), @@ -1753,7 +1576,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter1, btree, pos1, BTREE_ITER_all_snapshots| BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); + k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; @@ -1778,7 +1601,7 @@ static int overlapping_extents_found(struct btree_trans *trans, while (1) { bch2_btree_iter_advance(&iter2); - k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; @@ -2156,7 +1979,7 @@ static int check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } - if (bch2_inode_should_have_bp(target) && + if (bch2_inode_should_have_single_bp(target) && !fsck_err(trans, inode_wrong_backpointer, "dirent points to inode that does not point back:\n %s", (bch2_bkey_val_to_text(&buf, c, d.s_c), @@ -2385,7 +2208,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, trans, subvol_fs_path_parent_wrong, - "subvol with wrong fs_path_parent, should be be %u\n%s", + "subvol with wrong fs_path_parent, should be %u\n%s", parent_subvol, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { struct bkey_i_subvolume *n = @@ -2480,7 +2303,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); if (ret < 0) goto err; if (ret) { @@ -2594,7 +2417,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); inode->first_this_inode = false; - ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k); + ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); bch_err_fn(c, ret); return ret; } @@ -2774,6 +2597,48 @@ struct pathbuf_entry { typedef DARRAY(struct pathbuf_entry) pathbuf; +static int bch2_bi_depth_renumber_one(struct btree_trans *trans, struct pathbuf_entry *p, + u32 new_depth) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, p->inum, p->snapshot), 0); + + struct bch_inode_unpacked inode; + int ret = bkey_err(k) ?: + !bkey_is_inode(k.k) ? 
-BCH_ERR_ENOENT_inode + : bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + if (inode.bi_depth != new_depth) { + inode.bi_depth = new_depth; + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_bi_depth_renumber(struct btree_trans *trans, pathbuf *path, u32 new_bi_depth) +{ + u32 restart_count = trans->restart_count; + int ret = 0; + + darray_for_each_reverse(*path, i) { + ret = nested_lockrestart_do(trans, + bch2_bi_depth_renumber_one(trans, i, new_bi_depth)); + bch_err_fn(trans->c, ret); + if (ret) + break; + + new_bi_depth++; + } + + return ret ?: trans_was_restarted(trans, restart_count); +} + static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { darray_for_each(*p, i) @@ -2783,21 +2648,21 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) +static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; struct btree_iter inode_iter = {}; - struct bch_inode_unpacked inode; + pathbuf path = {}; struct printbuf buf = PRINTBUF; u32 snapshot = inode_k.k->p.snapshot; + bool redo_bi_depth = false; + u32 min_bi_depth = U32_MAX; int ret = 0; - p->nr = 0; - - BUG_ON(bch2_inode_unpack(inode_k, &inode)); - - if (!S_ISDIR(inode.bi_mode)) - return 0; + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(inode_k, &inode); + if (ret) + return ret; while (!inode.bi_subvol) { struct btree_iter dirent_iter; @@ -2807,7 +2672,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - break; + goto out; if (!ret && (ret = dirent_points_to_inode(c, d, &inode))) bch2_trans_iter_exit(trans, &dirent_iter); @@ -2822,7 +2687,7 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &dirent_iter); - ret = darray_push(p, ((struct pathbuf_entry) { + ret = darray_push(&path, ((struct pathbuf_entry) { .inum = inode.bi_inum, .snapshot = snapshot, })); @@ -2834,22 +2699,32 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino bch2_trans_iter_exit(trans, &inode_iter); inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, inode.bi_dir, snapshot), 0); + + struct bch_inode_unpacked parent_inode; ret = bkey_err(inode_k) ?: !bkey_is_inode(inode_k.k) ? 
-BCH_ERR_ENOENT_inode - : bch2_inode_unpack(inode_k, &inode); + : bch2_inode_unpack(inode_k, &parent_inode); if (ret) { /* Should have been caught in dirents pass */ bch_err_msg(c, ret, "error looking up parent directory"); - break; + goto out; } + min_bi_depth = parent_inode.bi_depth; + + if (parent_inode.bi_depth < inode.bi_depth && + min_bi_depth < U16_MAX) + break; + + inode = parent_inode; snapshot = inode_k.k->p.snapshot; + redo_bi_depth = true; - if (path_is_dup(p, inode.bi_inum, snapshot)) { + if (path_is_dup(&path, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); - darray_for_each(*p, i) + darray_for_each(path, i) pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode.bi_inum, snapshot); @@ -2862,12 +2737,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino ret = reattach_inode(trans, &inode); bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); } - break; + + goto out; } } + + if (inode.bi_subvol) + min_bi_depth = 0; + + if (redo_bi_depth) + ret = bch2_bi_depth_renumber(trans, &path, min_bi_depth); out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); + darray_exit(&path); printbuf_exit(&buf); bch_err_fn(c, ret); return ret; @@ -2879,24 +2762,20 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino */ int bch2_check_directory_structure(struct bch_fs *c) { - pathbuf path = { 0, }; - int ret; - - ret = bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN, BTREE_ITER_intent| BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - if (!bkey_is_inode(k.k)) + if (!S_ISDIR(bkey_inode_mode(k))) continue; if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, k); + check_path_loop(trans, k); }))); - darray_exit(&path); bch_err_fn(c, ret); return ret; @@ -2994,7 +2873,9 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c, /* Should never fail, checked by bch2_inode_invalid: */ struct bch_inode_unpacked u; - BUG_ON(bch2_inode_unpack(k, &u)); + _ret3 = bch2_inode_unpack(k, &u); + if (_ret3) + break; /* * Backpointer and directory structure checks are sufficient for @@ -3072,7 +2953,9 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite if (!bkey_is_inode(k.k)) return 0; - BUG_ON(bch2_inode_unpack(k, &u)); + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; if (S_ISDIR(u.bi_mode)) return 0; @@ -3194,3 +3077,223 @@ int bch2_fix_reflink_p(struct bch_fs *c) bch_err_fn(c, ret); return ret; } + +#ifndef NO_BCACHEFS_CHARDEV + +struct fsck_thread { + struct thread_with_stdio thr; + struct bch_fs *c; + struct bch_opts opts; +}; + +static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) +{ + struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr); + kfree(thr); +} + +static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + int ret = PTR_ERR_OR_ZERO(c); + if (ret) + return ret; + + ret = bch2_fs_start(thr->c); + if (ret) + goto err; + + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + ret |= 1; + } + if (test_bit(BCH_FS_error, &c->flags)) { + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + ret |= 4; + } +err: + bch2_fs_stop(c); + return ret; +} 
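/*
 * Editor's note, not part of the patch: the non-negative return value
 * built up above follows the fsck(8) exit-code convention - bit 0 set
 * means errors were corrected, bit 2 set means errors remain - so a
 * caller that receives the thread's status could decode it like this
 * (sketch; report_fsck_exit() is a hypothetical helper):
 */
#include <stdio.h>

static void report_fsck_exit(int ret)
{
	if (ret < 0) {
		printf("fsck failed to run: %d\n", ret);
		return;
	}
	if (ret & 1)
		printf("errors were fixed\n");
	if (ret & 4)
		printf("errors remain uncorrected\n");
	if (!ret)
		printf("filesystem is clean\n");
}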
+ +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) +{ + struct bch_ioctl_fsck_offline arg; + struct fsck_thread *thr = NULL; + darray_str(devs) = {}; + long ret = 0; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + for (size_t i = 0; i < arg.nr_devs; i++) { + u64 dev_u64; + ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64)); + if (ret) + goto err; + + char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX); + ret = PTR_ERR_OR_ZERO(dev_str); + if (ret) + goto err; + + ret = darray_push(&devs, dev_str); + if (ret) { + kfree(dev_str); + goto err; + } + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); + opt_set(thr->opts, read_only, 1); + opt_set(thr->opts, ratelimit_errors, 0); + + /* We need request_key() to be called before we punt to kthread: */ + opt_set(thr->opts, nostart, true); + + bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); + + thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + + if (!IS_ERR(thr->c) && + thr->c->opts.errors == BCH_ON_ERROR_panic) + thr->c->opts.errors = BCH_ON_ERROR_ro; + + ret = __bch2_run_thread_with_stdio(&thr->thr); +out: + darray_for_each(devs, i) + kfree(*i); + darray_exit(&devs); + return ret; +err: + if (thr) + bch2_fsck_thread_exit(&thr->thr); + pr_err("ret %s", bch2_err_str(ret)); + goto out; +} + +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) +{ + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); + struct bch_fs *c = thr->c; + + c->stdio_filter = current; + c->stdio = &thr->thr.stdio; + + /* + * XXX: can we figure out a way to do this without mucking with c->opts? 
+ */ + unsigned old_fix_errors = c->opts.fix_errors; + if (opt_defined(thr->opts, fix_errors)) + c->opts.fix_errors = thr->opts.fix_errors; + else + c->opts.fix_errors = FSCK_FIX_ask; + + c->opts.fsck = true; + set_bit(BCH_FS_fsck_running, &c->flags); + + c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; + int ret = bch2_run_online_recovery_passes(c); + + clear_bit(BCH_FS_fsck_running, &c->flags); + bch_err_fn(c, ret); + + c->stdio = NULL; + c->stdio_filter = NULL; + c->opts.fix_errors = old_fix_errors; + + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + return ret; +} + +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + +long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) +{ + struct fsck_thread *thr = NULL; + long ret = 0; + + if (arg.flags) + return -EINVAL; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!bch2_ro_ref_tryget(c)) + return -EROFS; + + if (down_trylock(&c->online_fsck_mutex)) { + bch2_ro_ref_put(c); + return -EAGAIN; + } + + thr = kzalloc(sizeof(*thr), GFP_KERNEL); + if (!thr) { + ret = -ENOMEM; + goto err; + } + + thr->c = c; + thr->opts = bch2_opts_empty(); + + if (arg.opts) { + char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); + + ret = PTR_ERR_OR_ZERO(optstr) ?: + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); + if (!IS_ERR(optstr)) + kfree(optstr); + + if (ret) + goto err; + } + + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); +err: + if (ret < 0) { + bch_err_fn(c, ret); + if (thr) + bch2_fsck_thread_exit(&thr->thr); + up(&c->online_fsck_mutex); + bch2_ro_ref_put(c); + } + return ret; +} + +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 1cca310115309..574948278cd4d 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -2,6 +2,14 @@ #ifndef _BCACHEFS_FSCK_H #define _BCACHEFS_FSCK_H +#include "str_hash.h" + +int bch2_fsck_update_backpointers(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc, + struct bch_hash_info *, + struct bkey_i *); + int bch2_check_inodes(struct bch_fs *); int bch2_check_extents(struct bch_fs *); int bch2_check_indirect_extents(struct bch_fs *); @@ -14,4 +22,7 @@ int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); +long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *); +long bch2_ioctl_fsck_online(struct bch_fs *, struct bch_ioctl_fsck_online); + #endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 039cb7a22244d..3a08742a8cda6 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -14,6 +14,7 @@ #include "extent_update.h" #include "fs.h" #include "inode.h" +#include "opts.h" #include "str_hash.h" #include "snapshot.h" #include "subvolume.h" @@ -47,10 +48,10 @@ static int inode_decode_field(const u8 *in, const u8 *end, u8 *p; if (in >= end) - return -1; + return -BCH_ERR_inode_unpack_error; if (!*in) - return -1; + return -BCH_ERR_inode_unpack_error; /* * position of highest set bit indicates number of bytes: @@ -60,7 +61,7 @@ static int inode_decode_field(const u8 *in, const u8 *end, bytes = byte_table[shift - 1]; if (in + bytes > end) - return -1; + return -BCH_ERR_inode_unpack_error; p = (u8 *) be + 16 - bytes; memcpy(p, in, bytes); @@ -176,7 +177,7 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, return ret; \ 
\ if (field_bits > sizeof(unpacked->_name) * 8) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ \ unpacked->_name = field[1]; \ in += ret; @@ -217,7 +218,7 @@ static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v2() @@ -268,7 +269,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, \ unpacked->_name = v[0]; \ if (v[1] || v[0] != unpacked->_name) \ - return -1; \ + return -BCH_ERR_inode_unpack_error; \ fieldnr++; BCH_INODE_FIELDS_v3() @@ -428,7 +429,7 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) } static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bch_inode_unpacked unpacked; int ret = 0; @@ -468,7 +469,7 @@ static int __bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, } int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); int ret = 0; @@ -478,13 +479,13 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); int ret = 0; @@ -494,13 +495,13 @@ int bch2_inode_v2_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); int ret = 0; @@ -518,7 +519,7 @@ int bch2_inode_v3_validate(struct bch_fs *c, struct bkey_s_c k, "invalid str hash type (%llu >= %u)", INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); - ret = __bch2_inode_validate(c, k, flags); + ret = __bch2_inode_validate(c, k, from); fsck_err: return ret; } @@ -617,7 +618,7 @@ bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter struct bkey_s_c k; int ret = 0; - for_each_btree_key_upto_norestart(trans, *iter, btree, + for_each_btree_key_max_norestart(trans, *iter, btree, bpos_successor(pos), SPOS(pos.inode, pos.offset, U32_MAX), flags|BTREE_ITER_all_snapshots, k, ret) @@ -652,7 +653,7 @@ int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos) struct bkey_s_c k; int ret = 0; - for_each_btree_key_upto_norestart(trans, iter, + for_each_btree_key_max_norestart(trans, iter, BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos), BTREE_ITER_all_snapshots| BTREE_ITER_with_updates, k, ret) @@ -779,7 +780,7 @@ int bch2_trigger_inode(struct btree_trans *trans, } int bch2_inode_generation_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -798,6 +799,28 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } +int 
bch2_inode_alloc_cursor_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(k.k->p.inode != LOGGED_OPS_INUM_inode_cursors, + c, inode_alloc_cursor_inode_bad, + "k.p.inode bad"); +fsck_err: + return ret; +} + +void bch2_inode_alloc_cursor_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_alloc_cursor i = bkey_s_c_to_inode_alloc_cursor(k); + + prt_printf(out, "idx %llu generation %llu", + le64_to_cpu(i.v->idx), + le64_to_cpu(i.v->gen)); +} + void bch2_inode_init_early(struct bch_fs *c, struct bch_inode_unpacked *inode_u) { @@ -858,43 +881,78 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -/* - * This just finds an empty slot: - */ -int bch2_inode_create(struct btree_trans *trans, - struct btree_iter *iter, - struct bch_inode_unpacked *inode_u, - u32 snapshot, u64 cpu) +static struct bkey_i_inode_alloc_cursor * +bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { struct bch_fs *c = trans->c; - struct bkey_s_c k; - u64 min, max, start, pos, *hint; - int ret = 0; - unsigned bits = (c->opts.inodes_32bit ? 31 : 63); - if (c->opts.shard_inode_numbers) { - bits -= c->inode_shard_bits; + u64 cursor_idx = c->opts.inodes_32bit ? 0 : cpu + 1; - min = (cpu << bits); - max = (cpu << bits) | ~(ULLONG_MAX << bits); + cursor_idx &= ~(~0ULL << c->opts.shard_inode_numbers_bits); - min = max_t(u64, min, BLOCKDEV_INODE_MAX); - hint = c->unused_inode_hints + cpu; + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_inode_cursors, cursor_idx), + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ERR_PTR(ret); + + struct bkey_i_inode_alloc_cursor *cursor = + k.k->type == KEY_TYPE_inode_alloc_cursor + ? bch2_bkey_make_mut_typed(trans, &iter, &k, 0, inode_alloc_cursor) + : bch2_bkey_alloc(trans, &iter, 0, inode_alloc_cursor); + ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + goto err; + + if (c->opts.inodes_32bit) { + *min = BLOCKDEV_INODE_MAX; + *max = INT_MAX; } else { - min = BLOCKDEV_INODE_MAX; - max = ~(ULLONG_MAX << bits); - hint = c->unused_inode_hints; + cursor->v.bits = c->opts.shard_inode_numbers_bits; + + unsigned bits = 63 - c->opts.shard_inode_numbers_bits; + + *min = max(cpu << bits, (u64) INT_MAX + 1); + *max = (cpu << bits) | ~(ULLONG_MAX << bits); } - start = READ_ONCE(*hint); + if (le64_to_cpu(cursor->v.idx) < *min) + cursor->v.idx = cpu_to_le64(*min); - if (start >= max || start < min) - start = min; + if (le64_to_cpu(cursor->v.idx) >= *max) { + cursor->v.idx = cpu_to_le64(*min); + le32_add_cpu(&cursor->v.gen, 1); + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret ? 
ERR_PTR(ret) : cursor; +} + +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) +{ + u64 min, max; + struct bkey_i_inode_alloc_cursor *cursor = + bch2_inode_alloc_cursor_get(trans, cpu, &min, &max); + int ret = PTR_ERR_OR_ZERO(cursor); + if (ret) + return ret; + + u64 start = le64_to_cpu(cursor->v.idx); + u64 pos = start; - pos = start; bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), BTREE_ITER_all_snapshots| BTREE_ITER_intent); + struct bkey_s_c k; again: while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k)) && @@ -924,6 +982,7 @@ int bch2_inode_create(struct btree_trans *trans, /* Retry from start */ pos = start = min; bch2_btree_iter_set_pos(iter, POS(0, pos)); + le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); @@ -934,9 +993,9 @@ int bch2_inode_create(struct btree_trans *trans, return ret; } - *hint = k.k->p.offset; inode_u->bi_inum = k.k->p.offset; - inode_u->bi_generation = bkey_generation(k); + inode_u->bi_generation = le64_to_cpu(cursor->v.gen); + cursor->v.idx = cpu_to_le64(k.k->p.offset + 1); return 0; } @@ -966,7 +1025,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, bch2_btree_iter_set_snapshot(&iter, snapshot); - k = bch2_btree_iter_peek_upto(&iter, end); + k = bch2_btree_iter_peek_max(&iter, end); ret = bkey_err(k); if (ret) goto err; @@ -998,8 +1057,6 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter = { NULL }; - struct bkey_i_inode_generation delete; - struct bch_inode_unpacked inode_u; struct bkey_s_c k; u32 snapshot; int ret; @@ -1039,13 +1096,7 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) goto err; } - bch2_inode_unpack(k, &inode_u); - - bkey_inode_generation_init(&delete.k_i); - delete.k.p = iter.pos; - delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); - - ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + ret = bch2_btree_delete_at(trans, &iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); err: @@ -1141,12 +1192,17 @@ struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, struct bch_inode_unpacked *inode) { -#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); +#define x(_name, _bits) \ + if ((inode)->bi_##_name) { \ + opts->_name = inode->bi_##_name - 1; \ + opts->_name##_from_inode = true; \ + } else { \ + opts->_name = c->opts._name; \ + } BCH_INODE_OPTS() #undef x - if (opts->nocow) - opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; + bch2_io_opts_fixups(opts); } int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts) @@ -1361,7 +1417,7 @@ int bch2_delete_dead_inodes(struct bch_fs *c) /* * if we ran check_inodes() unlinked inodes will have already been * cleaned up but the write buffer will be out of sync; therefore we - * alway need a write buffer flush + * always need a write buffer flush */ ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) @@ -1380,7 +1436,8 @@ int bch2_delete_dead_inodes(struct bch_fs *c) NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); if (ret > 0) { - bch_verbose(c, "deleting unlinked inode %llu:%u", 
k.k->p.offset, k.k->p.snapshot); + bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", + k.k->p.offset, k.k->p.snapshot); ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot); /* diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index eab82b5eb8977..d2e134528f0e6 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -7,15 +7,14 @@ #include "opts.h" #include "snapshot.h" -enum bch_validate_flags; extern const char * const bch2_inode_opts[]; int bch2_inode_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); int bch2_inode_v2_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos); @@ -60,7 +59,7 @@ static inline bool bkey_is_inode(const struct bkey *k) } int bch2_inode_generation_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ @@ -69,6 +68,16 @@ void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bk .min_val_size = 8, \ }) +int bch2_inode_alloc_cursor_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); +void bch2_inode_alloc_cursor_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_alloc_cursor ((struct bkey_ops) { \ + .key_validate = bch2_inode_alloc_cursor_validate, \ + .val_to_text = bch2_inode_alloc_cursor_to_text, \ + .min_val_size = 16, \ +}) + #if 0 typedef struct { u64 lo; @@ -220,6 +229,20 @@ static inline u32 bch2_inode_flags(struct bkey_s_c k) } } +static inline unsigned bkey_inode_mode(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode); + case KEY_TYPE_inode_v2: + return le16_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_mode); + case KEY_TYPE_inode_v3: + return INODEv3_MODE(bkey_s_c_to_inode_v3(k).v); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) @@ -249,7 +272,7 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -static inline bool bch2_inode_should_have_bp(struct bch_inode_unpacked *inode) +static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) { bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; @@ -262,6 +285,14 @@ void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +static inline struct bch_extent_rebalance +bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode) +{ + struct bch_io_opts io_opts; + bch2_inode_opts_get(&io_opts, c, inode); + return io_opts_to_rebalance_opts(&io_opts); +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 7928d0c6954fa..b99a5bf1a75ed 100644 --- a/fs/bcachefs/inode_format.h 
+++ b/fs/bcachefs/inode_format.h @@ -101,7 +101,9 @@ struct bch_inode_generation { x(bi_dir_offset, 64) \ x(bi_subvol, 32) \ x(bi_parent_subvol, 32) \ - x(bi_nocow, 8) + x(bi_nocow, 8) \ + x(bi_depth, 32) \ + x(bi_inodes_32bit, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -114,7 +116,8 @@ struct bch_inode_generation { x(foreground_target, 16) \ x(background_target, 16) \ x(erasure_code, 16) \ - x(nocow, 8) + x(nocow, 8) \ + x(inodes_32bit, 8) enum inode_opt_id { #define x(name, ...) \ @@ -164,4 +167,12 @@ LE64_BITMASK(INODEv3_FIELDS_START, struct bch_inode_v3, bi_flags, 31, 36); LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); +struct bch_inode_alloc_cursor { + struct bch_val v; + __u8 bits; + __u8 pad; + __le32 gen; + __le64 idx; +}; + #endif /* _BCACHEFS_INODE_FORMAT_H */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index f283051758d6d..e01e60219832e 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -113,11 +113,13 @@ int bch2_extent_fallocate(struct btree_trans *trans, err: if (!ret && sectors_allocated) bch2_increment_clock(c, sectors_allocated, WRITE); - if (should_print_err(ret)) - bch_err_inum_offset_ratelimited(c, - inum.inum, - iter->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + if (should_print_err(ret)) { + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); @@ -133,7 +135,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, } /* - * Returns -BCH_ERR_transacton_restart if we had to drop locks: + * Returns -BCH_ERR_transaction_restart if we had to drop locks: */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, subvol_inum inum, u64 end, @@ -164,9 +166,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_btree_iter_set_snapshot(iter, snapshot); /* - * peek_upto() doesn't have ideal semantics for extents: + * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_upto(iter, end_pos); + k = bch2_btree_iter_peek_max(iter, end_pos); if (!k.k) break; @@ -426,8 +428,8 @@ case LOGGED_OP_FINSERT_shift_extents: bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev(&iter) - : bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) + : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; @@ -461,7 +463,7 @@ case LOGGED_OP_FINSERT_shift_extents: op->v.pos = cpu_to_le64(insert ? 
bkey_start_offset(&delete.k) : delete.k.p.offset); - ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, copy) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?: bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?: bch2_logged_op_update(trans, &op->k_i) ?: diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index b3b934a87c6de..34a3569d085a3 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -21,6 +21,7 @@ #include "io_read.h" #include "io_misc.h" #include "io_write.h" +#include "reflink.h" #include "subvolume.h" #include "trace.h" @@ -231,11 +232,11 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, update_opts.target = opts.foreground_target; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned i = 0; + unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= BIT(i); - i++; + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; } } @@ -321,6 +322,20 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, /* Read */ +static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + return bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { rbio->subvol, read_pos.inode }, + read_pos.offset << 9); +} + +static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, + struct bch_read_bio *rbio, struct bpos read_pos) +{ + bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); +} + #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -499,6 +514,29 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } +static void bch2_read_io_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bio *bio = &rbio->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); + + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_read); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); +} + static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { @@ -562,6 +600,73 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) __bch2_rbio_narrow_crcs(trans, rbio)); } +static void bch2_read_csum_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bio *src = &rbio->bio; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "data "); + bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); + + struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) { + bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); + bch_err_ratelimited(ca, "%s", buf.buf); + } else { + bch_err_ratelimited(c, "%s", buf.buf); + } + + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decompress_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decompression error"); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); +} + +static void bch2_read_decrypt_err(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct printbuf buf = PRINTBUF; + + bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); + prt_str(&buf, "decrypt error"); + + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + if (ca) + bch_err_ratelimited(ca, "%s", buf.buf); + else + bch_err_ratelimited(c, "%s", buf.buf); + + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + printbuf_exit(&buf); +} + /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { @@ -668,33 +773,13 @@ static void __bch2_read_endio(struct work_struct *work) goto out; } - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "data %s", buf.buf); - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - } - printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decompression error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decrypt_err: - bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, - rbio->read_pos.offset << 9, - "decrypt error"); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; } @@ -715,16 +800,8 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bio->bi_status) { - if (ca) { - bch_err_inum_offset_ratelimited(ca, - rbio->read_pos.inode, - rbio->read_pos.offset, - "data read error: %s", - bch2_blk_status_to_str(bio->bi_status)); - bch2_io_error(ca, BCH_MEMBER_ERROR_read); - } - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + if (unlikely(bio->bi_status)) { + bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); return; } @@ -750,45 +827,6 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } -int __bch2_read_indirect_extent(struct btree_trans *trans, - unsigned *offset_into_extent, - struct bkey_buf *orig_k) -{ - struct btree_iter iter; - struct bkey_s_c k; - u64 reflink_offset; - int ret; - - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + - *offset_into_extent; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, - POS(0, reflink_offset), 0); - ret = bkey_err(k); - if (ret) - goto err; - - if (k.k->type != KEY_TYPE_reflink_v && - k.k->type != KEY_TYPE_indirect_inline_data) { - bch_err_inum_offset_ratelimited(trans->c, - orig_k->k->k.p.inode, - orig_k->k->k.p.offset << 9, - "%llu len %u points to nonexistent indirect extent %llu", - orig_k->k->k.p.offset, - orig_k->k->k.size, - reflink_offset); - bch2_inconsistent_error(trans->c); - ret = -BCH_ERR_missing_indirect_extent; - goto err; - } - - *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - bch2_bkey_buf_reassemble(orig_k, trans->c, k); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, @@ -868,15 +906,24 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if (!pick_ret) goto hole; - if (pick_ret < 0) { + if (unlikely(pick_ret < 0)) { struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); bch2_bkey_val_to_text(&buf, c, k); - bch_err_inum_offset_ratelimited(c, - read_pos.inode, read_pos.offset << 9, - "no device to read from: %s\n %s", - bch2_err_str(pick_ret), - buf.buf); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + goto err; + } + + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, orig, read_pos); + 
prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); goto err; } @@ -1062,11 +1109,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, } if (!rbio->pick.idx) { - if (!rbio->have_ioref) { - bch_err_inum_offset_ratelimited(c, - read_pos.inode, - read_pos.offset << 9, - "no device to read from"); + if (unlikely(!rbio->have_ioref)) { + struct printbuf buf = PRINTBUF; + bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); + prt_printf(&buf, "no device to read from:\n "); + bch2_bkey_val_to_text(&buf, c, k); + + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } @@ -1164,7 +1215,6 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - unsigned bytes, sectors, offset_into_extent; enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1184,9 +1234,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) goto err; - offset_into_extent = iter.pos.offset - + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; + unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&sk, c, k); @@ -1201,9 +1251,9 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: */ - sectors = min(sectors, k.k->size - offset_into_extent); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) @@ -1229,16 +1279,20 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, } bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); if (ret) { - bch_err_inum_offset_ratelimited(c, inum.inum, - bvec_iter.bi_sector << 9, - "read error %i from btree lookup", ret); + struct printbuf buf = PRINTBUF; + bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); + prt_printf(&buf, "read error %i from btree lookup", ret); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; bch2_rbio_done(rbio); } + + bch2_trans_put(trans); + bch2_bkey_buf_exit(&sk, c); } void bch2_fs_io_read_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index d9c18bb7d4035..a82e8a94ccb61 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,7 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "reflink.h" struct bch_read_bio { struct bch_fs *c; @@ -79,19 +80,32 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; -int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, - struct bkey_buf *); - static inline int bch2_read_indirect_extent(struct btree_trans *trans, enum btree_id *data_btree, - unsigned *offset_into_extent, - struct bkey_buf *k) + s64 *offset_into_extent, + struct bkey_buf *extent) { - if (k->k->k.type != KEY_TYPE_reflink_p) + if (extent->k->k.type != KEY_TYPE_reflink_p) return 0; *data_btree = BTREE_ID_reflink; - return __bch2_read_indirect_extent(trans, offset_into_extent, k); + struct btree_iter iter; + struct bkey_s_c k = 
bch2_lookup_indirect_extent(trans, &iter, + offset_into_extent, + bkey_i_to_s_c_reflink_p(extent->k), + true, 0); + int ret = bkey_err(k); + if (ret) + return ret; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + return -BCH_ERR_missing_indirect_extent; + } + + bch2_bkey_buf_reassemble(extent, trans->c, k); + bch2_trans_iter_exit(trans, &iter); + return 0; } enum bch_read_flags { diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 96720adcfee03..9067f2a04d283 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -164,7 +164,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_upto_continue_norestart(iter, + for_each_btree_key_max_continue_norestart(iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), @@ -216,6 +216,7 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, SPOS(0, extent_iter->pos.inode, extent_iter->snapshot), + BTREE_ITER_intent| BTREE_ITER_cached); int ret = bkey_err(k); if (unlikely(ret)) @@ -369,7 +370,7 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &op->opts, sk.k) ?: bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, @@ -395,6 +396,21 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ +static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, + u64 offset) +{ + bch2_inum_offset_err_msg(op->c, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + +static void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) +{ + __bch2_write_op_error(out, op, op->pos.offset); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -531,14 +547,14 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); - if (ret && !bch2_err_matches(ret, EROFS)) { + if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "%s write error while doing btree update: %s", - op->flags & BCH_WRITE_MOVE ? 
"move" : "user", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) @@ -621,9 +637,7 @@ void bch2_write_point_do_index_updates(struct work_struct *work) while (1) { spin_lock_irq(&wp->writes_lock); - op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); - if (op) - list_del(&op->wp_list); + op = list_pop_entry(&wp->writes, struct bch_write_op, wp_list); wp_update_state(wp, op != NULL); spin_unlock_irq(&wp->writes_lock); @@ -1080,11 +1094,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, *_dst = dst; return more; csum_err: - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s write error: error verifying existing checksum while rewriting existing data (memory corruption?)", - op->flags & BCH_WRITE_MOVE ? "move" : "user"); + { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1165,7 +1182,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct btree_trans *trans = bch2_trans_get(c); for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents, + int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ @@ -1175,11 +1192,11 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) if (ret && !bch2_err_matches(ret, EROFS)) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - bch_err_inum_offset_ratelimited(c, - insert->k.p.inode, insert->k.p.offset << 9, - "%s write error while doing btree update: %s", - op->flags & BCH_WRITE_MOVE ? "move" : "user", - bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); } if (ret) { @@ -1339,17 +1356,19 @@ static void bch2_nocow_write(struct bch_write_op *op) if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + bch2_trans_put(trans); + darray_exit(&buckets); + if (ret) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, op->pos.offset << 9, - "%s: btree lookup error %s", __func__, bch2_err_str(ret)); + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); op->error = ret; op->flags |= BCH_WRITE_SUBMITTED; } - bch2_trans_put(trans); - darray_exit(&buckets); - /* fallback to cow write path? */ if (!(op->flags & BCH_WRITE_SUBMITTED)) { closure_sync(&op->cl); @@ -1462,14 +1481,14 @@ static void __bch2_write(struct bch_write_op *op) if (ret <= 0) { op->flags |= BCH_WRITE_SUBMITTED; - if (ret < 0) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s(): %s error: %s", __func__, - op->flags & BCH_WRITE_MOVE ? 
"move" : "user", - bch2_err_str(ret)); + if (unlikely(ret < 0)) { + if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } op->error = ret; break; } @@ -1491,7 +1510,7 @@ static void __bch2_write(struct bch_write_op *op) /* * Sync or no? * - * If we're running asynchronously, wne may still want to block + * If we're running asynchronously, we may still want to block * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. */ @@ -1595,12 +1614,11 @@ CLOSURE_CALLBACK(bch2_write) bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { - bch_err_inum_offset_ratelimited(c, - op->pos.inode, - op->pos.offset << 9, - "%s write error: misaligned write", - op->flags & BCH_WRITE_MOVE ? "move" : "user"); + if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { + struct printbuf buf = PRINTBUF; + bch2_write_op_error(&buf, op); + prt_printf(&buf, "misaligned write"); + printbuf_exit(&buf); op->error = -EIO; goto err; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 2dc0d60c17450..326c9b317d26c 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -167,7 +167,7 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) j->err_seq = journal_cur_seq(j); spin_unlock(&j->lock); - bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", + bch_err(c, "Journal stuck! Have a pre-reservation but journal full (error %s)", bch2_journal_errors[error]); bch2_journal_debug_to_text(&buf, j); bch_err(c, "%s", buf.buf); @@ -217,6 +217,12 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq) if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); bch2_journal_do_writes(j); + + /* + * for __bch2_next_write_buffer_flush_journal_buf(), when quiescing an + * open journal entry + */ + wake_up(&j->wait); } /* @@ -251,6 +257,9 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t if (!__journal_entry_is_open(old)) return; + if (old.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) + old.cur_entry_offset = j->cur_entry_offset_if_blocked; + /* Close out old buffer: */ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); @@ -373,6 +382,10 @@ static int journal_entry_open(struct journal *j) if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) return JOURNAL_ERR_max_in_flight; + if (bch2_fs_fatal_err_on(journal_cur_seq(j) >= JOURNAL_SEQ_MAX, + c, "cannot start: journal seq overflow")) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + BUG_ON(!j->cur_entry_sectors); buf->expires = @@ -664,7 +677,7 @@ void bch2_journal_entry_res_resize(struct journal *j, * @seq: seq to flush * @parent: closure object to wait with * Returns: 1 if @seq has already been flushed, 0 if @seq is being flushed, - * -EIO if @seq will never be flushed + * -BCH_ERR_journal_flush_err if @seq will never be flushed * * Like bch2_journal_wait_on_seq, except that it triggers a write immediately if * necessary @@ -687,7 +700,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -EIO; + ret = -BCH_ERR_journal_flush_err; goto out; } @@ -794,10 +807,11 @@ int bch2_journal_flush(struct journal *j) } /* - * 
bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * bch2_journal_noflush_seq - ask the journal not to issue any flushes in the + * range [start, end) - * @seq */ -bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +bool bch2_journal_noflush_seq(struct journal *j, u64 start, u64 end) { struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 unwritten_seq; @@ -806,19 +820,19 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) return false; - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) return false; spin_lock(&j->lock); - if (seq <= c->journal.flushed_seq_ondisk) + if (c->journal.flushed_seq_ondisk >= start) goto out; for (unwritten_seq = journal_last_unwritten_seq(j); - unwritten_seq < seq; + unwritten_seq < end; unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - /* journal flush already in flight, or flush requseted */ + /* journal flush already in flight, or flush requested */ if (buf->must_flush) goto out; @@ -831,19 +845,14 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) return ret; } -int bch2_journal_meta(struct journal *j) +static int __bch2_journal_meta(struct journal *j) { - struct journal_buf *buf; - struct journal_res res; - int ret; - - memset(&res, 0, sizeof(res)); - - ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + struct journal_res res = {}; + int ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); if (ret) return ret; - buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + struct journal_buf *buf = j->buf + (res.seq & JOURNAL_BUF_MASK); buf->must_flush = true; if (!buf->flush_time) { @@ -856,27 +865,70 @@ int bch2_journal_meta(struct journal *j) return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE); } +int bch2_journal_meta(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) + return -EROFS; + + int ret = __bch2_journal_meta(j); + bch2_write_ref_put(c, BCH_WRITE_REF_journal); + return ret; +} + /* block/unlock the journal: */ void bch2_journal_unblock(struct journal *j) { spin_lock(&j->lock); - j->blocked--; + if (!--j->blocked && + j->cur_entry_offset_if_blocked < JOURNAL_ENTRY_CLOSED_VAL && + j->reservations.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + new.v = old.v; + new.cur_entry_offset = j->cur_entry_offset_if_blocked; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + } spin_unlock(&j->lock); journal_wake(j); } +static void __bch2_journal_block(struct journal *j) +{ + if (!j->blocked++) { + union journal_res_state old, new; + + old.v = atomic64_read(&j->reservations.counter); + do { + j->cur_entry_offset_if_blocked = old.cur_entry_offset; + + if (j->cur_entry_offset_if_blocked >= JOURNAL_ENTRY_CLOSED_VAL) + break; + + new.v = old.v; + new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; + } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); + + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); + } +} + void bch2_journal_block(struct journal *j) { spin_lock(&j->lock); - j->blocked++; + __bch2_journal_block(j); spin_unlock(&j->lock); journal_quiesce(j); } -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +static struct journal_buf 
*__bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret = NULL; @@ -893,13 +945,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou struct journal_buf *buf = j->buf + idx; if (buf->need_flush_to_write_buffer) { - if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); - union journal_res_state s; s.v = atomic64_read_acquire(&j->reservations.counter); - ret = journal_state_count(s, idx) + unsigned open = seq == journal_cur_seq(j) && __journal_entry_is_open(s); + + if (open && !*blocked) { + __bch2_journal_block(j); + *blocked = true; + } + + ret = journal_state_count(s, idx) > open ? ERR_PTR(-EAGAIN) : buf; break; @@ -912,11 +968,17 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou return ret; } -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, + u64 max_seq, bool *blocked) { struct journal_buf *ret; + *blocked = false; + + wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, + max_seq, blocked)) != ERR_PTR(-EAGAIN)); + if (IS_ERR_OR_NULL(ret) && *blocked) + bch2_journal_unblock(j); - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); return ret; } @@ -945,19 +1007,17 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } for (nr_got = 0; nr_got < nr_want; nr_got++) { - if (new_fs) { - bu[nr_got] = bch2_bucket_alloc_new_fs(ca); - if (bu[nr_got] < 0) { - ret = -BCH_ERR_ENOSPC_bucket_alloc; - break; - } - } else { - ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, - BCH_DATA_journal, cl); - ret = PTR_ERR_OR_ZERO(ob[nr_got]); - if (ret) - break; + enum bch_watermark watermark = new_fs + ? 
BCH_WATERMARK_btree + : BCH_WATERMARK_normal; + ob[nr_got] = bch2_bucket_alloc(c, ca, watermark, + BCH_DATA_journal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + if (!new_fs) { ret = bch2_trans_run(c, bch2_trans_mark_metadata_bucket(trans, ca, ob[nr_got]->bucket, BCH_DATA_journal, @@ -967,9 +1027,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch_err_msg(c, ret, "marking new journal buckets"); break; } - - bu[nr_got] = ob[nr_got]->bucket; } + + bu[nr_got] = ob[nr_got]->bucket; } if (!nr_got) @@ -1009,8 +1069,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (ret) goto err_unblock; - if (!new_fs) - bch2_write_super(c); + bch2_write_super(c); /* Commit: */ if (c) @@ -1044,9 +1103,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bu[i], BCH_DATA_free, 0, BTREE_TRIGGER_transactional)); err_free: - if (!new_fs) - for (i = 0; i < nr_got; i++) - bch2_open_bucket_put(c, ob[i]); + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); kfree(new_bucket_seq); kfree(new_buckets); @@ -1193,7 +1251,7 @@ void bch2_fs_journal_stop(struct journal *j) * Always write a new journal entry, to make sure the clock hands are up * to date (and match the superblock) */ - bch2_journal_meta(j); + __bch2_journal_meta(j); journal_quiesce(j); cancel_delayed_work_sync(&j->write_work); @@ -1217,6 +1275,11 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = cur_seq, nr, seq; + if (cur_seq >= JOURNAL_SEQ_MAX) { + bch_err(c, "cannot start: journal seq overflow"); + return -EINVAL; + } + genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; @@ -1474,6 +1537,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) case JOURNAL_ENTRY_CLOSED_VAL: prt_printf(out, "closed\n"); break; + case JOURNAL_ENTRY_BLOCKED_VAL: + prt_printf(out, "blocked\n"); + break; default: prt_printf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; @@ -1499,6 +1565,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { + if (!ca->mi.durability) + continue; + struct journal_device *ja = &ca->journal; if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) @@ -1508,6 +1577,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) continue; prt_printf(out, "dev %u:\n", ca->dev_idx); + prt_printf(out, "durability %u:\n", ca->mi.durability); printbuf_indent_add(out, 2); prt_printf(out, "nr\t%u\n", ja->nr); prt_printf(out, "bucket size\t%u\n", ca->mi.bucket_size); @@ -1519,6 +1589,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_indent_sub(out, 2); } + prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); + rcu_read_unlock(); --out->atomic; diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 2762be6f9814c..cb0df0663946b 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -285,7 +285,8 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq spin_lock(&j->lock); bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); - } + } else if (unlikely(s.cur_entry_offset == JOURNAL_ENTRY_BLOCKED_VAL)) + wake_up(&j->wait); } /* @@ -403,7 +404,7 @@ void bch2_journal_flush_async(struct journal *, struct closure *); int bch2_journal_flush_seq(struct journal *, 
u64, unsigned); int bch2_journal_flush(struct journal *); -bool bch2_journal_noflush_seq(struct journal *, u64); +bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); void bch2_journal_halt(struct journal *); @@ -411,7 +412,7 @@ void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL - ? -EIO : 0; + ? -BCH_ERR_journal_shutdown : 0; } struct bch_dev; @@ -424,7 +425,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); +struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index fb35dd3363318..5757d8358e93f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,8 @@ #include "sb-clean.h" #include "trace.h" +#include <linux/string_choices.h> + void bch2_journal_pos_from_member_info_set(struct bch_fs *c) { lockdep_assert_held(&c->sb_lock); @@ -157,7 +159,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, /* * genradixes are indexed by a ulong, not a u64, so we can't index them * by sequence number directly: Assume instead that they will all fall - * within the range of +-2billion of the filrst one we find. + * within the range of +-2 billion of the first one we find. */ if (!c->journal_entries_base_seq) c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); @@ -299,7 +301,7 @@ static void journal_entry_err_msg(struct printbuf *out, journal_entry_err_msg(&_buf, version, jset, entry); \ prt_printf(&_buf, msg, ##__VA_ARGS__); \ \ - switch (flags & BCH_VALIDATE_write) { \ + switch (from.flags & BCH_VALIDATE_write) { \ case READ: \ mustfix_fsck_err(c, _err, "%s", _buf.buf); \ break; \ @@ -325,11 +327,11 @@ static void journal_entry_err_msg(struct printbuf *out, static int journal_validate_key(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, - unsigned level, enum btree_id btree_id, struct bkey_i *k, - unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from, + unsigned version, int big_endian) { + enum bch_validate_flags flags = from.flags; int write = flags & BCH_VALIDATE_write; void *next = vstruct_next(entry); int ret = 0; @@ -364,11 +366,10 @@ static int journal_validate_key(struct bch_fs *c, } if (!write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); - ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), - __btree_node_type(level, btree_id), write); + ret = bch2_bkey_validate(c, bkey_i_to_s_c(k), from); if (ret == -BCH_ERR_fsck_delete_bkey) { le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -379,7 +380,7 @@ static int journal_validate_key(struct bch_fs *c, goto fsck_err; if (write) - bch2_bkey_compat(level, btree_id, version, big_endian, + bch2_bkey_compat(from.level, from.btree, version, big_endian, write, NULL, bkey_to_packed(k)); fsck_err: return ret; @@ -389,16 +390,15 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, 
struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; + from.level = entry->level; + from.btree = entry->btree_id; + while (k != vstruct_last(entry)) { - int ret = journal_validate_key(c, jset, entry, - entry->level, - entry->btree_id, - k, version, big_endian, - flags|BCH_VALIDATE_journal); + int ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) continue; else if (ret) @@ -421,7 +421,8 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bch2_prt_jset_entry_type(out, entry->type); prt_str(out, ": "); } - prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level); + bch2_btree_id_level_to_text(out, entry->btree_id, entry->level); + prt_char(out, ' '); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); first = false; } @@ -431,11 +432,15 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_i *k = entry->start; int ret = 0; + from.root = true; + from.level = entry->level + 1; + from.btree = entry->btree_id; + if (journal_entry_err_on(!entry->u64s || le16_to_cpu(entry->u64s) != k->k.u64s, c, version, jset, entry, @@ -452,8 +457,7 @@ static int journal_entry_btree_root_validate(struct bch_fs *c, return 0; } - ret = journal_validate_key(c, jset, entry, 1, entry->btree_id, k, - version, big_endian, flags); + ret = journal_validate_key(c, jset, entry, k, from, version, big_endian); if (ret == FSCK_DELETED_KEY) ret = 0; fsck_err: @@ -470,7 +474,7 @@ static int journal_entry_prio_ptrs_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { /* obsolete, don't care: */ return 0; @@ -485,7 +489,7 @@ static int journal_entry_blacklist_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -512,7 +516,7 @@ static int journal_entry_blacklist_v2_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_blacklist_v2 *bl_entry; int ret = 0; @@ -554,7 +558,7 @@ static int journal_entry_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_usage *u = container_of(entry, struct jset_entry_usage, entry); @@ -588,7 +592,7 @@ static int journal_entry_data_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_data_usage *u = container_of(entry, struct jset_entry_data_usage, entry); @@ -632,7 +636,7 @@ static int journal_entry_clock_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); @@ -665,14 +669,14 @@ static void 
journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry_clock *clock = container_of(entry, struct jset_entry_clock, entry); - prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time)); + prt_printf(out, "%s=%llu", str_write_read(clock->rw), le64_to_cpu(clock->time)); } static int journal_entry_dev_usage_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct jset_entry_dev_usage *u = container_of(entry, struct jset_entry_dev_usage, entry); @@ -729,7 +733,7 @@ static int journal_entry_log_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return 0; } @@ -738,19 +742,19 @@ static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); - prt_printf(out, "%.*s", bytes, l->d); + prt_printf(out, "%.*s", jset_entry_log_msg_bytes(l), l->d); } static int journal_entry_overwrite_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { + from.flags = 0; return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, @@ -763,10 +767,10 @@ static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); + version, big_endian, from); } static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, @@ -779,7 +783,7 @@ static int journal_entry_datetime_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { unsigned bytes = vstruct_bytes(entry); unsigned expected = 16; @@ -809,7 +813,7 @@ static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs * struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); }; @@ -827,11 +831,11 @@ int bch2_journal_entry_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, unsigned version, int big_endian, - enum bch_validate_flags flags) + struct bkey_validate_context from) { return entry->type < BCH_JSET_ENTRY_NR ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, - version, big_endian, flags) + version, big_endian, from) : 0; } @@ -849,10 +853,18 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, static int jset_validate_entries(struct bch_fs *c, struct jset *jset, enum bch_validate_flags flags) { + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; + unsigned version = le32_to_cpu(jset->version); int ret = 0; vstruct_for_each(jset, entry) { + from.journal_offset = (u64 *) entry - jset->_data; + if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), c, version, jset, entry, journal_entry_past_jset_end, @@ -861,8 +873,8 @@ static int jset_validate_entries(struct bch_fs *c, struct jset *jset, break; } - ret = bch2_journal_entry_validate(c, jset, entry, - version, JSET_BIG_ENDIAN(jset), flags); + ret = bch2_journal_entry_validate(c, jset, entry, version, + JSET_BIG_ENDIAN(jset), from); if (ret) break; } @@ -875,13 +887,17 @@ static int jset_validate(struct bch_fs *c, struct jset *jset, u64 sector, enum bch_validate_flags flags) { - unsigned version; + struct bkey_validate_context from = { + .flags = flags, + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -926,15 +942,16 @@ static int jset_validate_early(struct bch_fs *c, unsigned bucket_sectors_left, unsigned sectors_read) { - size_t bytes = vstruct_bytes(jset); - unsigned version; - enum bch_validate_flags flags = BCH_VALIDATE_journal; + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(jset->seq), + }; int ret = 0; if (le64_to_cpu(jset->magic) != jset_magic(c)) return JOURNAL_ENTRY_NONE; - version = le32_to_cpu(jset->version); + unsigned version = le32_to_cpu(jset->version); if (journal_entry_err_on(!bch2_version_compatible(version), c, version, jset, NULL, jset_unsupported_version, @@ -947,6 +964,7 @@ static int jset_validate_early(struct bch_fs *c, return -EINVAL; } + size_t bytes = vstruct_bytes(jset); if (bytes > (sectors_read << 9) && sectors_read < bucket_sectors_left) return JOURNAL_ENTRY_REREAD; @@ -1096,8 +1114,10 @@ static int journal_read_bucket(struct bch_dev *ca, (printbuf_reset(&err), prt_str(&err, "journal "), bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf))) + err.buf))) { saw_bad = true; + bch2_fatal_error(c); + } ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, @@ -1231,8 +1251,6 @@ int bch2_journal_read(struct bch_fs *c, * those entries will be blacklisted: */ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { - enum bch_validate_flags flags = BCH_VALIDATE_journal; - i = *_i; if (journal_replay_ignore(i)) @@ -1252,6 +1270,10 @@ int bch2_journal_read(struct bch_fs *c, continue; } + struct bkey_validate_context from = { + .from = BKEY_VALIDATE_journal, + .journal_seq = le64_to_cpu(i->j.seq), + }; if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), c, le32_to_cpu(i->j.version), &i->j, NULL, jset_last_seq_newer_than_seq, @@ -1411,27 +1433,50 @@ int bch2_journal_read(struct bch_fs *c, /* journal write: */ +static void journal_advance_devs_to_next_bucket(struct journal 
*j, + struct dev_alloc_list *devs, + unsigned sectors, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); + if (!ca) + continue; + + struct journal_device *ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); + } + } +} + static void __journal_write_alloc(struct journal *j, struct journal_buf *w, - struct dev_alloc_list *devs_sorted, + struct dev_alloc_list *devs, unsigned sectors, unsigned *replicas, unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_device *ja; - struct bch_dev *ca; - unsigned i; - if (*replicas >= replicas_want) - return; - - for (i = 0; i < devs_sorted->nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + darray_for_each(*devs, i) { + struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) continue; - ja = &ca->journal; + struct journal_device *ja = &ca->journal; /* * Check that we can use this device, and aren't already using @@ -1477,65 +1522,53 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; - struct journal_device *ja; - struct bch_dev *ca; struct dev_alloc_list devs_sorted; unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned i, replicas = 0, replicas_want = + unsigned replicas = 0, replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); + bool advance_done = false; rcu_read_lock(); -retry: - devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + /* We might run more than once if we have to stop and do discards: */ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); + bkey_for_each_ptr(ptrs, p) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); + if (ca) + replicas += ca->mi.durability; + } - __journal_write_alloc(j, w, &devs_sorted, - sectors, &replicas, replicas_want); +retry_target: + devs = target_rw_devs(c, BCH_DATA_journal, target); + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); +retry_alloc: + __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); - if (replicas >= replicas_want) + if (likely(replicas >= replicas_want)) goto done; - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - if (!ca) - continue; - - ja = &ca->journal; - - if (sectors > ja->sectors_free && - sectors <= ca->mi.bucket_size && - bch2_journal_dev_buckets_available(j, ja, - journal_space_discarded)) { - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->sectors_free = ca->mi.bucket_size; - - /* - * ja->bucket_seq[ja->cur_idx] must always have - * something sensible: - */ - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); - } + if (!advance_done) { + journal_advance_devs_to_next_bucket(j, &devs_sorted, sectors, w->data->seq); + advance_done = true; + goto retry_alloc; } - __journal_write_alloc(j, w, &devs_sorted, - 
sectors, &replicas, replicas_want); - if (replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; - goto retry; + advance_done = false; + goto retry_target; } done: rcu_read_unlock(); BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -EROFS; + return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -2023,19 +2056,21 @@ CLOSURE_CALLBACK(bch2_journal_write) bch2_journal_do_discards(j); } - if (ret) { + if (ret && !bch2_journal_error(j)) { struct printbuf buf = PRINTBUF; buf.atomic++; - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu: %s"), + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), bch2_err_str(ret)); __bch2_journal_debug_to_text(&buf, j); spin_unlock(&j->lock); bch2_print_string_as_lines(KERN_ERR, buf.buf); printbuf_exit(&buf); - goto err; } + if (ret) + goto err; /* * write is allocated, no longer need to account for it in diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 2ca9cde30ea8d..12b39fcb4424b 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -63,7 +63,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, int bch2_journal_entry_validate(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, - enum bch_validate_flags); + struct bkey_validate_context); void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, struct jset_entry *); diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index ace291f175dd6..3c8242606da7e 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -38,6 +38,9 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, struct journal_device *ja, enum journal_space_from from) { + if (!ja->nr) + return 0; + unsigned available = (journal_space_from(ja, from) - ja->cur_idx - 1 + ja->nr) % ja->nr; @@ -137,14 +140,18 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne struct bch_fs *c = container_of(j, struct bch_fs, journal); unsigned pos, nr_devs = 0; struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + unsigned min_bucket_size = U32_MAX; BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { - if (!ca->journal.nr) + if (!ca->journal.nr || + !ca->mi.durability) continue; + min_bucket_size = min(min_bucket_size, ca->mi.bucket_size); + space = journal_dev_space_available(j, ca, from); if (!space.next_entry) continue; @@ -164,7 +171,9 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne * We sorted largest to smallest, and we want the smallest out of the * @nr_devs_want largest devices: */ - return dev_space[nr_devs_want - 1]; + space = dev_space[nr_devs_want - 1]; + space.next_entry = min(space.next_entry, min_bucket_size); + return space; } void bch2_journal_space_available(struct journal *j) @@ -758,10 +767,12 @@ static int bch2_journal_reclaim_thread(void *arg) journal_empty = fifo_empty(&j->pin); spin_unlock(&j->lock); + long timeout = j->next_reclaim - jiffies; + if (journal_empty) schedule(); - else if (time_after(j->next_reclaim, jiffies)) - schedule_timeout(j->next_reclaim - jiffies); + else if (timeout > 0) + schedule_timeout(timeout); else break; } 
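
The journal_reclaim.c hunk just above is a subtle race fix worth spelling out: the old code tested time_after(j->next_reclaim, jiffies) and then recomputed j->next_reclaim - jiffies for schedule_timeout(); if jiffies advanced past the deadline between those two reads, the unsigned difference wrapped to an enormous sleep. Snapshotting the delay once into a signed long and testing its sign closes the window. A standalone sketch of the arithmetic (plain userspace C with made-up values, not kernel code):

#include <stdio.h>

int main(void)
{
	/* pretend the deadline has already passed by 5 ticks: */
	unsigned long jiffies      = 1005;
	unsigned long next_reclaim = 1000;

	/* old shape: an unsigned difference computed after a stale check
	 * wraps to a near-ULONG_MAX sleep duration: */
	unsigned long wrapped = next_reclaim - jiffies;

	/* new shape: snapshot once, signed, then test the sign
	 * (two's complement makes this -5): */
	long timeout = (long)(next_reclaim - jiffies);

	printf("unsigned delta: %lu\n", wrapped);
	printf("signed snapshot: %ld -> %s\n",
	       timeout, timeout > 0 ? "sleep" : "reclaim now");
	return 0;
}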
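
Similarly, the rewritten journal_write_alloc() above turns an unrolled try/advance/try sequence into an explicit retry ladder: allocate from the sorted device list; if replication is still short, advance every candidate device to its next bucket once (advance_done) and retry; if still short and a metadata target was restricting the candidates, clear the target and start over. A toy model of just that control flow (the replica counts per stage are fabricated for illustration):

#include <stdbool.h>
#include <stdio.h>

/* pretend each retry stage can contribute one more replica: */
static unsigned try_alloc(bool advanced, bool all_devs)
{
	return 1 + advanced + all_devs;
}

int main(void)
{
	unsigned replicas_want = 3, replicas = 0;
	bool advance_done = false, all_devs = false;

retry_alloc:
	replicas = try_alloc(advance_done, all_devs);
	if (replicas >= replicas_want)
		goto done;

	if (!advance_done) {
		/* advance each device to its next discarded bucket: */
		advance_done = true;
		goto retry_alloc;
	}

	if (!all_devs) {
		/* retry from all devices, ignoring the metadata target: */
		all_devs = true;
		advance_done = false;
		goto retry_alloc;
	}
done:
	printf("allocated %u/%u replicas\n", replicas, replicas_want);
	return 0;
}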
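
Stepping back to the journal_io.c changes: the signature change recurring throughout this patch swaps a bare enum bch_validate_flags for a struct bkey_validate_context, so a validator no longer knows only read-vs-write, it knows the key's provenance (btree node vs. journal, btree id, level, journal sequence and offset) and can include that in the error it reports. A standalone sketch of the pattern, with illustrative names loosely modeled on the patch rather than taken from it:

#include <stdio.h>

enum validate_from { VALIDATE_btree_node, VALIDATE_journal };

struct validate_ctx {
	unsigned		flags;		/* read vs. write, etc. */
	enum validate_from	from;
	unsigned		btree;
	unsigned		level;
	unsigned long long	journal_seq;
	unsigned long long	journal_offset;
};

/* callees take the context by value and may specialize it: */
static int validate_key(int key_ok, struct validate_ctx ctx)
{
	if (key_ok)
		return 0;
	printf("invalid bkey in %s: btree %u level %u seq %llu offset %llu\n",
	       ctx.from == VALIDATE_journal ? "journal" : "btree node",
	       ctx.btree, ctx.level, ctx.journal_seq, ctx.journal_offset);
	return -1;
}

int main(void)
{
	struct validate_ctx ctx = {
		.from        = VALIDATE_journal,
		.journal_seq = 8192,
	};

	/* as in journal_entry_btree_keys_validate(), the caller fills in
	 * the fields it learns along the way: */
	ctx.btree = 3;
	ctx.level = 0;

	return validate_key(0, ctx) ? 1 : 0;
}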
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 19183fcf7ad7f..0c16fde8dfad2 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -9,6 +9,9 @@ #include "super_types.h" #include "fifo.h" +/* btree write buffer steals 8 bits for its own purposes: */ +#define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) + #define JOURNAL_BUF_BITS 2 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) @@ -107,11 +110,12 @@ union journal_res_state { #define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ /* - * We stash some journal state as sentinal values in cur_entry_offset: + * We stash some journal state as sentinel values in cur_entry_offset: * note - cur_entry_offset is in units of u64s */ #define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) #define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) @@ -193,6 +197,7 @@ struct journal { * insufficient devices: */ enum journal_errors cur_entry_error; + unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; /* @@ -202,7 +207,7 @@ struct journal { darray_u64 early_journal_entries; /* - * Protects journal_buf->data, when accessing without a jorunal + * Protects journal_buf->data, when accessing without a journal * reservation: for synchronization between the btree write buffer code * and the journal write path: */ diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c index 60e00702d1a46..75f27ec26f85f 100644 --- a/fs/bcachefs/logged_ops.c +++ b/fs/bcachefs/logged_ops.c @@ -63,8 +63,10 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter, int bch2_resume_logged_ops(struct bch_fs *c) { int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, - BTREE_ID_logged_ops, POS_MIN, + for_each_btree_key_max(trans, iter, + BTREE_ID_logged_ops, + POS(LOGGED_OPS_INUM_logged_ops, 0), + POS(LOGGED_OPS_INUM_logged_ops, U64_MAX), BTREE_ITER_prefetch, k, resume_logged_op(trans, &iter, k))); bch_err_fn(c, ret); @@ -74,9 +76,8 @@ int bch2_resume_logged_ops(struct bch_fs *c) static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k) { struct btree_iter iter; - int ret; - - ret = bch2_bkey_get_empty_slot(trans, &iter, BTREE_ID_logged_ops, POS_MAX); + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_logged_ops, POS(LOGGED_OPS_INUM_logged_ops, U64_MAX)); if (ret) return ret; diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h index 6a4bf7129dba2..cfb67c95d4c8a 100644 --- a/fs/bcachefs/logged_ops_format.h +++ b/fs/bcachefs/logged_ops_format.h @@ -2,6 +2,11 @@ #ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H #define _BCACHEFS_LOGGED_OPS_FORMAT_H +enum logged_ops_inums { + LOGGED_OPS_INUM_logged_ops, + LOGGED_OPS_INUM_inode_cursors, +}; + struct bch_logged_op_truncate { struct bch_val v; __le32 subvol; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 10857eccdeafe..ce794d55818fb 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -12,7 +12,7 @@ /* KEY_TYPE_lru is obsolete: */ int bch2_lru_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -192,7 +192,7 @@ int bch2_check_lrus(struct bch_fs *c) int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_lru, POS_MIN, BTREE_ITER_prefetch, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw, + NULL, NULL, 
BCH_TRANS_COMMIT_no_enospc, bch2_check_lru_key(trans, &iter, k, &last_flushed))); bch2_bkey_buf_exit(&last_flushed, c); diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index e6a7d8241bb80..f31a6cf1514cd 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -33,7 +33,7 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) return BCH_LRU_read; } -int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); void bch2_lru_pos_to_text(struct printbuf *, struct bpos); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 47e4a3c3d26e7..281d6f9d1a74f 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -152,7 +152,7 @@ struct mean_and_variance { u128_u sum_squares; }; -/* expontentially weighted variant */ +/* exponentially weighted variant */ struct mean_and_variance_weighted { s64 mean; u64 variance; @@ -200,4 +200,4 @@ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, u8 weight); -#endif // MEAN_AND_VAIRANCE_H_ +#endif // MEAN_AND_VARIANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index e9d9c0212e44b..86f38db112d8c 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -25,7 +25,7 @@ static void mean_and_variance_basic_test(struct kunit *test) } /* - * Test values computed using a spreadsheet from the psuedocode at the bottom: + * Test values computed using a spreadsheet from the pseudocode at the bottom: * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 0ef4a86850bbc..c493ea625553c 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -21,6 +21,8 @@ #include "journal_reclaim.h" #include "keylist.h" #include "move.h" +#include "rebalance.h" +#include "reflink.h" #include "replicas.h" #include "snapshot.h" #include "super-io.h" @@ -196,6 +198,13 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt) list_del(&ctxt->list); mutex_unlock(&c->moving_context_lock); + /* + * Generally, releasing a transaction within a transaction restart means + * an unhandled transaction restart: but this can happen legitimately + * within the move code, e.g. 
when bch2_move_ratelimit() tells us to + * exit before we've retried + */ + bch2_trans_begin(ctxt->trans); bch2_trans_put(ctxt->trans); memset(ctxt, 0, sizeof(*ctxt)); } @@ -379,34 +388,42 @@ int bch2_move_extent(struct moving_context *ctxt, return ret; } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, + struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { struct bch_fs *c = trans->c; u32 restart_count = trans->restart_count; + struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; - if (io_opts->cur_inum != extent_k.k->p.inode) { + if (extent_k.k->type == KEY_TYPE_reflink_v) + goto out; + + if (io_opts->cur_inum != extent_pos.inode) { io_opts->d.nr = 0; - ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode), + ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_pos.inode), BTREE_ITER_all_snapshots, k, ({ - if (k.k->p.offset != extent_k.k->p.inode) + if (k.k->p.offset != extent_pos.inode) break; if (!bkey_is_inode(k.k)) continue; struct bch_inode_unpacked inode; - BUG_ON(bch2_inode_unpack(k, &inode)); + _ret3 = bch2_inode_unpack(k, &inode); + if (_ret3) + break; struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot }; bch2_inode_opts_get(&e.io_opts, trans->c, &inode); darray_push(&io_opts->d, e); })); - io_opts->cur_inum = extent_k.k->p.inode; + io_opts->cur_inum = extent_pos.inode; } ret = ret ?: trans_was_restarted(trans, restart_count); @@ -415,43 +432,46 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, if (extent_k.k->p.snapshot) darray_for_each(io_opts->d, i) - if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) - return &i->io_opts; - - return &io_opts->fs_io_opts; + if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot)) { + opts_ret = &i->io_opts; + break; + } +out: + ret = bch2_get_update_rebalance_opts(trans, opts_ret, extent_iter, extent_k); + if (ret) + return ERR_PTR(ret); + return opts_ret; } int bch2_move_get_io_opts_one(struct btree_trans *trans, struct bch_io_opts *io_opts, + struct btree_iter *extent_iter, struct bkey_s_c extent_k) { - struct btree_iter iter; - struct bkey_s_c k; - int ret; + struct bch_fs *c = trans->c; + + *io_opts = bch2_opts_to_inode_opts(c->opts); /* reflink btree? 
*/ - if (!extent_k.k->p.inode) { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); - return 0; - } + if (!extent_k.k->p.inode) + goto out; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + struct btree_iter inode_iter; + struct bkey_s_c inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot), BTREE_ITER_cached); - ret = bkey_err(k); + int ret = bkey_err(inode_k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - if (!ret && bkey_is_inode(k.k)) { + if (!ret && bkey_is_inode(inode_k.k)) { struct bch_inode_unpacked inode; - bch2_inode_unpack(k, &inode); - bch2_inode_opts_get(io_opts, trans->c, &inode); - } else { - *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + bch2_inode_unpack(inode_k, &inode); + bch2_inode_opts_get(io_opts, c, &inode); } - - bch2_trans_iter_exit(trans, &iter); - return 0; + bch2_trans_iter_exit(trans, &inode_iter); +out: + return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } int bch2_move_ratelimit(struct moving_context *ctxt) @@ -509,9 +529,15 @@ static int bch2_move_data_btree(struct moving_context *ctxt, struct per_snapshot_io_opts snapshot_io_opts; struct bch_io_opts *io_opts; struct bkey_buf sk; - struct btree_iter iter; + struct btree_iter iter, reflink_iter = {}; struct bkey_s_c k; struct data_update_opts data_opts; + /* + * If we're moving a single file, also process reflinked data it points + * to (this includes propagating changed io_opts from the inode to the + * extent): + */ + bool walk_indirect = start.inode == end.inode; int ret = 0, ret2; per_snapshot_io_opts_init(&snapshot_io_opts, c); @@ -531,6 +557,8 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_ratelimit_reset(ctxt->rate); while (!bch2_move_ratelimit(ctxt)) { + struct btree_iter *extent_iter = &iter; + bch2_trans_begin(trans); k = bch2_btree_iter_peek(&iter); @@ -549,10 +577,36 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ctxt->stats) ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos); + if (walk_indirect && + k.k->type == KEY_TYPE_reflink_p && + REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + + bch2_trans_iter_exit(trans, &reflink_iter); + k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_deleted(k.k)) + goto next_nondata; + + /* + * XXX: reflink pointers may point to multiple indirect + * extents, so don't advance past the entire reflink + * pointer - need to fixup iter->k + */ + extent_iter = &reflink_iter; + } + if (!bkey_extent_is_direct_data(k.k)) goto next_nondata; - io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k); + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + iter.pos, extent_iter, k); ret = PTR_ERR_OR_ZERO(io_opts); if (ret) continue; @@ -568,7 +622,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts); + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; @@ -589,6 +643,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, 
bch2_btree_iter_advance(&iter); } + bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); per_snapshot_io_opts_exit(&snapshot_io_opts); @@ -654,16 +709,12 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bch_fs *c = trans->c; bool is_kthread = current->flags & PF_KTHREAD; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); - struct btree_iter iter; + struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; - struct bch_backpointer bp; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct bkey_s_c k; struct data_update_opts data_opts; - unsigned dirty_sectors, bucket_size; - u64 fragmentation; - struct bpos bp_pos = POS_MIN; + unsigned sectors_moved = 0; + struct bkey_buf last_flushed; int ret = 0; struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); @@ -672,6 +723,8 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, trace_bucket_evacuate(c, &bucket); + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); bch2_bkey_buf_init(&sk); /* @@ -679,21 +732,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, - bucket, BTREE_ITER_cached); - ret = lockrestart_do(trans, - bkey_err(k = bch2_btree_iter_peek_slot(&iter))); - bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_start(ca, bucket), 0); bch_err_msg(c, ret, "looking up alloc key"); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - dirty_sectors = bch2_bucket_sectors_dirty(*a); - bucket_size = ca->mi.bucket_size; - fragmentation = alloc_lru_idx_fragmentation(*a, ca); - ret = bch2_btree_write_buffer_tryflush(trans); bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) @@ -705,18 +750,23 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = bch2_get_next_backpointer(trans, ca, bucket, gen, - &bp_pos, &bp, - BTREE_ITER_cached); + k = bch2_btree_iter_peek(&bp_iter); + ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (bkey_eq(bp_pos, POS_MAX)) + + if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) break; - if (!bp.level) { - k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); + if (k.k->type != KEY_TYPE_backpointer) + goto next; + + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + + if (!bp.v->level) { + k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -728,7 +778,7 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); + ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; @@ -738,14 +788,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, data_opts.target = io_opts.background_target; data_opts.rewrite_ptrs = 0; + unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ unsigned i = 0; - bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { - if (ptr->dev == bucket.inode) { - data_opts.rewrite_ptrs |= 1U << i; - if (ptr->cached) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { + if (p.ptr.dev == bucket.inode) { + if (p.ptr.cached) { 
bch2_trans_iter_exit(trans, &iter); goto next; } + data_opts.rewrite_ptrs |= 1U << i; + break; } i++; } @@ -765,14 +819,15 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, goto err; if (ctxt->stats) - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + atomic64_add(sectors, &ctxt->stats->sectors_seen); + sectors_moved += sectors; } else { struct btree *b; - b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); + b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); ret = PTR_ERR_OR_ZERO(b); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - continue; + goto next; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) @@ -796,15 +851,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, atomic64_add(sectors, &ctxt->stats->sectors_seen); atomic64_add(sectors, &ctxt->stats->sectors_moved); } + sectors_moved += btree_sectors(c); } next: - bp_pos = bpos_nosnap_successor(bp_pos); + bch2_btree_iter_advance(&bp_iter); } - trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); + trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); err: + bch2_trans_iter_exit(trans, &bp_iter); bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); + bch2_bkey_buf_exit(&last_flushed, c); return ret; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 9baf3093a678a..51e0505a8156a 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -110,9 +110,8 @@ static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opt darray_exit(&io_opts->d); } -struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, - struct per_snapshot_io_opts *, struct bkey_s_c); -int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c); +int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, + struct btree_iter *, struct bkey_s_c); int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index d658be90f7378..85c361e78ba53 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -167,7 +167,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, bch2_trans_begin(trans); - ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru, + ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), 0, k, ({ @@ -350,9 +350,9 @@ static int bch2_copygc_thread(void *arg) bch2_trans_unlock_long(ctxt.trans); cond_resched(); - if (!c->copy_gc_enabled) { + if (!c->opts.copygc_enabled) { move_buckets_wait(&ctxt, buckets, true); - kthread_wait_freezable(c->copy_gc_enabled || + kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 0e2ee262fbd4b..6772faf385a50 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include "bcachefs.h" #include "compress.h" @@ -48,12 +49,12 @@ static const char * const __bch2_csum_types[] = { NULL }; -const char * const bch2_csum_opts[] = { +const char * const __bch2_csum_opts[] = { BCH_CSUM_OPTS() NULL }; -static const char * const __bch2_compression_types[] = { +const char * const __bch2_compression_types[] = { BCH_COMPRESSION_TYPES() NULL }; @@ -113,6 +114,7 @@ void bch2_prt_##name(struct printbuf *out, type t) \ PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type, enum bch_jset_entry_type); 
PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type); PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type); +PRT_STR_OPT_BOUNDSCHECKED(csum_opt, enum bch_csum_opt); PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type); PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type); PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type); @@ -333,17 +335,18 @@ int bch2_opt_parse(struct bch_fs *c, switch (opt->type) { case BCH_OPT_BOOL: if (val) { - ret = kstrtou64(val, 10, res); + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; + } else { + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } } else { - ret = 0; *res = 1; } - if (ret < 0 || (*res != 0 && *res != 1)) { - if (err) - prt_printf(err, "%s: must be bool", opt->attr.name); - return ret < 0 ? ret : -BCH_ERR_option_not_bool; - } break; case BCH_OPT_UINT: if (!val) { @@ -710,11 +713,14 @@ void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) { - return (struct bch_io_opts) { + struct bch_io_opts opts = { #define x(_name, _bits) ._name = src._name, BCH_INODE_OPTS() #undef x }; + + bch2_io_opts_fixups(&opts); + return opts; } bool bch2_opt_is_inode_opt(enum bch_opt_id id) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 23dda014e331b..c7d454e378413 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -16,7 +16,8 @@ extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; -extern const char * const bch2_csum_opts[]; +extern const char * const __bch2_csum_opts[]; +extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; extern const char * const __bch2_str_hash_types[]; extern const char * const bch2_str_hash_opts[]; @@ -27,6 +28,7 @@ extern const char * const bch2_d_types[]; void bch2_prt_jset_entry_type(struct printbuf *, enum bch_jset_entry_type); void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type); void bch2_prt_data_type(struct printbuf *, enum bch_data_type); +void bch2_prt_csum_opt(struct printbuf *, enum bch_csum_opt); void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type); void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type); void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type); @@ -171,12 +173,12 @@ enum fsck_err_opts { "size", "Maximum size of checksummed/compressed extents")\ x(metadata_checksum, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(data_checksum, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_STR(bch2_csum_opts), \ + OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ x(compression, u8, \ @@ -220,14 +222,14 @@ enum fsck_err_opts { BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ x(inodes_32bit, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, true, \ NULL, "Constrain inode numbers to 32 bits") \ - x(shard_inode_numbers, u8, \ - OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ - OPT_BOOL(), \ - BCH_SB_SHARD_INUMS, true, \ + x(shard_inode_numbers_bits, u8, \ + 
OPT_FS|OPT_FORMAT, \ + OPT_UINT(0, 8), \ + BCH_SB_SHARD_INUMS_NBITS, 0, \ NULL, "Shard new inode numbers by CPU id") \ x(inodes_use_key_cache, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ @@ -473,6 +475,18 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable nocow mode: enables runtime locking in\n"\ "data move path needed if nocow will ever be in use\n")\ + x(copygc_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable copygc: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ + x(rebalance_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable rebalance: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ @@ -488,7 +502,7 @@ enum fsck_err_opts { OPT_DEVICE, \ OPT_UINT(0, S64_MAX), \ BCH2_NO_SB_OPT, 0, \ - "size", "Size of filesystem on device") \ + "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ @@ -504,7 +518,7 @@ enum fsck_err_opts { OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ " prefetched sequentially") #define BCH_DEV_OPT_SETTERS() \ @@ -624,14 +638,39 @@ struct bch_io_opts { #define x(_name, _bits) u##_bits _name; BCH_INODE_OPTS() #undef x +#define x(_name, _bits) u64 _name##_from_inode:1; + BCH_INODE_OPTS() +#undef x }; -static inline unsigned background_compression(struct bch_io_opts opts) +static inline void bch2_io_opts_fixups(struct bch_io_opts *opts) { - return opts.background_compression ?: opts.compression; + if (!opts->background_target) + opts->background_target = opts->foreground_target; + if (!opts->background_compression) + opts->background_compression = opts->compression; + if (opts->nocow) { + opts->compression = opts->background_compression = 0; + opts->data_checksum = 0; + opts->erasure_code = 0; + } } struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); bool bch2_opt_is_inode_opt(enum bch_opt_id); +/* rebalance opts: */ + +static inline struct bch_extent_rebalance io_opts_to_rebalance_opts(struct bch_io_opts *opts) +{ + return (struct bch_extent_rebalance) { + .type = BIT(BCH_EXTENT_ENTRY_rebalance), +#define x(_name) \ + ._name = opts->_name, \ + ._name##_from_inode = opts->_name##_from_inode, + BCH_REBALANCE_OPTS() +#undef x + }; +}; + #endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index 4cf5a2af1e6ff..855c7c07b6185 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -236,7 +236,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *buf) * bch2_printbuf_tabstop_set() - add a tabstop, n spaces from the previous tabstop * * @buf: printbuf to control - * @spaces: number of spaces from previous tabpstop + * @spaces: number of spaces from previous tabstop * * In the future this function may allocate memory if setting more than * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start @@ -388,7 +388,7 @@ void bch2_prt_tab_rjust(struct printbuf *buf) * @str: string to print * @count: number of bytes to print * - * The following contol characters are handled as so: + * The following control characters are handled as so: * \n: prt_newline newline that obeys current indent level * \t: prt_tab advance to next tabstop * \r: 
prt_tab_rjust advance to next tabstop, with right justification @@ -435,7 +435,7 @@ void bch2_prt_human_readable_s64(struct printbuf *out, s64 v) * @out: output printbuf * @v: integer to print * - * Units are either raw (default), or human reabable units (controlled via + * Units are either raw (default), or human readable units (controlled via * @buf->human_readable_units) */ void bch2_prt_units_u64(struct printbuf *out, u64 v) @@ -451,7 +451,7 @@ void bch2_prt_units_u64(struct printbuf *out, u64 v) * @out: output printbuf * @v: integer to print * - * Units are either raw (default), or human reabable units (controlled via + * Units are either raw (default), or human readable units (controlled via * @buf->human_readable_units) */ void bch2_prt_units_s64(struct printbuf *out, s64 v) diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 1d570387b77f1..bfb2ca11afe4c 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -25,7 +25,7 @@ * everything to the kernel log buffer, and then those pretty-printers can be * used by other code that outputs to kernel log, sysfs, debugfs, etc. * - * Memory allocation: Outputing to a printbuf may allocate memory. This + * Memory allocation: Outputting to a printbuf may allocate memory. This * allocation is done with GFP_KERNEL, by default: use the newer * memalloc_*_(save|restore) functions as needed. * @@ -56,7 +56,7 @@ * next tabstop - right justifying it. * * Make sure you use prt_newline() instead of \n in the format string for indent - * level and tabstops to work corretly. + * level and tabstops to work correctly. * * Output units: printbuf->units exists to tell pretty-printers how to output * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 74f45a8162ad7..8b857fc33244a 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -60,7 +60,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_quota = { }; int bch2_quota_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index a62abcc5332ad..1551800ff44c9 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -5,10 +5,10 @@ #include "inode.h" #include "quota_types.h" -enum bch_validate_flags; extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_quota_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_quota ((struct bkey_ops) { \ diff --git a/fs/bcachefs/rcu_pending.c b/fs/bcachefs/rcu_pending.c index 40a20192eee89..0f60f5632bdb7 100644 --- a/fs/bcachefs/rcu_pending.c +++ b/fs/bcachefs/rcu_pending.c @@ -25,21 +25,37 @@ enum rcu_pending_special { #define RCU_PENDING_KVFREE_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_KVFREE) #define RCU_PENDING_CALL_RCU_FN ((rcu_pending_process_fn) (ulong) RCU_PENDING_CALL_RCU) -static inline unsigned long __get_state_synchronize_rcu(struct srcu_struct *ssp) +#ifdef __KERNEL__ +typedef unsigned long rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return l == r; +} +#else +typedef struct urcu_gp_poll_state rcu_gp_poll_state_t; + +static inline bool rcu_gp_poll_cookie_eq(rcu_gp_poll_state_t l, rcu_gp_poll_state_t r) +{ + return 
l.grace_period_id == r.grace_period_id; +} +#endif + +static inline rcu_gp_poll_state_t __get_state_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? get_state_synchronize_srcu(ssp) : get_state_synchronize_rcu(); } -static inline unsigned long __start_poll_synchronize_rcu(struct srcu_struct *ssp) +static inline rcu_gp_poll_state_t __start_poll_synchronize_rcu(struct srcu_struct *ssp) { return ssp ? start_poll_synchronize_srcu(ssp) : start_poll_synchronize_rcu(); } -static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, unsigned long cookie) +static inline bool __poll_state_synchronize_rcu(struct srcu_struct *ssp, rcu_gp_poll_state_t cookie) { return ssp ? poll_state_synchronize_srcu(ssp, cookie) @@ -71,13 +87,13 @@ struct rcu_pending_seq { GENRADIX(struct rcu_head *) objs; size_t nr; struct rcu_head **cursor; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_list { struct rcu_head *head; struct rcu_head *tail; - unsigned long seq; + rcu_gp_poll_state_t seq; }; struct rcu_pending_pcpu { @@ -316,10 +332,10 @@ static void rcu_pending_rcu_cb(struct rcu_head *rcu) } static __always_inline struct rcu_pending_seq * -get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) +get_object_radix(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq) { darray_for_each_reverse(p->objs, objs) - if (objs->seq == seq) + if (rcu_gp_poll_cookie_eq(objs->seq, seq)) return objs; if (darray_push_gfp(&p->objs, ((struct rcu_pending_seq) { .seq = seq }), GFP_ATOMIC)) @@ -329,7 +345,7 @@ get_object_radix(struct rcu_pending_pcpu *p, unsigned long seq) } static noinline bool -rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, +rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, rcu_gp_poll_state_t seq, struct rcu_head *head, void *ptr, unsigned long *flags) { @@ -338,7 +354,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, /* * kvfree_rcu_mightsleep(): we weren't passed an * rcu_head, but we need one: use the low bit of the - * ponter to free to flag that the head needs to be + * pointer to free to flag that the head needs to be * freed as well: */ ptr = (void *)(((unsigned long) ptr)|1UL); @@ -364,7 +380,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, again: for (struct rcu_pending_list *i = p->lists; i < p->lists + NUM_ACTIVE_RCU_POLL_OLDSTATE; i++) { - if (i->seq == seq) { + if (rcu_gp_poll_cookie_eq(i->seq, seq)) { rcu_pending_list_add(i, head); return false; } @@ -385,7 +401,7 @@ rcu_pending_enqueue_list(struct rcu_pending_pcpu *p, unsigned long seq, /* * __rcu_pending_enqueue: enqueue a pending RCU item, to be processed (via - * pending->pracess) once grace period elapses. + * pending->process) once grace period elapses. * * Attempt to enqueue items onto a radix tree; if memory allocation fails, fall * back to a linked list. 
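
The typedef dance above exists because the kernel's polled grace-period cookie is an unsigned long, comparable with ==, while userspace liburcu's is a struct, hence the rcu_gp_poll_cookie_eq() helper. The underlying protocol, modeled as a standalone toy (an illustration of the idea only, not the real RCU API):

#include <stdbool.h>
#include <stdio.h>

typedef unsigned long gp_cookie_t;

static unsigned long gp_completed;	/* grace periods finished so far */

/* cookie identifying a grace period that has not yet completed: */
static gp_cookie_t get_state(void)    { return gp_completed + 1; }
static bool poll_state(gp_cookie_t c) { return gp_completed >= c; }
static bool cookie_eq(gp_cookie_t l, gp_cookie_t r) { return l == r; }
static void run_grace_period(void)    { gp_completed++; }

int main(void)
{
	/* objects enqueued now are batched under the current cookie; the
	 * batch may be processed (freed) only once poll_state() says the
	 * grace period it names has elapsed: */
	gp_cookie_t batch_seq = get_state();

	printf("same batch? %d\n", cookie_eq(batch_seq, get_state()));
	printf("before GP: can free? %d\n", poll_state(batch_seq));
	run_grace_period();
	printf("after GP:  can free? %d\n", poll_state(batch_seq));
	return 0;
}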
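
Back in the journal_types.h hunk, JOURNAL_SEQ_MAX caps sequence numbers at 56 bits because, per the new comment, the btree write buffer steals 8 bits of the value for its own purposes. A generic illustration of that kind of split (the tag semantics here are invented; only the 56/8 division comes from the patch):

#include <stdint.h>
#include <stdio.h>

#define SEQ_BITS	56
#define JOURNAL_SEQ_MAX	((1ULL << SEQ_BITS) - 1)

int main(void)
{
	uint64_t seq = 123456789, tag = 0xab;

	/* pack an 8-bit tag above a 56-bit sequence number: */
	uint64_t packed = (tag << SEQ_BITS) | (seq & JOURNAL_SEQ_MAX);

	printf("tag 0x%llx, seq %llu\n",
	       (unsigned long long)(packed >> SEQ_BITS),
	       (unsigned long long)(packed & JOURNAL_SEQ_MAX));
	return 0;
}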
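
The opts.c bool hunk earlier is a behavior change as well as a cleanup: instead of kstrtou64(), which only accepted 0 and 1, boolean options are now resolved through a constant table the way fs_parser does it, with -BCH_ERR_option_not_bool as the not-found sentinel. A userspace model of the lookup (this table mirrors the kernel's bool_names; the kernel's lookup_constant() takes its table length implicitly, so the signature here is simplified):

#include <stdio.h>
#include <string.h>

struct constant_table { const char *name; int value; };

static const struct constant_table bool_names[] = {
	{ "0", 0 }, { "1", 1 },
	{ "off", 0 }, { "on", 1 },
	{ "false", 0 }, { "true", 1 },
	{ "no", 0 }, { "yes", 1 },
};

#define NOT_BOOL (-1000)	/* stand-in for -BCH_ERR_option_not_bool */

static int lookup_constant(const struct constant_table *tbl, size_t n,
			   const char *name, int not_found)
{
	for (size_t i = 0; i < n; i++)
		if (!strcmp(tbl[i].name, name))
			return tbl[i].value;
	return not_found;
}

int main(void)
{
	const char *args[] = { "yes", "off", "banana" };

	for (int i = 0; i < 3; i++) {
		int v = lookup_constant(bool_names,
					sizeof(bool_names) / sizeof(bool_names[0]),
					args[i], NOT_BOOL);
		if (v == NOT_BOOL)
			printf("%s: must be bool\n", args[i]);
		else
			printf("%s -> %d\n", args[i], v);
	}
	return 0;
}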
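
The opts.h hunk above grows struct bch_io_opts with one _from_inode bit per inode option by re-expanding the same BCH_INODE_OPTS() X-macro list, and io_opts_to_rebalance_opts() re-expands BCH_REBALANCE_OPTS() to copy each value together with its origin bit. A self-contained demo of that X-macro technique (the option names here are placeholders, not bcachefs options):

#include <stdio.h>

#define DEMO_OPTS()		\
	x(compression)		\
	x(replicas)

struct opts {
	/* one expansion declares the values, a second the origin bits: */
#define x(_name) int _name;
	DEMO_OPTS()
#undef x
#define x(_name) unsigned _name##_from_inode:1;
	DEMO_OPTS()
#undef x
};

static void print_opts(const struct opts *o)
{
#define x(_name) \
	printf(#_name "=%d from_inode=%u\n", o->_name, o->_name##_from_inode);
	DEMO_OPTS()
#undef x
}

int main(void)
{
	struct opts o = {
		.compression		= 4,
		.replicas		= 2,
		.replicas_from_inode	= 1,
	};

	print_opts(&o);
	return 0;
}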
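
One note ahead of the new rebalance_format.h further below: on-disk bitfield structs like bch_extent_rebalance are declared twice, with the field order reversed between __LITTLE_ENDIAN_BITFIELD and __BIG_ENDIAN_BITFIELD, because C leaves bitfield allocation order to the ABI and this mirroring is how the kernel keeps the on-disk bit layout identical on both. A reduced example of the convention (two payload fields instead of the real dozen; the fallback define exists only so this compiles standalone, outside the kernel's asm/byteorder.h):

#include <stdio.h>

#ifndef __LITTLE_ENDIAN_BITFIELD
#define __LITTLE_ENDIAN_BITFIELD 1	/* assume a little-endian host for this demo */
#endif

/* a 64-bit on-disk word: 6-bit type in the low bits, rest payload;
 * the field list flips under the big-endian ABI so the bits land in
 * the same place on disk either way: */
struct disk_word {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	unsigned long long	type:6,
				unused:3,
				payload:55;
#elif defined(__BIG_ENDIAN_BITFIELD)
	unsigned long long	payload:55,
				unused:3,
				type:6;
#endif
};

int main(void)
{
	struct disk_word w = { .type = 42, .payload = 7 };

	printf("sizeof=%zu type=%llu payload=%llu\n",
	       sizeof(w), (unsigned long long)w.type,
	       (unsigned long long)w.payload);
	return 0;
}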
@@ -408,7 +424,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, struct rcu_pending_pcpu *p; struct rcu_pending_seq *objs; struct genradix_node *new_node = NULL; - unsigned long seq, flags; + unsigned long flags; bool start_gp = false; BUG_ON((ptr != NULL) != (pending->process == RCU_PENDING_KVFREE_FN)); @@ -416,7 +432,7 @@ __rcu_pending_enqueue(struct rcu_pending *pending, struct rcu_head *head, local_irq_save(flags); p = this_cpu_ptr(pending->p); spin_lock(&p->lock); - seq = __get_state_synchronize_rcu(pending->srcu); + rcu_gp_poll_state_t seq = __get_state_synchronize_rcu(pending->srcu); restart: if (may_sleep && unlikely(process_finished_items(pending, p, flags))) diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index cd6647374353f..f265a64da91b5 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -24,6 +24,192 @@ #include #include +/* bch_extent_rebalance: */ + +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + + bkey_extent_entry_for_each(ptrs, entry) + if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance) + return &entry->rebalance; + + return NULL; +} + +static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_compression) + return 0; + + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) + return 0; + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_ptrs_c ptrs) +{ + if (!opts->background_target || + !bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) + return 0; + + unsigned ptr_bit = 1; + unsigned rewrite_ptrs = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) + rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } + + return rewrite_ptrs; +} + +static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, + struct bch_io_opts *opts, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | + bch2_bkey_ptrs_need_move(c, opts, ptrs); +} + +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) +{ + const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + if (!opts) + return 0; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 sectors = 0; + + if (opts->background_compression) { + unsigned compression_type = bch2_compression_opt_to_type(opts->background_compression); + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible || + p.ptr.unwritten) { + sectors = 0; + goto incompressible; + } + + if (!p.ptr.cached && p.crc.compression_type != compression_type) + sectors += p.crc.compressed_size; + } + } +incompressible: + if (opts->background_target && + 
bch2_target_accepts_data(c, BCH_DATA_user, opts->background_target)) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + sectors += p.crc.compressed_size; + } + + return sectors; +} + +static bool bch2_bkey_rebalance_needs_update(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_s_c k) +{ + if (!bkey_extent_is_direct_data(k.k)) + return 0; + + const struct bch_extent_rebalance *old = bch2_bkey_rebalance_opts(k); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k)) { + struct bch_extent_rebalance new = io_opts_to_rebalance_opts(opts); + return old == NULL || memcmp(old, &new, sizeof(new)); + } else { + return old != NULL; + } +} + +int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bch_io_opts *opts, + struct bkey_i *_k) +{ + if (!bkey_extent_is_direct_data(&_k->k)) + return 0; + + struct bkey_s k = bkey_i_to_s(_k); + struct bch_extent_rebalance *old = + (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c); + + if (k.k->type == KEY_TYPE_reflink_v || bch2_bkey_ptrs_need_rebalance(c, opts, k.s_c)) { + if (!old) { + old = bkey_val_end(k); + k.k->u64s += sizeof(*old) / sizeof(u64); + } + + *old = io_opts_to_rebalance_opts(opts); + } else { + if (old) + extent_entry_drop(k, (union bch_extent_entry *) old); + } + + return 0; +} + +int bch2_get_update_rebalance_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct btree_iter *iter, + struct bkey_s_c k) +{ + BUG_ON(iter->flags & BTREE_ITER_is_extents); + BUG_ON(iter->flags & BTREE_ITER_filter_snapshots); + + const struct bch_extent_rebalance *r = k.k->type == KEY_TYPE_reflink_v + ? bch2_bkey_rebalance_opts(k) : NULL; + if (r) { +#define x(_name) \ + if (r->_name##_from_inode) { \ + io_opts->_name = r->_name; \ + io_opts->_name##_from_inode = true; \ + } + BCH_REBALANCE_OPTS() +#undef x + } + + if (!bch2_bkey_rebalance_needs_update(trans->c, io_opts, k)) + return 0; + + struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + 8); + int ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + bkey_reassemble(n, k); + + /* On successful transaction commit, @k was invalidated: */ + + return bch2_bkey_set_needs_rebalance(trans->c, io_opts, n) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; +} + #define REBALANCE_WORK_SCAN_OFFSET (U64_MAX - 1) static const char * const bch2_rebalance_state_strs[] = { @@ -33,7 +219,7 @@ static const char * const bch2_rebalance_state_strs[] = { #undef x }; -static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) { struct btree_iter iter; struct bkey_s_c k; @@ -71,9 +257,8 @@ static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum) int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) { int ret = bch2_trans_commit_do(c, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_lazy_rw, - __bch2_set_rebalance_needs_scan(trans, inum)); + BCH_TRANS_COMMIT_no_enospc, + bch2_set_rebalance_needs_scan_trans(trans, inum)); rebalance_wakeup(c); return ret; } @@ -121,6 +306,9 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { + if (!bch2_bkey_rebalance_opts(k)) + return 0; + struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); int ret = 
PTR_ERR_OR_ZERO(n); if (ret) @@ -134,31 +322,27 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, struct bpos work_pos, struct btree_iter *extent_iter, + struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { struct bch_fs *c = trans->c; - struct bkey_s_c k; bch2_trans_iter_exit(trans, extent_iter); bch2_trans_iter_init(trans, extent_iter, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, BTREE_ITER_all_snapshots); - k = bch2_btree_iter_peek_slot(extent_iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); if (bkey_err(k)) return k; - const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL; - if (!r) { - /* raced due to btree write buffer, nothing to do */ - return bkey_s_c_null; - } + int ret = bch2_move_get_io_opts_one(trans, io_opts, extent_iter, k); + if (ret) + return bkey_s_c_err(ret); memset(data_opts, 0, sizeof(*data_opts)); - - data_opts->rewrite_ptrs = - bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression); - data_opts->target = r->target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; if (!data_opts->rewrite_ptrs) { @@ -178,12 +362,28 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, if (trace_rebalance_extent_enabled()) { struct printbuf buf = PRINTBUF; - prt_str(&buf, "target="); - bch2_target_to_text(&buf, c, r->target); - prt_str(&buf, " compression="); - bch2_compression_opt_to_text(&buf, r->compression); - prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + unsigned p = bch2_bkey_ptrs_need_compress(c, io_opts, k, ptrs); + if (p) { + prt_str(&buf, "compression="); + bch2_compression_opt_to_text(&buf, io_opts->background_compression); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } + + p = bch2_bkey_ptrs_need_move(c, io_opts, ptrs); + if (p) { + prt_str(&buf, "move="); + bch2_target_to_text(&buf, c, io_opts->background_target); + prt_str(&buf, " "); + bch2_prt_u64_base2(&buf, p); + prt_newline(&buf); + } trace_rebalance_extent(c, buf.buf); printbuf_exit(&buf); @@ -212,14 +412,10 @@ static int do_rebalance_extent(struct moving_context *ctxt, bch2_bkey_buf_init(&sk); ret = bkey_err(k = next_rebalance_extent(trans, work_pos, - extent_iter, &data_opts)); + extent_iter, &io_opts, &data_opts)); if (ret || !k.k) goto out; - ret = bch2_move_get_io_opts_one(trans, &io_opts, k); - if (ret) - goto out; - atomic64_add(k.k->size, &ctxt->stats->sectors_seen); /* @@ -253,20 +449,8 @@ static bool rebalance_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - unsigned target, compression; - - if (k.k->p.inode) { - target = io_opts->background_target; - compression = background_compression(*io_opts); - } else { - const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k); - - target = r ? r->target : io_opts->background_target; - compression = r ? 
r->compression : background_compression(*io_opts); - } - - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression); - data_opts->target = target; + data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); + data_opts->target = io_opts->background_target; data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; return data_opts->rewrite_ptrs != 0; } @@ -338,9 +522,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!r->enabled) { + if (!c->opts.rebalance_enabled) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(r->enabled || + kthread_wait_freezable(c->opts.rebalance_enabled || kthread_should_stop()); } @@ -453,7 +637,7 @@ void bch2_rebalance_stop(struct bch_fs *c) c->rebalance.thread = NULL; if (p) { - /* for sychronizing with rebalance_wakeup() */ + /* for synchronizing with rebalance_wakeup() */ synchronize_rcu(); kthread_stop(p); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 28a52638f16cc..0a0821ab895d8 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -2,8 +2,18 @@ #ifndef _BCACHEFS_REBALANCE_H #define _BCACHEFS_REBALANCE_H +#include "compress.h" +#include "disk_groups.h" #include "rebalance_types.h" +u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *, struct bkey_s_c); +int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bch_io_opts *, struct bkey_i *); +int bch2_get_update_rebalance_opts(struct btree_trans *, + struct bch_io_opts *, + struct btree_iter *, + struct bkey_s_c); + +int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs *); diff --git a/fs/bcachefs/rebalance_format.h b/fs/bcachefs/rebalance_format.h new file mode 100644 index 0000000000000..ff9a1342a22b4 --- /dev/null +++ b/fs/bcachefs/rebalance_format.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_FORMAT_H +#define _BCACHEFS_REBALANCE_FORMAT_H + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:3, + + promote_target_from_inode:1, + erasure_code_from_inode:1, + data_checksum_from_inode:1, + background_compression_from_inode:1, + data_replicas_from_inode:1, + background_target_from_inode:1, + + promote_target:16, + erasure_code:1, + data_checksum:4, + data_replicas:4, + background_compression:8, /* enum bch_compression_opt */ + background_target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 background_target:16, + background_compression:8, + data_replicas:4, + data_checksum:4, + erasure_code:1, + promote_target:16, + + background_target_from_inode:1, + data_replicas_from_inode:1, + background_compression_from_inode:1, + data_checksum_from_inode:1, + erasure_code_from_inode:1, + promote_target_from_inode:1, + + unused:3, + type:6; +#endif +}; + +/* subset of BCH_INODE_OPTS */ +#define BCH_REBALANCE_OPTS() \ + x(data_checksum) \ + x(background_compression) \ + x(data_replicas) \ + x(promote_target) \ + x(background_target) \ + x(erasure_code) + +#endif /* _BCACHEFS_REBALANCE_FORMAT_H */ + diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 0fffb536c1d0c..fe5098c17dfc0 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -30,8 +30,6 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; - - unsigned enabled:1; }; #endif /* 
_BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3c7f941dde39a..98825437381c6 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -34,21 +34,83 @@ #define QSTR(n) { { { .len = strlen(n) } }, .name = n } -void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) { - if (btree >= BTREE_ID_NR_MAX) - return; - u64 b = BIT_ULL(btree); + int ret = 0; + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (!(c->sb.btrees_lost_data & b)) { - bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree)); + struct printbuf buf = PRINTBUF; + bch2_btree_id_to_text(&buf, btree); + bch_err(c, "flagging btree %s lost data", buf.buf); + printbuf_exit(&buf); + ext->btrees_lost_data |= cpu_to_le64(b); + } - mutex_lock(&c->sb_lock); - bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b); - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + /* Once we have runtime self healing for topology errors we won't need this: */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + + /* Btree node accounting will be off: */ + __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + +#ifdef CONFIG_BCACHEFS_DEBUG + /* + * These are much more minor, and don't need to be corrected right away, + * but in debug mode we want the next fsck run to be clean: + */ + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; +#endif + + switch (btree) { + case BTREE_ID_alloc: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_cached_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + goto out; + case BTREE_ID_backpointers: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + goto out; + case BTREE_ID_need_discard: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_freespace: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_bucket_gens: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_lru: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + goto out; + case BTREE_ID_accounting: + ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + goto out; + default: + ret = 
bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + goto out; } +out: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +static void kill_btree(struct bch_fs *c, enum btree_id btree) +{ + bch2_btree_id_root(c, btree)->alive = false; + bch2_shoot_down_journal_keys(c, btree, 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); } /* for -o reconstruct_alloc: */ @@ -79,6 +141,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) __set_bit_le64(BCH_FSCK_ERR_fs_usage_persistent_reserved_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_fs_usage_replicas_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_to_missing_lru_entry, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); @@ -99,16 +163,9 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - bch2_shoot_down_journal_keys(c, BTREE_ID_alloc, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_freespace, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); - bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens, - 0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX); + for (unsigned i = 0; i < btree_id_nr_alive(c); i++) + if (btree_id_is_alloc(i)) + kill_btree(c, i); } /* @@ -354,10 +411,13 @@ int bch2_journal_replay(struct bch_fs *c) ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim : 0), bch2_journal_replay_key(trans, k)); - bch_err_msg(c, ret, "while replaying key at btree %s level %u:", - bch2_btree_id_str(k->btree_id), k->level); - if (ret) + if (ret) { + struct printbuf buf = PRINTBUF; + bch2_btree_id_level_to_text(&buf, k->btree_id, k->level); + bch_err_msg(c, ret, "while replaying key at %s:", buf.buf); + printbuf_exit(&buf); goto err; + } BUG_ON(k->btree_id != BTREE_ID_accounting && !k->overwritten); } @@ -403,7 +463,9 @@ static int journal_replay_entry_early(struct bch_fs *c, switch (entry->type) { case BCH_JSET_ENTRY_btree_root: { - struct btree_root *r; + + if (unlikely(!entry->u64s)) + return 0; if (fsck_err_on(entry->btree_id >= BTREE_ID_NR_MAX, c, invalid_btree_id, @@ -417,15 +479,11 @@ static int journal_replay_entry_early(struct bch_fs *c, return ret; } - r = bch2_btree_id_root(c, entry->btree_id); + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); - if (entry->u64s) { - r->level = entry->level; - bkey_copy(&r->key, (struct bkey_i *) entry->start); - r->error = 0; - } else { - r->error = -BCH_ERR_btree_node_read_error; - } + r->level = entry->level; + bkey_copy(&r->key, (struct bkey_i *) entry->start); + r->error = 0; r->alive = true; break; } @@ -505,6 +563,7 @@ static int journal_replay_early(struct bch_fs *c, static int read_btree_roots(struct bch_fs *c) { + struct printbuf buf = PRINTBUF; int ret = 0; for (unsigned i = 0; i < btree_id_nr_alive(c); i++) { @@ -513,33 +572,22 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) - continue; + printbuf_reset(&buf); + bch2_btree_id_level_to_text(&buf, i, r->level); if (mustfix_fsck_err_on((ret = r->error), c, btree_root_bkey_invalid, "invalid btree root 
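/*
 * A small userspace sketch, not part of the patch, of the GNU C
 * "a ?: b" chaining that bch2_btree_lost_data() above relies on:
 * ret = f() ?: ret takes f()'s result when it is nonzero and keeps the
 * previous value otherwise, so the most recent failure wins and a later
 * success never clears an earlier error. Build with a GNU dialect.
 */
#include <stdio.h>

static int run_step(int err)	/* stand-in for a recovery-pass request */
{
	return err;
}

int main(void)
{
	int ret = 0;

	ret = run_step(0)  ?: ret;	/* success: ret stays 0 */
	ret = run_step(-5) ?: ret;	/* failure: ret becomes -5 */
	ret = run_step(0)  ?: ret;	/* success: the earlier -5 is kept */

	printf("ret = %d\n", ret);	/* prints -5 */
	return 0;
}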
%s", - bch2_btree_id_str(i)) || + buf.buf) || mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)), c, btree_root_read_error, - "error reading btree root %s l=%u: %s", - bch2_btree_id_str(i), r->level, bch2_err_str(ret))) { - if (btree_id_is_alloc(i)) { - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs); - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + "error reading btree root %s: %s", + buf.buf, bch2_err_str(ret))) { + if (btree_id_is_alloc(i)) r->error = 0; - } else if (!(c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) { - bch_info(c, "will run btree node scan"); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes); - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); - } - ret = 0; - bch2_btree_lost_data(c, i); + ret = bch2_btree_lost_data(c, i); + BUG_ON(ret); } } @@ -553,6 +601,7 @@ static int read_btree_roots(struct bch_fs *c) } } fsck_err: + printbuf_exit(&buf); return ret; } @@ -563,6 +612,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_latest_compatible_version(c->sb.version)); unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; unsigned new_version = 0; + bool ret = false; if (old_version < bcachefs_metadata_required_upgrade_below) { if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || @@ -618,14 +668,32 @@ static bool check_version_upgrade(struct bch_fs *c) } bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); - bch2_sb_upgrade(c, new_version); + ret = true; + } + if (new_version > c->sb.version_incompat && + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, new_version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + + bch_info(c, "%s", buf.buf); printbuf_exit(&buf); - return true; + + ret = true; } - return false; + if (ret) + bch2_sb_upgrade(c, new_version, + c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible); + + return ret; } int bch2_fs_recovery(struct bch_fs *c) @@ -660,8 +728,13 @@ int bch2_fs_recovery(struct bch_fs *c) goto err; } - if (c->opts.norecovery) - c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1; + if (c->opts.norecovery) { + c->opts.recovery_pass_last = c->opts.recovery_pass_last + ? 
min(c->opts.recovery_pass_last, BCH_RECOVERY_PASS_snapshots_read) + : BCH_RECOVERY_PASS_snapshots_read; + c->opts.nochanges = true; + c->opts.read_only = true; + } mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); @@ -708,17 +781,20 @@ int bch2_fs_recovery(struct bch_fs *c) c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + if (c->sb.version_upgrade_complete < bcachefs_metadata_version_autofix_errors) { + SET_BCH_SB_ERROR_ACTION(c->disk_sb.sb, BCH_ON_ERROR_fix_safe); + write_sb = true; + } + if (write_sb) bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - c->opts.recovery_passes |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); - if (c->opts.fsck) set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) set_bit(BCH_FS_clean_recovery, &c->flags); + set_bit(BCH_FS_recovery_running, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -807,15 +883,15 @@ int bch2_fs_recovery(struct bch_fs *c) c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) - bch2_reconstruct_alloc(c); - zero_out_btree_mem_ptr(&c->journal_keys); ret = journal_replay_early(c, clean); if (ret) goto err; + if (c->opts.reconstruct_alloc) + bch2_reconstruct_alloc(c); + /* * After an unclean shutdown, skip the next few journal sequence * numbers as they may have been referenced by btree writes that @@ -870,16 +946,17 @@ int bch2_fs_recovery(struct bch_fs *c) */ set_bit(BCH_FS_may_go_rw, &c->flags); clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_recovery_running, &c->flags); /* in case we don't run journal replay, i.e. norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); + bch2_async_btree_node_rewrites_flush(c); + /* fsync if we fixed errors */ - if (test_bit(BCH_FS_errors_fixed, &c->flags) && - bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); } /* If we fixed errors, verify that fs is actually clean now: */ @@ -1021,7 +1098,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_check_version_downgrade(c); if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { - bch2_sb_upgrade(c, bcachefs_metadata_version_current); + bch2_sb_upgrade(c, bcachefs_metadata_version_current, false); SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); bch2_write_super(c); } @@ -1035,7 +1112,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; set_bit(BCH_FS_btree_running, &c->flags); set_bit(BCH_FS_may_go_rw, &c->flags); @@ -1076,9 +1152,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - for_each_online_member(c, ca) - ca->new_fs_bucket_idx = 0; - ret = bch2_fs_freespace_init(c); if (ret) goto err; @@ -1137,6 +1210,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); + c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index 4bf818de1f2fe..b0d55754b21b9 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,7 +2,7 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
+int bch2_btree_lost_data(struct bch_fs *, enum btree_id); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index dff589ddc984d..0b3c951c32da9 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -46,7 +46,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) return bch2_fs_read_write_early(c); return 0; } @@ -100,20 +100,34 @@ u64 bch2_recovery_passes_from_stable(u64 v) /* * For when we need to rewind recovery passes and run a pass we skipped: */ -int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) +static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) { - if (c->opts.recovery_passes & BIT_ULL(pass)) + if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) + return -BCH_ERR_not_in_recovery; + + if (c->recovery_passes_complete & BIT_ULL(pass)) return 0; - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && + c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { + if (print) + bch_info(c, "need recovery pass %s (%u), but already rw", + bch2_recovery_passes[pass], pass); + return -BCH_ERR_cannot_rewind_recovery; + } + + if (print) + bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); c->opts.recovery_passes |= BIT_ULL(pass); - if (c->curr_recovery_pass >= pass) { - c->curr_recovery_pass = pass; + if (c->curr_recovery_pass > pass) { + c->next_recovery_pass = pass; c->recovery_passes_complete &= (1ULL << pass) >> 1; return -BCH_ERR_restart_recovery; } else { @@ -121,6 +135,27 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, } } +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + unsigned long flags; + spin_lock_irqsave(&c->recovery_pass_lock, flags); + int ret = __bch2_run_explicit_recovery_pass(c, pass); + spin_unlock_irqrestore(&c->recovery_pass_lock, flags); + return ret; +} + +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + + return bch2_run_explicit_recovery_pass(c, pass); +} + int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, enum bch_recovery_pass pass) { @@ -233,31 +268,48 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { - if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) - break; - - if (should_run_recovery_pass(c, c->curr_recovery_pass)) { - unsigned pass = c->curr_recovery_pass; + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + c->next_recovery_pass = c->curr_recovery_pass + 1; - ret = bch2_run_recovery_pass(c, 
c->curr_recovery_pass) ?: - bch2_journal_flush(&c->journal); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery) || - (ret && c->curr_recovery_pass < pass)) - continue; - if (ret) - break; + spin_lock_irq(&c->recovery_pass_lock); + unsigned pass = c->curr_recovery_pass; - c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass); + if (c->opts.recovery_pass_last && + c->curr_recovery_pass > c->opts.recovery_pass_last) { + spin_unlock_irq(&c->recovery_pass_lock); + break; } - c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass); - - if (!test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, c->curr_recovery_pass); - - c->curr_recovery_pass++; + if (!should_run_recovery_pass(c, pass)) { + c->curr_recovery_pass++; + c->recovery_pass_done = max(c->recovery_pass_done, pass); + spin_unlock_irq(&c->recovery_pass_lock); + continue; + } + spin_unlock_irq(&c->recovery_pass_lock); + + ret = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); + + if (!ret && !test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); + + spin_lock_irq(&c->recovery_pass_lock); + if (c->next_recovery_pass < c->curr_recovery_pass) { + /* + * bch2_run_explicit_recovery_pass() was called: we + * can't always catch -BCH_ERR_restart_recovery because + * it may have been called from another thread (btree + * node read completion) + */ + ret = 0; + c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); + } else { + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); + } + c->curr_recovery_pass = c->next_recovery_pass; + spin_unlock_irq(&c->recovery_pass_lock); } return ret; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 99b464e127b80..7d7339c8fa294 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -9,6 +9,7 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); +int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); int bch2_run_online_recovery_passes(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 94dc20ca20653..71baad41d8c5b 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -8,53 +8,59 @@ #define PASS_ALWAYS BIT(3) #define PASS_ONLINE BIT(4) +#ifdef CONFIG_BCACHEFS_DEBUG +#define PASS_FSCK_DEBUG BIT(1) +#else +#define PASS_FSCK_DEBUG 0 +#endif + /* * Passes may be reordered, but the second field is a persistent identifier and * must never change: */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ - 
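/*
 * A minimal sketch, not from the patch, of the rewind protocol above: a
 * request for an earlier pass records it in next_recovery_pass, and the
 * driver loop resumes from there instead of advancing. The spinlock and
 * the recovery_passes_complete bookkeeping are omitted.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned curr_pass, next_pass;

static void request_pass(unsigned pass)
{
	if (pass < curr_pass)
		next_pass = pass;	/* rewind: the loop jumps back */
}

int main(void)
{
	bool damage_seen = false;

	for (curr_pass = 0; curr_pass < 6;) {
		next_pass = curr_pass + 1;
		printf("running pass %u\n", curr_pass);

		if (curr_pass == 4 && !damage_seen) {
			damage_seen = true;
			request_pass(2);	/* e.g. a btree read found damage */
		}

		curr_pass = next_pass;
	}
	return 0;
}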
x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - x(set_fs_needs_rebalance, 34, 0) \ +#define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ + x(scan_for_btree_nodes, 37, 0) \ + x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, PASS_ALWAYS) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_allocations, 5, PASS_FSCK) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ + x(fix_reflink_p, 33, 0) \ + x(set_fs_needs_rebalance, 34, 0) /* We normally enumerate recovery passes in the order we run them: */ enum bch_recovery_pass { diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index f457925fa362e..93ba4f4e47cae 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -15,6 +15,17 @@ #include +static inline bool bkey_extent_is_reflink_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_reflink_v: + case 
KEY_TYPE_indirect_inline_data: + return true; + default: + return false; + } +} + static inline unsigned bkey_type_to_indirect(const struct bkey *k) { switch (k->type) { @@ -30,15 +41,15 @@ static inline unsigned bkey_type_to_indirect(const struct bkey *k) /* reflink pointers */ int bch2_reflink_p_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad), + bkey_fsck_err_on(REFLINK_P_IDX(p.v) < le32_to_cpu(p.v->front_pad), c, reflink_p_front_pad_bad, "idx < front_pad (%llu < %u)", - le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad)); fsck_err: return ret; } @@ -49,7 +60,7 @@ void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); prt_printf(out, "idx %llu front_pad %u back_pad %u", - le64_to_cpu(p.v->idx), + REFLINK_P_IDX(p.v), le32_to_cpu(p.v->front_pad), le32_to_cpu(p.v->back_pad)); } @@ -65,49 +76,250 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r */ return false; - if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + if (REFLINK_P_IDX(l.v) + l.k->size != REFLINK_P_IDX(r.v)) + return false; + + if (REFLINK_P_ERROR(l.v) != REFLINK_P_ERROR(r.v)) return false; bch2_key_resize(l.k, l.k->size + r.k->size); return true; } +/* indirect extents */ + +int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + int ret = 0; + + bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, REFLINK_P_IDX_MAX)), + c, reflink_v_pos_bad, + "indirect extent above maximum position 0:%llu", + REFLINK_P_IDX_MAX); + + ret = bch2_bkey_ptrs_validate(c, k, from); +fsck_err: + return ret; +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +#if 0 +Currently disabled, needs to be debugged: + +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} +#endif + +/* indirect inline data */ + +int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, + struct bkey_validate_context from) +{ + return 0; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + +/* lookup */ + +static int bch2_indirect_extent_not_missing(struct btree_trans *trans, struct bkey_s_c_reflink_p p, + bool should_commit) +{ + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + SET_REFLINK_P_ERROR(&new->v, false); + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + return ret; + + if (!should_commit) + return 0; + + return bch2_trans_commit(trans, NULL, 
NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +} + +static int bch2_indirect_extent_missing_error(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 missing_start, u64 missing_end, + bool should_commit) +{ + if (REFLINK_P_ERROR(p.v)) + return -BCH_ERR_missing_indirect_extent; + + struct bch_fs *c = trans->c; + u64 live_start = REFLINK_P_IDX(p.v); + u64 live_end = REFLINK_P_IDX(p.v) + p.k->size; + u64 refd_start = live_start - le32_to_cpu(p.v->front_pad); + u64 refd_end = live_end + le32_to_cpu(p.v->back_pad); + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(missing_start < refd_start); + BUG_ON(missing_end > refd_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, + "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + missing_start, missing_end)) { + struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + /* + * Is the missing range not actually needed? + * + * p.v->idx refers to the data that we actually want, but if the + * indirect extent we point to was bigger, front_pad and back_pad + * indicate the range we took a reference on. + */ + + if (missing_end <= live_start) { + new->v.front_pad = cpu_to_le32(live_start - missing_end); + } else if (missing_start >= live_end) { + new->v.back_pad = cpu_to_le32(missing_start - live_end); + } else { + struct bpos new_start = bkey_start_pos(&new->k); + struct bpos new_end = new->k.p; + + if (missing_start > live_start) + new_start.offset += missing_start - live_start; + if (missing_end < live_end) + new_end.offset -= live_end - missing_end; + + bch2_cut_front(new_start, &new->k_i); + bch2_cut_back(new_end, &new->k_i); + + SET_REFLINK_P_ERROR(&new->v, true); + } + + ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i, BTREE_TRIGGER_norun); + if (ret) + goto err; + + if (should_commit) + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * This is used from the read path, which doesn't expect to have to do a + * transaction commit, and from triggers, which should not be doing a commit: + */ +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *trans, + struct btree_iter *iter, + s64 *offset_into_extent, + struct bkey_s_c_reflink_p p, + bool should_commit, + unsigned iter_flags) +{ + BUG_ON(*offset_into_extent < -((s64) le32_to_cpu(p.v->front_pad))); + BUG_ON(*offset_into_extent >= p.k->size + le32_to_cpu(p.v->back_pad)); + + u64 reflink_offset = REFLINK_P_IDX(p.v) + *offset_into_extent; + + struct bkey_s_c k = bch2_bkey_get_iter(trans, iter, BTREE_ID_reflink, + POS(0, reflink_offset), iter_flags); + if (bkey_err(k)) + return k; + + if (unlikely(!bkey_extent_is_reflink_data(k.k))) { + bch2_trans_iter_exit(trans, iter); + + unsigned size = min((u64) k.k->size, + REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad) - + reflink_offset); + bch2_key_resize(&iter->k, size); + + int ret = bch2_indirect_extent_missing_error(trans, p, reflink_offset, + k.k->p.offset, should_commit); + if (ret) + return bkey_s_c_err(ret); + } else if (unlikely(REFLINK_P_ERROR(p.v))) { + bch2_trans_iter_exit(trans, iter); + + int ret = bch2_indirect_extent_not_missing(trans, p, should_commit); + if (ret) + return bkey_s_c_err(ret); + } + + *offset_into_extent = reflink_offset - 
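/*
 * A worked userspace example, not part of the patch, of the pad
 * arithmetic above: the live range is what the reflink pointer actually
 * reads, front_pad/back_pad widen it to the range a refcount was taken
 * on, and a missing span that falls entirely inside the padding can be
 * dropped by shrinking the pads instead of flagging an error.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t live_start = 100, live_end = 120;	/* REFLINK_P_IDX(v)..+size */
	uint64_t front_pad = 10, back_pad = 5;		/* refd range 90..125 */

	uint64_t missing_start = 90, missing_end = 95;	/* hole before live_start */

	if (missing_end <= live_start)
		front_pad = live_start - missing_end;	/* now 5 */
	else if (missing_start >= live_end)
		back_pad = missing_start - live_end;

	printf("referenced range now %" PRIu64 "-%" PRIu64 "\n",
	       live_start - front_pad, live_end + back_pad);	/* 95-125 */
	return 0;
}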
bkey_start_offset(k.k); + return k; +} + +/* reflink pointer trigger */ + static int trans_trigger_reflink_p_segment(struct btree_trans *trans, struct bkey_s_c_reflink_p p, u64 *idx, enum btree_iter_update_trigger_flags flags) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i *k; - __le64 *refcount; - int add = !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1; struct printbuf buf = PRINTBUF; - int ret; - k = bch2_bkey_get_mut_noupdate(trans, &iter, - BTREE_ID_reflink, POS(0, *idx), - BTREE_ITER_with_updates); - ret = PTR_ERR_OR_ZERO(k); + s64 offset_into_extent = *idx - REFLINK_P_IDX(p.v); + struct btree_iter iter; + struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, &offset_into_extent, p, false, + BTREE_ITER_intent| + BTREE_ITER_with_updates); + int ret = bkey_err(k); if (ret) - goto err; + return ret; - refcount = bkey_refcount(bkey_i_to_s(k)); - if (!refcount) { - bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "nonexistent indirect extent at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + if (bkey_deleted(k.k)) { + if (!(flags & BTREE_TRIGGER_overwrite)) + ret = -BCH_ERR_missing_indirect_extent; + goto next; } + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); - bch2_trans_inconsistent(trans, - "indirect extent refcount underflow at %llu while marking\n %s", - *idx, buf.buf); - ret = -EIO; - goto err; + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + log_fsck_err(trans, reflink_refcount_underflow, + "indirect extent refcount underflow while marking\n %s", + buf.buf); + goto next; } if (flags & BTREE_TRIGGER_insert) { @@ -115,25 +327,26 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, u64 pad; pad = max_t(s64, le32_to_cpu(v->front_pad), - le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + REFLINK_P_IDX(v) - bkey_start_offset(&new->k)); BUG_ON(pad > U32_MAX); v->front_pad = cpu_to_le32(pad); pad = max_t(s64, le32_to_cpu(v->back_pad), - k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + new->k.p.offset - p.k->size - REFLINK_P_IDX(v)); BUG_ON(pad > U32_MAX); v->back_pad = cpu_to_le32(pad); } - le64_add_cpu(refcount, add); + le64_add_cpu(refcount, !(flags & BTREE_TRIGGER_overwrite) ? 1 : -1); bch2_btree_iter_set_pos_to_extent_start(&iter); - ret = bch2_trans_update(trans, &iter, k, 0); + ret = bch2_trans_update(trans, &iter, new, 0); if (ret) goto err; - - *idx = k->k.p.offset; +next: + *idx = k.k->p.offset; err: +fsck_err: bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); return ret; @@ -147,9 +360,7 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, struct bch_fs *c = trans->c; struct reflink_gc *r; int add = !(flags & BTREE_TRIGGER_overwrite) ? 
1 : -1; - u64 start = le64_to_cpu(p.v->idx); - u64 end = le64_to_cpu(p.v->idx) + p.k->size; - u64 next_idx = end + le32_to_cpu(p.v->back_pad); + u64 next_idx = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); s64 ret = 0; struct printbuf buf = PRINTBUF; @@ -168,36 +379,14 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, *idx = r->offset; return 0; not_found: - BUG_ON(!(flags & BTREE_TRIGGER_check_repair)); - - if (fsck_err(trans, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - *idx, next_idx)) { - struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c); - ret = PTR_ERR_OR_ZERO(update); + if (flags & BTREE_TRIGGER_check_repair) { + ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); if (ret) goto err; - - if (next_idx <= start) { - bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx); - } else if (*idx >= end) { - bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end); - } else { - bkey_error_init(update); - update->k.p = p.k->p; - update->k.size = p.k->size; - set_bkey_val_u64s(&update->k, 0); - } - - ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_norun); } *idx = next_idx; err: -fsck_err: printbuf_exit(&buf); return ret; } @@ -210,8 +399,8 @@ static int __trigger_reflink_p(struct btree_trans *trans, struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); int ret = 0; - u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); - u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad); + u64 idx = REFLINK_P_IDX(p.v) - le32_to_cpu(p.v->front_pad); + u64 end = REFLINK_P_IDX(p.v) + p.k->size + le32_to_cpu(p.v->back_pad); if (flags & BTREE_TRIGGER_transactional) { while (idx < end && !ret) @@ -253,35 +442,7 @@ int bch2_trigger_reflink_p(struct btree_trans *trans, return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags); } -/* indirect extents */ - -int bch2_reflink_v_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) -{ - return bch2_bkey_ptrs_validate(c, k, flags); -} - -void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) -{ - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); - - prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); - - bch2_bkey_ptrs_to_text(out, c, k); -} - -#if 0 -Currently disabled, needs to be debugged: - -bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -{ - struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); - struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); - - return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -} -#endif +/* indirect extent trigger */ static inline void check_indirect_extent_deleting(struct bkey_s new, @@ -307,25 +468,6 @@ int bch2_trigger_reflink_v(struct btree_trans *trans, return bch2_trigger_extent(trans, btree_id, level, old, new, flags); } -/* indirect inline data */ - -int bch2_indirect_inline_data_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) -{ - return 0; -} - -void bch2_indirect_inline_data_to_text(struct printbuf *out, - struct bch_fs *c, struct bkey_s_c k) -{ - struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); - unsigned datalen = bkey_inline_data_bytes(k.k); - - prt_printf(out, "refcount %llu datalen %u: %*phN", - 
le64_to_cpu(d.v->refcount), datalen, - min(datalen, 32U), d.v->data); -} - int bch2_trigger_indirect_inline_data(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c old, struct bkey_s new, @@ -336,9 +478,12 @@ int bch2_trigger_indirect_inline_data(struct btree_trans *trans, return 0; } +/* create */ + static int bch2_make_extent_indirect(struct btree_trans *trans, struct btree_iter *extent_iter, - struct bkey_i *orig) + struct bkey_i *orig, + bool reflink_p_may_update_opts_field) { struct bch_fs *c = trans->c; struct btree_iter reflink_iter = { NULL }; @@ -358,6 +503,14 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, if (ret) goto err; + /* + * XXX: we're assuming that 56 bits will be enough for the life of the + * filesystem: we need to implement wraparound, with a cursor in the + * logged ops btree: + */ + if (bkey_ge(reflink_iter.pos, POS(0, REFLINK_P_IDX_MAX - orig->k.size))) + return -ENOSPC; + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); ret = PTR_ERR_OR_ZERO(r_v); if (ret) @@ -394,7 +547,10 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, memset(&r_p->v, 0, sizeof(r_p->v)); #endif - r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + SET_REFLINK_P_IDX(&r_p->v, bkey_start_offset(&r_v->k)); + + if (reflink_p_may_update_opts_field) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&r_p->v, true); ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, BTREE_UPDATE_internal_snapshot_node); @@ -409,7 +565,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) struct bkey_s_c k; int ret; - for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { + for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; @@ -426,7 +582,8 @@ s64 bch2_remap_range(struct bch_fs *c, subvol_inum dst_inum, u64 dst_offset, subvol_inum src_inum, u64 src_offset, u64 remap_sectors, - u64 new_i_size, s64 *i_sectors_delta) + u64 new_i_size, s64 *i_sectors_delta, + bool may_change_src_io_path_opts) { struct btree_trans *trans; struct btree_iter dst_iter, src_iter; @@ -439,6 +596,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct bpos src_want; u64 dst_done = 0; u32 dst_snapshot, src_snapshot; + bool reflink_p_may_update_opts_field = + bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) @@ -520,7 +679,8 @@ s64 bch2_remap_range(struct bch_fs *c, src_k = bkey_i_to_s_c(new_src.k); ret = bch2_make_extent_indirect(trans, &src_iter, - new_src.k); + new_src.k, + reflink_p_may_update_opts_field); if (ret) continue; @@ -533,11 +693,15 @@ s64 bch2_remap_range(struct bch_fs *c, struct bkey_i_reflink_p *dst_p = bkey_reflink_p_init(new_dst.k); - u64 offset = le64_to_cpu(src_p.v->idx) + + u64 offset = REFLINK_P_IDX(src_p.v) + (src_want.offset - bkey_start_offset(src_k.k)); - dst_p->v.idx = cpu_to_le64(offset); + SET_REFLINK_P_IDX(&dst_p->v, offset); + + if (reflink_p_may_update_opts_field && + may_change_src_io_path_opts) + SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); } @@ -547,7 +711,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(src_k.k->p.offset - src_want.offset, dst_end.offset - dst_iter.pos.offset)); - ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?: + ret = bch2_bkey_set_needs_rebalance(c, &opts, new_dst.k) ?: bch2_extent_update(trans, dst_inum, &dst_iter, new_dst.k, &disk_res, new_i_size, i_sectors_delta, @@ 
-591,3 +755,97 @@ s64 bch2_remap_range(struct bch_fs *c, return dst_done ?: ret ?: ret2; } + +/* fsck */ + +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) +{ + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); + struct printbuf buf = PRINTBUF; + struct reflink_gc *r; + int ret = 0; + + if (!refcount) + return 0; + + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; + + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), + trans, reflink_v_refcount_wrong, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_gc_reflink_done(struct bch_fs *c) +{ + size_t idx = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_gc_write_reflink_key(trans, &iter, k, &idx))); + c->reflink_gc_nr = 0; + return ret; +} + +int bch2_gc_reflink_start(struct bch_fs *c) +{ + c->reflink_gc_nr = 0; + + int ret = bch2_trans_run(c, + for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch, k, ({ + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, + c->reflink_gc_nr++, GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + 0; + }))); + + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 51afe11d8ed65..1632780bdf181 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -2,9 +2,8 @@ #ifndef _BCACHEFS_REFLINK_H #define _BCACHEFS_REFLINK_H -enum bch_validate_flags; - -int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_reflink_p_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, @@ -19,7 +18,8 @@ int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned, .min_val_size = 16, \ }) -int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_reflink_v_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, @@ -34,7 +34,7 @@ int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned, }) int bch2_indirect_inline_data_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct 
bkey_validate_context); void bch2_indirect_inline_data_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_trigger_indirect_inline_data(struct btree_trans *, @@ -73,7 +73,15 @@ static inline __le64 *bkey_refcount(struct bkey_s k) } } +struct bkey_s_c bch2_lookup_indirect_extent(struct btree_trans *, struct btree_iter *, + s64 *, struct bkey_s_c_reflink_p, + bool, unsigned); + s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64, s64 *); + subvol_inum, u64, u64, u64, s64 *, + bool); + +int bch2_gc_reflink_done(struct bch_fs *); +int bch2_gc_reflink_start(struct bch_fs *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h index 6772eebb1fc66..92995e4f898e2 100644 --- a/fs/bcachefs/reflink_format.h +++ b/fs/bcachefs/reflink_format.h @@ -4,7 +4,7 @@ struct bch_reflink_p { struct bch_val v; - __le64 idx; + __le64 idx_flags; /* * A reflink pointer might point to an indirect extent which is then * later split (by copygc or rebalance). If we only pointed to part of @@ -17,6 +17,11 @@ struct bch_reflink_p { __le32 back_pad; } __packed __aligned(8); +LE64_BITMASK(REFLINK_P_IDX, struct bch_reflink_p, idx_flags, 0, 56); +LE64_BITMASK(REFLINK_P_ERROR, struct bch_reflink_p, idx_flags, 56, 57); +LE64_BITMASK(REFLINK_P_MAY_UPDATE_OPTIONS, + struct bch_reflink_p, idx_flags, 57, 58); + struct bch_reflink_v { struct bch_val v; __le64 refcount; diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index 0052752818045..59c8770e4a0e9 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ -23,6 +23,10 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) { + struct bkey_validate_context from = { + .flags = write, + .from = BKEY_VALIDATE_superblock, + }; struct jset_entry *entry; int ret; @@ -40,7 +44,7 @@ int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *cle ret = bch2_journal_entry_validate(c, NULL, entry, le16_to_cpu(c->disk_sb.sb->version), BCH_SB_BIG_ENDIAN(c->disk_sb.sb), - write); + from); if (ret) return ret; } diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 62ea478215d08..fdcf598f08b12 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -2,86 +2,91 @@ #ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H #define _BCACHEFS_SB_COUNTERS_FORMAT_H -#define BCH_PERSISTENT_COUNTERS() \ - x(io_read, 0) \ - x(io_write, 1) \ - x(io_move, 2) \ - x(bucket_invalidate, 3) \ - x(bucket_discard, 4) \ - x(bucket_alloc, 5) \ - x(bucket_alloc_fail, 6) \ - x(btree_cache_scan, 7) \ - x(btree_cache_reap, 8) \ - x(btree_cache_cannibalize, 9) \ - x(btree_cache_cannibalize_lock, 10) \ - x(btree_cache_cannibalize_lock_fail, 11) \ - x(btree_cache_cannibalize_unlock, 12) \ - x(btree_node_write, 13) \ - x(btree_node_read, 14) \ - x(btree_node_compact, 15) \ - x(btree_node_merge, 16) \ - x(btree_node_split, 17) \ - x(btree_node_rewrite, 18) \ - x(btree_node_alloc, 19) \ - x(btree_node_free, 20) \ - x(btree_node_set_root, 21) \ - x(btree_path_relock_fail, 22) \ - x(btree_path_upgrade_fail, 23) \ - x(btree_reserve_get_fail, 24) \ - x(journal_entry_full, 25) \ - x(journal_full, 26) \ - x(journal_reclaim_finish, 27) \ - x(journal_reclaim_start, 28) \ - x(journal_write, 29) \ - x(read_promote, 30) \ - x(read_bounce, 31) \ - x(read_split, 33) \ - x(read_retry, 32) \ - x(read_reuse_race, 34) \ - x(move_extent_read, 35) \ - x(move_extent_write, 36) \ - x(move_extent_finish, 37) \ - 
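/*
 * A minimal host-endian sketch, not from the patch, of the idx_flags
 * packing declared above: bits 0..55 carry the index, bit 56 the error
 * flag, bit 57 the may-update-options flag. The real LE64_BITMASK()
 * accessors additionally handle the little-endian conversion, which
 * this toy skips.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define P_IDX_BITS	56
#define P_IDX_MASK	((UINT64_C(1) << P_IDX_BITS) - 1)
#define P_ERROR		(UINT64_C(1) << 56)
#define P_MAY_UPDATE	(UINT64_C(1) << 57)

int main(void)
{
	uint64_t idx_flags = 0;

	idx_flags |= UINT64_C(123456789) & P_IDX_MASK;	/* SET_REFLINK_P_IDX() */
	idx_flags |= P_MAY_UPDATE;	/* SET_REFLINK_P_MAY_UPDATE_OPTIONS() */

	printf("idx=%" PRIu64 " error=%d may_update=%d\n",
	       idx_flags & P_IDX_MASK,
	       !!(idx_flags & P_ERROR),
	       !!(idx_flags & P_MAY_UPDATE));
	return 0;
}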
x(move_extent_fail, 38) \ - x(move_extent_start_fail, 39) \ - x(copygc, 40) \ - x(copygc_wait, 41) \ - x(gc_gens_end, 42) \ - x(gc_gens_start, 43) \ - x(trans_blocked_journal_reclaim, 44) \ - x(trans_restart_btree_node_reused, 45) \ - x(trans_restart_btree_node_split, 46) \ - x(trans_restart_fault_inject, 47) \ - x(trans_restart_iter_upgrade, 48) \ - x(trans_restart_journal_preres_get, 49) \ - x(trans_restart_journal_reclaim, 50) \ - x(trans_restart_journal_res_get, 51) \ - x(trans_restart_key_cache_key_realloced, 52) \ - x(trans_restart_key_cache_raced, 53) \ - x(trans_restart_mark_replicas, 54) \ - x(trans_restart_mem_realloced, 55) \ - x(trans_restart_memory_allocation_failure, 56) \ - x(trans_restart_relock, 57) \ - x(trans_restart_relock_after_fill, 58) \ - x(trans_restart_relock_key_cache_fill, 59) \ - x(trans_restart_relock_next_node, 60) \ - x(trans_restart_relock_parent_for_fill, 61) \ - x(trans_restart_relock_path, 62) \ - x(trans_restart_relock_path_intent, 63) \ - x(trans_restart_too_many_iters, 64) \ - x(trans_restart_traverse, 65) \ - x(trans_restart_upgrade, 66) \ - x(trans_restart_would_deadlock, 67) \ - x(trans_restart_would_deadlock_write, 68) \ - x(trans_restart_injected, 69) \ - x(trans_restart_key_cache_upgrade, 70) \ - x(trans_traverse_all, 71) \ - x(transaction_commit, 72) \ - x(write_super, 73) \ - x(trans_restart_would_deadlock_recursion_limit, 74) \ - x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) \ - x(write_buffer_flush_slowpath, 77) \ - x(write_buffer_flush_sync, 78) +enum counters_flags { + TYPE_COUNTER = BIT(0), /* event counters */ + TYPE_SECTORS = BIT(1), /* amount counters, the unit is sectors */ +}; + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0, TYPE_SECTORS) \ + x(io_write, 1, TYPE_SECTORS) \ + x(io_move, 2, TYPE_SECTORS) \ + x(bucket_invalidate, 3, TYPE_COUNTER) \ + x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_alloc, 5, TYPE_COUNTER) \ + x(bucket_alloc_fail, 6, TYPE_COUNTER) \ + x(btree_cache_scan, 7, TYPE_COUNTER) \ + x(btree_cache_reap, 8, TYPE_COUNTER) \ + x(btree_cache_cannibalize, 9, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock, 10, TYPE_COUNTER) \ + x(btree_cache_cannibalize_lock_fail, 11, TYPE_COUNTER) \ + x(btree_cache_cannibalize_unlock, 12, TYPE_COUNTER) \ + x(btree_node_write, 13, TYPE_COUNTER) \ + x(btree_node_read, 14, TYPE_COUNTER) \ + x(btree_node_compact, 15, TYPE_COUNTER) \ + x(btree_node_merge, 16, TYPE_COUNTER) \ + x(btree_node_split, 17, TYPE_COUNTER) \ + x(btree_node_rewrite, 18, TYPE_COUNTER) \ + x(btree_node_alloc, 19, TYPE_COUNTER) \ + x(btree_node_free, 20, TYPE_COUNTER) \ + x(btree_node_set_root, 21, TYPE_COUNTER) \ + x(btree_path_relock_fail, 22, TYPE_COUNTER) \ + x(btree_path_upgrade_fail, 23, TYPE_COUNTER) \ + x(btree_reserve_get_fail, 24, TYPE_COUNTER) \ + x(journal_entry_full, 25, TYPE_COUNTER) \ + x(journal_full, 26, TYPE_COUNTER) \ + x(journal_reclaim_finish, 27, TYPE_COUNTER) \ + x(journal_reclaim_start, 28, TYPE_COUNTER) \ + x(journal_write, 29, TYPE_COUNTER) \ + x(read_promote, 30, TYPE_COUNTER) \ + x(read_bounce, 31, TYPE_COUNTER) \ + x(read_split, 33, TYPE_COUNTER) \ + x(read_retry, 32, TYPE_COUNTER) \ + x(read_reuse_race, 34, TYPE_COUNTER) \ + x(move_extent_read, 35, TYPE_SECTORS) \ + x(move_extent_write, 36, TYPE_SECTORS) \ + x(move_extent_finish, 37, TYPE_SECTORS) \ + x(move_extent_fail, 38, TYPE_COUNTER) \ + x(move_extent_start_fail, 39, TYPE_COUNTER) \ + x(copygc, 40, TYPE_COUNTER) \ + x(copygc_wait, 41, TYPE_COUNTER) \ + x(gc_gens_end, 42, TYPE_COUNTER) \ + 
x(gc_gens_start, 43, TYPE_COUNTER) \ + x(trans_blocked_journal_reclaim, 44, TYPE_COUNTER) \ + x(trans_restart_btree_node_reused, 45, TYPE_COUNTER) \ + x(trans_restart_btree_node_split, 46, TYPE_COUNTER) \ + x(trans_restart_fault_inject, 47, TYPE_COUNTER) \ + x(trans_restart_iter_upgrade, 48, TYPE_COUNTER) \ + x(trans_restart_journal_preres_get, 49, TYPE_COUNTER) \ + x(trans_restart_journal_reclaim, 50, TYPE_COUNTER) \ + x(trans_restart_journal_res_get, 51, TYPE_COUNTER) \ + x(trans_restart_key_cache_key_realloced, 52, TYPE_COUNTER) \ + x(trans_restart_key_cache_raced, 53, TYPE_COUNTER) \ + x(trans_restart_mark_replicas, 54, TYPE_COUNTER) \ + x(trans_restart_mem_realloced, 55, TYPE_COUNTER) \ + x(trans_restart_memory_allocation_failure, 56, TYPE_COUNTER) \ + x(trans_restart_relock, 57, TYPE_COUNTER) \ + x(trans_restart_relock_after_fill, 58, TYPE_COUNTER) \ + x(trans_restart_relock_key_cache_fill, 59, TYPE_COUNTER) \ + x(trans_restart_relock_next_node, 60, TYPE_COUNTER) \ + x(trans_restart_relock_parent_for_fill, 61, TYPE_COUNTER) \ + x(trans_restart_relock_path, 62, TYPE_COUNTER) \ + x(trans_restart_relock_path_intent, 63, TYPE_COUNTER) \ + x(trans_restart_too_many_iters, 64, TYPE_COUNTER) \ + x(trans_restart_traverse, 65, TYPE_COUNTER) \ + x(trans_restart_upgrade, 66, TYPE_COUNTER) \ + x(trans_restart_would_deadlock, 67, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_write, 68, TYPE_COUNTER) \ + x(trans_restart_injected, 69, TYPE_COUNTER) \ + x(trans_restart_key_cache_upgrade, 70, TYPE_COUNTER) \ + x(trans_traverse_all, 71, TYPE_COUNTER) \ + x(transaction_commit, 72, TYPE_COUNTER) \ + x(write_super, 73, TYPE_COUNTER) \ + x(trans_restart_would_deadlock_recursion_limit, 74, TYPE_COUNTER) \ + x(trans_restart_write_buffer_flush, 75, TYPE_COUNTER) \ + x(trans_restart_split_race, 76, TYPE_COUNTER) \ + x(write_buffer_flush_slowpath, 77, TYPE_COUNTER) \ + x(write_buffer_flush_sync, 78, TYPE_COUNTER) enum bch_persistent_counters { #define x(t, n, ...) 
BCH_COUNTER_##t, diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 8767c33c2b513..051214fdc7352 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -81,7 +81,16 @@ BCH_FSCK_ERR_accounting_mismatch) \ x(inode_has_child_snapshots, \ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ - BCH_FSCK_ERR_inode_has_child_snapshots_wrong) + BCH_FSCK_ERR_inode_has_child_snapshots_wrong) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -117,7 +126,19 @@ BCH_FSCK_ERR_bkey_version_in_future) \ x(rebalance_work_acct_fix, \ BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ - BCH_FSCK_ERR_accounting_mismatch) + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) \ + x(backpointer_bucket_gen, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_backpointer_bucket_offset_wrong, \ + BCH_FSCK_ERR_backpointer_to_missing_ptr, \ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(disk_accounting_big_endian, \ + BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \ + BCH_FSCK_ERR_accounting_mismatch, \ + BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ + BCH_FSCK_ERR_accounting_key_junk_at_end) struct upgrade_downgrade_entry { u64 recovery_passes; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 9feb6739f77a1..e26317c367f74 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -5,9 +5,8 @@ enum bch_fsck_flags { FSCK_CAN_FIX = 1 << 0, FSCK_CAN_IGNORE = 1 << 1, - FSCK_NEED_FSCK = 1 << 2, - FSCK_NO_RATELIMIT = 1 << 3, - FSCK_AUTOFIX = 1 << 4, + FSCK_NO_RATELIMIT = 1 << 2, + FSCK_AUTOFIX = 1 << 3, }; #define BCH_SB_ERRS() \ @@ -59,7 +58,7 @@ enum bch_fsck_flags { x(bset_empty, 45, 0) \ x(bset_bad_seq, 46, 0) \ x(bset_blacklisted_journal_seq, 47, 0) \ - x(first_bset_blacklisted_journal_seq, 48, 0) \ + x(first_bset_blacklisted_journal_seq, 48, FSCK_AUTOFIX) \ x(btree_node_bad_btree, 49, 0) \ x(btree_node_bad_level, 50, 0) \ x(btree_node_bad_min_key, 51, 0) \ @@ -68,17 +67,17 @@ enum bch_fsck_flags { x(btree_node_bkey_past_bset_end, 54, 0) \ x(btree_node_bkey_bad_format, 55, 0) \ x(btree_node_bad_bkey, 56, 0) \ - x(btree_node_bkey_out_of_order, 57, 0) \ - x(btree_root_bkey_invalid, 58, 0) \ - x(btree_root_read_error, 59, 0) \ + x(btree_node_bkey_out_of_order, 57, FSCK_AUTOFIX) \ + x(btree_root_bkey_invalid, 58, FSCK_AUTOFIX) \ + x(btree_root_read_error, 59, FSCK_AUTOFIX) \ x(btree_root_bad_min_key, 60, 0) \ x(btree_root_bad_max_key, 61, 0) \ - x(btree_node_read_error, 62, 0) \ - x(btree_node_topology_bad_min_key, 63, 0) \ - x(btree_node_topology_bad_max_key, 64, 0) \ - x(btree_node_topology_overwritten_by_prev_node, 65, 0) \ - x(btree_node_topology_overwritten_by_next_node, 66, 0) \ - x(btree_node_topology_interior_node_empty, 67, 0) \ + x(btree_node_read_error, 62, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_min_key, 63, FSCK_AUTOFIX) \ + x(btree_node_topology_bad_max_key, 64, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_prev_node, 65, FSCK_AUTOFIX) \ + x(btree_node_topology_overwritten_by_next_node, 66, 
FSCK_AUTOFIX) \ + x(btree_node_topology_interior_node_empty, 67, FSCK_AUTOFIX) \ x(fs_usage_hidden_wrong, 68, FSCK_AUTOFIX) \ x(fs_usage_btree_wrong, 69, FSCK_AUTOFIX) \ x(fs_usage_data_wrong, 70, FSCK_AUTOFIX) \ @@ -123,11 +122,12 @@ enum bch_fsck_flags { x(alloc_key_cached_sectors_wrong, 109, FSCK_AUTOFIX) \ x(alloc_key_stripe_wrong, 110, FSCK_AUTOFIX) \ x(alloc_key_stripe_redundancy_wrong, 111, FSCK_AUTOFIX) \ + x(alloc_key_journal_seq_in_future, 298, FSCK_AUTOFIX) \ x(bucket_sector_count_overflow, 112, 0) \ x(bucket_metadata_type_mismatch, 113, 0) \ - x(need_discard_key_wrong, 114, 0) \ - x(freespace_key_wrong, 115, 0) \ - x(freespace_hole_missing, 116, 0) \ + x(need_discard_key_wrong, 114, FSCK_AUTOFIX) \ + x(freespace_key_wrong, 115, FSCK_AUTOFIX) \ + x(freespace_hole_missing, 116, FSCK_AUTOFIX) \ x(bucket_gens_val_size_bad, 117, 0) \ x(bucket_gens_key_wrong, 118, FSCK_AUTOFIX) \ x(bucket_gens_hole_wrong, 119, FSCK_AUTOFIX) \ @@ -139,9 +139,10 @@ enum bch_fsck_flags { x(discarding_bucket_not_in_need_discard_btree, 291, 0) \ x(backpointer_bucket_offset_wrong, 125, 0) \ x(backpointer_level_bad, 294, 0) \ - x(backpointer_to_missing_device, 126, 0) \ - x(backpointer_to_missing_alloc, 127, 0) \ - x(backpointer_to_missing_ptr, 128, 0) \ + x(backpointer_dev_bad, 297, 0) \ + x(backpointer_to_missing_device, 126, FSCK_AUTOFIX) \ + x(backpointer_to_missing_alloc, 127, FSCK_AUTOFIX) \ + x(backpointer_to_missing_ptr, 128, FSCK_AUTOFIX) \ x(lru_entry_at_time_0, 129, FSCK_AUTOFIX) \ x(lru_entry_to_invalid_bucket, 130, FSCK_AUTOFIX) \ x(lru_entry_bad, 131, FSCK_AUTOFIX) \ @@ -167,14 +168,15 @@ enum bch_fsck_flags { x(ptr_to_incorrect_stripe, 151, 0) \ x(ptr_gen_newer_than_bucket_gen, 152, 0) \ x(ptr_too_stale, 153, 0) \ - x(stale_dirty_ptr, 154, 0) \ + x(stale_dirty_ptr, 154, FSCK_AUTOFIX) \ x(ptr_bucket_data_type_mismatch, 155, 0) \ x(ptr_cached_and_erasure_coded, 156, 0) \ x(ptr_crc_uncompressed_size_too_small, 157, 0) \ + x(ptr_crc_uncompressed_size_too_big, 161, 0) \ + x(ptr_crc_uncompressed_size_mismatch, 300, 0) \ x(ptr_crc_csum_type_unknown, 158, 0) \ x(ptr_crc_compression_type_unknown, 159, 0) \ x(ptr_crc_redundant, 160, 0) \ - x(ptr_crc_uncompressed_size_too_big, 161, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ @@ -209,6 +211,7 @@ enum bch_fsck_flags { x(bkey_in_missing_snapshot, 190, 0) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ + x(inode_alloc_cursor_inode_bad, 301, 0) \ x(inode_unpack_error, 193, 0) \ x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ @@ -232,6 +235,7 @@ enum bch_fsck_flags { x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ + x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ @@ -288,7 +292,7 @@ enum bch_fsck_flags { x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ x(snapshot_node_missing, 264, 0) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ - x(btree_bitmap_not_marked, 266, 0) \ + x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ x(sb_clean_entry_overrun, 267, 0) \ x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ @@ -306,7 +310,9 @@ enum bch_fsck_flags { x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ x(logged_op_but_clean, 
283, FSCK_AUTOFIX) \ - x(MAX, 295, 0) + x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ + x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ + x(MAX, 302, 0) enum bch_sb_error_id { #define x(t, n, ...) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 2adf1221a440f..0bae879539471 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -9,7 +9,7 @@ #define BCH_SB_MEMBERS_MAX 64 /* - * Sentinal value - indicates a device that does not exist + * Sentinel value - indicates a device that does not exist */ #define BCH_SB_MEMBER_INVALID 255 diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h index 3dfaf34a43b28..b1374d9e1c1a7 100644 --- a/fs/bcachefs/siphash.h +++ b/fs/bcachefs/siphash.h @@ -36,7 +36,7 @@ * optimized for speed on short messages returning a 64bit hash/digest value. * * The number of rounds is defined during the initialization: - * SipHash24_Init() for the fast and resonable strong version + * SipHash24_Init() for the fast and reasonably strong version * SipHash48_Init() for the strong version (half as fast) * * struct SIPHASH_CTX ctx; diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 617d07e53b20c..25fa4f0094c69 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -111,7 +111,7 @@ static inline unsigned pcpu_read_count(struct six_lock *lock) * Returns 1 on success, 0 on failure * * In percpu reader mode, a failed trylock may cause a spurious trylock failure - * for anoter thread taking the competing lock type, and we may havve to do a + * for another thread taking the competing lock type, and we may have to do a * wakeup: when a wakeup is required, we return -1 - wakeup_type. */ static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, @@ -234,7 +234,7 @@ static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_typ /* * Similar to percpu_rwsem_wake_function(), we need to guard - * against the wake noticing w->lock_acquired, returning, and + * against the wakee noticing w->lock_acquired, returning, and * then exiting before we do the wakeup: */ task = get_task_struct(w->task); @@ -597,7 +597,7 @@ static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ * - * When a lock is held multiple times (because six_lock_incement()) was used), + * When a lock is held multiple times (because six_lock_increment() was used), * this decrements the 'lock held' counter by one. * * For example: @@ -631,7 +631,7 @@ EXPORT_SYMBOL_GPL(six_unlock_ip); /** * six_lock_downgrade - convert an intent lock to a read lock - * @lock: lock to dowgrade + * @lock: lock to downgrade * * @lock will have read count incremented and intent count decremented */ @@ -750,7 +750,7 @@ EXPORT_SYMBOL_GPL(six_lock_increment); * six_lock_wakeup_all - wake up all waiters on @lock * @lock: lock to wake up waiters for * - * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then + * Waking up waiters will cause them to re-run should_sleep_fn, which may then * abort the lock operation.
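(Illustrative aside, not part of the patch: what the should_sleep_fn mechanism mentioned above looks like in use. six_lock_wakeup_all() and the callback shape are the six.h API; the abort flag, function names and error code are invented for the sketch.)

static int example_should_sleep_fn(struct six_lock *lock, void *p)
{
	bool *abort = p;		/* hypothetical per-waiter context */

	/* a nonzero return aborts the lock operation instead of sleeping */
	return *abort ? -EINTR : 0;
}

static void example_abort_waiters(struct six_lock *lock, bool *abort)
{
	*abort = true;
	six_lock_wakeup_all(lock);	/* blocked waiters re-run the callback */
}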
* * This function is never needed in a bug-free program; it's only useful in @@ -798,7 +798,7 @@ EXPORT_SYMBOL_GPL(six_lock_counts); * @lock: lock to add/subtract readers for * @nr: reader count to add/subtract * - * When an upper layer is implementing lock reentrency, we may have both read + * When an upper layer is implementing lock reentrancy, we may have both read * and intent locks on the same lock. * * When we need to take a write lock, the read locks will cause self-deadlock, @@ -829,7 +829,7 @@ EXPORT_SYMBOL_GPL(six_lock_readers_add); * six_lock_exit - release resources held by a lock prior to freeing * @lock: lock to exit * - * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is + * When a lock was initialized in percpu mode (SIX_LOCK_INIT_PCPU), this is * required to free the percpu read counts. */ void six_lock_exit(struct six_lock *lock) diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index 68d46fd7f3912..d110f88e04802 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -79,7 +79,7 @@ * six_unlock_read(&foo->lock); * foo->lock is now fully unlocked. * - * Since the intent state supercedes read, it's legal to increment the read + * Since the intent state supersedes read, it's legal to increment the read * counter when holding an intent lock, but not the reverse. * * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) @@ -296,7 +296,7 @@ void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long * @lock: lock to unlock * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write * - * When a lock is held multiple times (because six_lock_incement()) was used), + * When a lock is held multiple times (because six_lock_increment() was used),
* * For example: diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index ae57638506c3a..0dafadc1bcf2f 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_buf.h" +#include "btree_cache.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" @@ -32,7 +33,7 @@ void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_tree_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { int ret = 0; @@ -225,7 +226,7 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, } int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_snapshot s; u32 i, id; @@ -279,23 +280,6 @@ int bch2_snapshot_validate(struct bch_fs *c, struct bkey_s_c k, return ret; } -static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - struct snapshot_t *t = snapshot_t_mut(c, id); - u32 parent = id; - - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); -} - -static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -{ - mutex_lock(&c->snapshot_table_lock); - __set_is_ancestor_bitmap(c, id); - mutex_unlock(&c->snapshot_table_lock); -} - static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -317,6 +301,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + t->live = true; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -335,7 +320,11 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } - __set_is_ancestor_bitmap(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); @@ -365,70 +354,6 @@ int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot, s); } -static int bch2_snapshot_live(struct btree_trans *trans, u32 id) -{ - struct bch_snapshot v; - int ret; - - if (!id) - return 0; - - ret = bch2_snapshot_lookup(trans, id, &v); - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot node %u not found", id); - if (ret) - return ret; - - return !BCH_SNAPSHOT_DELETED(&v); -} - -/* - * If @k is a snapshot with just one live child, it's part of a linear chain, - * which we consider to be an equivalence class: and then after snapshot - * deletion cleanup, there should only be a single key at a given position in - * this equivalence class. - * - * This sets the equivalence class of @k to be the child's equivalence class, if - * it's part of such a linear chain: this correctly sets equivalence classes on - * startup if we run leaf to root (i.e. in natural key order). 
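(Illustrative aside, not part of the patch: the unlock comments above break off at "For example:"; this sketch shows the kind of sequence they describe, assuming an uncontended lock and the six.h wrappers.)

static void example_lock_held_twice(struct six_lock *l)
{
	six_lock_read(l, NULL, NULL);		/* read count 1 */
	six_lock_increment(l, SIX_LOCK_read);	/* read count 2, never blocks */
	six_unlock_read(l);			/* read count 1 */
	six_unlock_read(l);			/* fully unlocked */
}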
- */ -static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -{ - struct bch_fs *c = trans->c; - unsigned i, nr_live = 0, live_idx = 0; - struct bkey_s_c_snapshot snap; - u32 id = k.k->p.offset, child[2]; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - - child[0] = le32_to_cpu(snap.v->children[0]); - child[1] = le32_to_cpu(snap.v->children[1]); - - for (i = 0; i < 2; i++) { - int ret = bch2_snapshot_live(trans, child[i]); - - if (ret < 0) - return ret; - - if (ret) - live_idx = i; - nr_live += ret; - } - - mutex_lock(&c->snapshot_table_lock); - - snapshot_t_mut(c, id)->equiv = nr_live == 1 - ? snapshot_t_mut(c, child[live_idx])->equiv - : id; - - mutex_unlock(&c->snapshot_table_lock); - - return 0; -} - /* fsck: */ static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) @@ -506,7 +431,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, break; } } - bch2_trans_iter_exit(trans, &iter); if (!ret && !found) { @@ -536,6 +460,7 @@ static int check_snapshot_tree(struct btree_trans *trans, struct bch_snapshot s; struct bch_subvolume subvol; struct printbuf buf = PRINTBUF; + struct btree_iter snapshot_iter = {}; u32 root_id; int ret; @@ -545,22 +470,32 @@ static int check_snapshot_tree(struct btree_trans *trans, st = bkey_s_c_to_snapshot_tree(k); root_id = le32_to_cpu(st.v->root_snapshot); - ret = bch2_snapshot_lookup(trans, root_id, &s); + struct bkey_s_c_snapshot snapshot_k = + bch2_bkey_get_iter_typed(trans, &snapshot_iter, BTREE_ID_snapshots, + POS(0, root_id), 0, snapshot); + ret = bkey_err(snapshot_k); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; + if (!ret) + bkey_val_copy(&s, snapshot_k); + if (fsck_err_on(ret || root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), trans, snapshot_tree_to_missing_snapshot, "snapshot tree points to missing/incorrect snapshot:\n %s", - (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + (bch2_bkey_val_to_text(&buf, c, st.s_c), + prt_newline(&buf), + ret + ? prt_printf(&buf, "(%s)", bch2_err_str(ret)) + : bch2_bkey_val_to_text(&buf, c, snapshot_k.s_c), + buf.buf))) { ret = bch2_btree_delete_at(trans, iter, 0); goto err; } - ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), - false, 0, &subvol); + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), false, &subvol); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -605,6 +540,7 @@ static int check_snapshot_tree(struct btree_trans *trans, } err: fsck_err: + bch2_trans_iter_exit(trans, &snapshot_iter); printbuf_exit(&buf); return ret; } @@ -799,7 +735,7 @@ static int check_snapshot(struct btree_trans *trans, if (should_have_subvol) { id = le32_to_cpu(s.subvol); - ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + ret = bch2_subvolume_get(trans, id, false, &subvol); if (bch2_err_matches(ret, ENOENT)) bch_err(c, "snapshot points to nonexistent subvolume:\n %s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); @@ -902,7 +838,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_equiv(c, id)) + if (bch2_snapshot_exists(c, id)) return 0; /* Do we need to reconstruct the snapshot_tree entry as well? 
*/ @@ -951,8 +887,7 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?: - bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i)); + bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -1050,7 +985,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_equiv(c, *id), + if (fsck_err_on(!bch2_snapshot_exists(c, *id), trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1083,10 +1018,12 @@ int bch2_check_key_has_snapshot(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), + if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: @@ -1294,10 +1231,6 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; new_snapids[i] = iter.pos.offset; - - mutex_lock(&c->snapshot_table_lock); - snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i]; - mutex_unlock(&c->snapshot_table_lock); } err: bch2_trans_iter_exit(trans, &iter); @@ -1403,129 +1336,153 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -static int delete_dead_snapshots_process_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { - int ret = bch2_check_key_has_snapshot(trans, iter, k); - if (ret) - return ret < 0 ? ret : 0; + darray_for_each(*l, i) + if (i->id == id) + return i->live_child; + return 0; +} - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - if (!equiv) /* key for invalid snapshot node, but we chose not to delete */ +static unsigned __live_child(struct snapshot_table *t, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + struct snapshot_t *s = __snapshot_t(t, id); + if (!s) return 0; - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) + if (s->children[i] && + !snapshot_list_has_id(delete_leaves, s->children[i]) && + !interior_delete_has_id(delete_interior, s->children[i])) + return s->children[i]; - if (snapshot_list_has_id(deleted, k.k->p.snapshot)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); + for (unsigned i = 0; i < ARRAY_SIZE(s->children); i++) { + u32 live_child = s->children[i] + ? 
__live_child(t, s->children[i], delete_leaves, delete_interior) + : 0; + if (live_child) + return live_child; + } - if (!bpos_eq(*last_pos, k.k->p) && - snapshot_list_has_id(equiv_seen, equiv)) - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); + return 0; +} - *last_pos = k.k->p; +static unsigned live_child(struct bch_fs *c, u32 id, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + rcu_read_lock(); + u32 ret = __live_child(rcu_dereference(c->snapshots), id, + delete_leaves, delete_interior); + rcu_read_unlock(); + return ret; +} - ret = snapshot_list_add_nodup(c, equiv_seen, equiv); - if (ret) - return ret; +static int delete_dead_snapshots_process_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) +{ + if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); - /* - * When we have a linear chain of snapshot nodes, we consider - * those to form an equivalence class: we're going to collapse - * them all down to a single node, and keep the leaf-most node - - * which has the same id as the equivalence class id. - * - * If there are multiple keys in different snapshots at the same - * position, we're only going to keep the one in the newest - * snapshot (we delete the others above) - the rest have been - * overwritten and are redundant, and for the key we're going to keep we - * need to move it to the equivalance class ID if it's not there - * already. - */ - if (equiv != k.k->p.snapshot) { + u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; - new->k.p.snapshot = equiv; + new->k.p.snapshot = live_child; - struct btree_iter new_iter; - bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, - BTREE_ITER_all_snapshots| - BTREE_ITER_cached| - BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&new_iter) ?: - bch2_trans_update(trans, &new_iter, new, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &new_iter); + struct btree_iter dst_iter; + struct bkey_s_c dst_k = bch2_bkey_get_iter(trans, &dst_iter, + iter->btree_id, new->k.p, + BTREE_ITER_all_snapshots| + BTREE_ITER_intent); + ret = bkey_err(dst_k); if (ret) return ret; + + ret = (bkey_deleted(dst_k.k) + ? bch2_trans_update(trans, &dst_iter, new, + BTREE_UPDATE_internal_snapshot_node) + : 0) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; } return 0; } -static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k) +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. 
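+ *
+ * Worked example (illustrative, not from the original patch): snapshot 1
+ * has children 2 and 3, snapshot 3 has a single child 4, and only 4 is
+ * pointed to by a subvolume.  Walking the snapshots btree in key order,
+ * 1 is kept (both children still look live when it is visited), 2 lands
+ * on delete_leaves (no subvolume, no children), and 3 lands on
+ * delete_interior as { .id = 3, .live_child = 4 }, so keys in snapshot 3
+ * are moved to snapshot 4 before node 3 itself is deleted.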
+ */ +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, + snapshot_id_list *delete_leaves, + interior_delete_list *delete_interior) { - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; - if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + unsigned live_children = 0; + + if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); + for (unsigned i = 0; i < 2; i++) { + u32 child = le32_to_cpu(s.v->children[i]); - ret = bch2_snapshot_live(trans, children[0]) ?: - bch2_snapshot_live(trans, children[1]); - if (ret < 0) - return ret; - return !ret; -} + live_children += child && + !snapshot_list_has_id(delete_leaves, child); + } -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. - */ -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k) -{ - int ret = bch2_snapshot_needs_delete(trans, k); + if (live_children == 0) { + return snapshot_list_add(c, delete_leaves, s.k->p.offset); + } else if (live_children == 1) { + struct snapshot_interior_delete d = { + .id = s.k->p.offset, + .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + }; + + if (!d.live_child) { + bch_err(c, "error finding live child of snapshot %u", d.id); + return -EINVAL; + } - return ret <= 0 - ? ret - : bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return darray_push(delete_interior, d); + } else { + return 0; + } } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, - snapshot_id_list *skip) + interior_delete_list *skip) { rcu_read_lock(); - while (snapshot_list_has_id(skip, id)) + while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); while (n--) { do { id = __bch2_snapshot_parent(c, id); - } while (snapshot_list_has_id(skip, id)); + } while (interior_delete_has_id(skip, id)); } rcu_read_unlock(); @@ -1534,7 +1491,7 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, - snapshot_id_list *deleted) + interior_delete_list *deleted) { struct bch_fs *c = trans->c; u32 nr_deleted_ancestors = 0; @@ -1544,7 +1501,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, if (k.k->type != KEY_TYPE_snapshot) return 0; - if (snapshot_list_has_id(deleted, k.k->p.offset)) + if (interior_delete_has_id(deleted, k.k->p.offset)) return 0; s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); @@ -1553,7 +1510,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return ret; darray_for_each(*deleted, i) - nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, i->id); if (!nr_deleted_ancestors) return 0; @@ -1571,7 +1528,7 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { u32 id = le32_to_cpu(s->v.skip[j]); - if (snapshot_list_has_id(deleted, id)) { + if (interior_delete_has_id(deleted, id)) { id = bch2_snapshot_nth_parent_skip(c, parent, 
depth > 1 @@ -1590,51 +1547,44 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, int bch2_delete_dead_snapshots(struct bch_fs *c) { - struct btree_trans *trans; - snapshot_id_list deleted = { 0 }; - snapshot_id_list deleted_interior = { 0 }; - int ret = 0; - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - trans = bch2_trans_get(c); + struct btree_trans *trans = bch2_trans_get(c); + snapshot_id_list delete_leaves = {}; + interior_delete_list delete_interior = {}; + int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a subvolume, delete it: */ - ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(trans, k)); - bch_err_msg(c, ret, "deleting redundant snapshots"); + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, + check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); + bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(trans, k)); - bch_err_msg(c, ret, "in bch2_snapshots_set_equiv"); - if (ret) + if (!delete_leaves.nr && !delete_interior.nr) goto err; - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - if (k.k->type != KEY_TYPE_snapshot) - continue; + { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "deleting leaves"); + darray_for_each(delete_leaves, i) + prt_printf(&buf, " %u", *i); - BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v) - ? snapshot_list_add(c, &deleted, k.k->p.offset) - : 0; - })); - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err; + prt_printf(&buf, " interior"); + darray_for_each(delete_interior, i) + prt_printf(&buf, " %u->%u", i->id, i->live_child); + + ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); + printbuf_exit(&buf); + if (ret) + goto err; + } for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; struct disk_reservation res = { 0 }; if (!btree_type_has_snapshots(btree)) @@ -1644,33 +1594,24 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) btree, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, &deleted, - &equiv_seen, &last_pos)); + delete_dead_snapshots_process_key(trans, &iter, k, + &delete_leaves, + &delete_interior)); bch2_disk_reservation_put(c, &res); - darray_exit(&equiv_seen); bch_err_msg(c, ret, "deleting keys from dying snapshots"); if (ret) goto err; } - bch2_trans_unlock(trans); - down_write(&c->snapshot_create_lock); - - ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ({ - u32 snapshot = k.k->p.offset; - u32 equiv = bch2_snapshot_equiv(c, snapshot); - - equiv != snapshot - ? 
snapshot_list_add(c, &deleted_interior, snapshot) - : 0; - })); - - bch_err_msg(c, ret, "walking snapshots"); - if (ret) - goto err_create_lock; + darray_for_each(delete_leaves, i) { + ret = commit_do(trans, NULL, NULL, 0, + bch2_snapshot_node_delete(trans, *i)); + bch_err_msg(c, ret, "deleting snapshot %u", *i); + if (ret) + goto err; + } /* * Fixing children of deleted snapshots can't be done completely @@ -1680,30 +1621,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); if (ret) - goto err_create_lock; - - darray_for_each(deleted, i) { - ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); - if (ret) - goto err_create_lock; - } + goto err; - darray_for_each(deleted_interior, i) { + darray_for_each(delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, - bch2_snapshot_node_delete(trans, *i)); - bch_err_msg(c, ret, "deleting snapshot %u", *i); + bch2_snapshot_node_delete(trans, i->id)); + bch_err_msg(c, ret, "deleting snapshot %u", i->id); if (ret) - goto err_create_lock; + goto err; } -err_create_lock: - up_write(&c->snapshot_create_lock); err: - darray_exit(&deleted_interior); - darray_exit(&deleted); + darray_exit(&delete_interior); + darray_exit(&delete_leaves); bch2_trans_put(trans); bch_err_fn(c, ret); return ret; @@ -1721,8 +1652,12 @@ void bch2_delete_dead_snapshots_work(struct work_struct *work) void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) + return; + + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + + if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); } @@ -1735,18 +1670,10 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - + for_each_btree_key_reverse_norestart(trans, iter, id, bpos_predecessor(pos), + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots, + k, ret) { if (!bkey_eq(pos, k.k->p)) break; @@ -1760,24 +1687,23 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return ret; } -static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) +static bool interior_snapshot_needs_delete(struct bkey_s_c_snapshot snap) { - struct bch_fs *c = trans->c; - struct bkey_s_c_snapshot snap; - int ret = 0; + /* If there's one child, it's redundant and keys will be moved to the child */ + return !!snap.v->children[0] + !!snap.v->children[1] == 1; +} +static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k) +{ if (k.k->type != KEY_TYPE_snapshot) return 0; - snap = bkey_s_c_to_snapshot(k); + struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); if (BCH_SNAPSHOT_DELETED(snap.v) || - bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || - (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - 
set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - return 0; - } + interior_snapshot_needs_delete(snap)) + set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); - return ret; + return 0; } int bch2_snapshots_read(struct bch_fs *c) @@ -1786,11 +1712,7 @@ int bch2_snapshots_read(struct bch_fs *c) for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(trans, k) ?: - bch2_check_snapshot_needs_deletion(trans, k)) ?: - for_each_btree_key(trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); + bch2_check_snapshot_needs_deletion(trans, k))); bch_err_fn(c, ret); /* diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 29c94716293e1..00373cf32e7bc 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -2,11 +2,9 @@ #ifndef _BCACHEFS_SNAPSHOT_H #define _BCACHEFS_SNAPSHOT_H -enum bch_validate_flags; - void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_snapshot_tree_validate(struct bch_fs *, struct bkey_s_c, - enum bch_validate_flags); + struct bkey_validate_context); #define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ .key_validate = bch2_snapshot_tree_validate, \ @@ -19,7 +17,8 @@ struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_snapshot_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); @@ -120,19 +119,19 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } -static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->equiv : 0; + return s ? 
s->live : 0; } -static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) { rcu_read_lock(); - id = __bch2_snapshot_equiv(c, id); + bool ret = __bch2_snapshot_exists(c, id); rcu_read_unlock(); - return id; + return ret; } static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h index aabcd3a74cd95..73c2dce0a320e 100644 --- a/fs/bcachefs/snapshot_format.h +++ b/fs/bcachefs/snapshot_format.h @@ -23,7 +23,7 @@ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) /* * Snapshot trees: * - * The snapshot_trees btree gives us persistent indentifier for each tree of + * The snapshot_trees btree gives us a persistent identifier for each tree of * bch_snapshot nodes, and allow us to record and easily find the root/master * subvolume that other snapshots were created from: */ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c new file mode 100644 index 0000000000000..f5977c5c67430 --- /dev/null +++ b/fs/bcachefs/str_hash.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "dirent.h" +#include "fsck.h" +#include "str_hash.h" +#include "subvolume.h" + +static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d) +{ + if (d.v->d_type == DT_SUBVOL) { + struct bch_subvolume subvol; + int ret = bch2_subvolume_get(trans, le32_to_cpu(d.v->d_child_subvol), + false, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + return !ret; + } else { + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k); + bch2_trans_iter_exit(trans, &iter); + return ret; + } } + +static int fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old) +{ + struct qstr old_name = bch2_dirent_get_name(old); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + int ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_dirent_init(&new->k_i); + dirent_copy_target(new, old); + new->k.p = old.k->p; + + for (unsigned i = 0; i < 1000; i++) { + unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + + if (u64s > U8_MAX) + return -EINVAL; + + new->k.u64s = u64s; + + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + (subvol_inum) { 0, old.k->p.inode }, + old.k->p.snapshot, &new->k_i, + BTREE_UPDATE_internal_snapshot_node); + if (!bch2_err_matches(ret, EEXIST)) + break; + } + + if (ret) + return ret; + + return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); } + +static int hash_pick_winner(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c k1, + struct bkey_s_c k2) +{ + if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) && + !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k))) + return 0; + + switch (desc.btree_id) { + case BTREE_ID_dirents: { + int ret = bch2_dirent_has_target(trans, bkey_s_c_to_dirent(k1)); + if (ret < 0) + return ret; + if (!ret) + return 0; + + ret = bch2_dirent_has_target(trans,
bkey_s_c_to_dirent(k2)); + if (ret < 0) + return ret; + if (!ret) + return 1; + return 2; + } + default: + return 0; + } +} + +static int repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != snapshot_root->bi_inum) + break; + if (!bkey_is_inode(k.k)) + continue; + + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + break; + + if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), + trans, inode_snapshot_mismatch, + "inode hash info in different snapshots don't match")) { + inode.bi_hash_seed = snapshot_root->bi_hash_seed; + SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); + ret = __bch2_fsck_write_inode(trans, &inode) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + break; + } + } +fsck_err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * All versions of the same inode in different snapshots must have the same hash + * seed/type: verify that the hash info we're using matches the root + */ +static int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, + struct bch_hash_info *hash_info) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) + goto found; + } + bch_err(c, "%s(): inum %llu not found", __func__, inum); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; +found:; + struct bch_inode_unpacked inode; + ret = bch2_inode_unpack(k, &inode); + if (ret) + goto err; + + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + if (memcmp(hash_info, &hash2, sizeof(hash2))) { + ret = repair_inode_hash_info(trans, &inode); + if (!ret) { + bch_err(c, "inode hash info mismatch with root, but mismatch not found"); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + int ret = 0; + + u64 hash = desc->hash_bkey(hash_info, hash_k); + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key_norestart(trans, iter, desc->btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_slots, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if (k.k->type == desc->key_type && + !desc->cmp_bkey(k, hash_k)) + goto duplicate_entries; + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + } +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +bad_hash: + /* + * Before doing any repair, check hash_info itself: + */ + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, hash_info); + if (ret) + goto out; + + if (fsck_err(trans, 
hash_table_key_wrong_offset, + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", + bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); + if (IS_ERR(new)) + return PTR_ERR(new); + + k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, + (subvol_inum) { 0, hash_k.k->p.inode }, + hash_k.k->p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(k); + if (ret) + goto out; + if (k.k) + goto duplicate_entries; + + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } +fsck_err: + goto out; +duplicate_entries: + ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? "" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); + break; + case 2: + ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; +} diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index ec2b1feea5204..55a4ac7bf220f 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -160,7 +160,7 @@ bch2_hash_lookup_in_snapshot(struct btree_trans *trans, struct bkey_s_c k; int ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), BTREE_ITER_slots|flags, k, ret) { @@ -210,7 +210,7 @@ bch2_hash_hole(struct btree_trans *trans, if (ret) return ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(inum.inum, desc.hash_key(info, key), snapshot), POS(inum.inum, U64_MAX), BTREE_ITER_slots|BTREE_ITER_intent, k, ret) @@ -265,7 +265,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, bool found = false; int ret; - for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + for_each_btree_key_max_norestart(trans, *iter, desc.btree_id, SPOS(insert->k.p.inode, desc.hash_bkey(info, bkey_i_to_s_c(insert)), snapshot), @@ -393,4 +393,26 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +struct snapshots_seen; +int __bch2_str_hash_check_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c); + +static inline int bch2_str_hash_check_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, 
struct bkey_s_c hash_k) +{ + if (hash_k.k->type != desc->key_type) + return 0; + + if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) + return 0; + + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); +} + #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 80e5efaff524b..f0e094b9c1045 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -207,7 +207,7 @@ int bch2_check_subvol_children(struct bch_fs *c) /* Subvolumes: */ int bch2_subvolume_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_subvolume subvol = bkey_s_c_to_subvolume(k); int ret = 0; @@ -286,11 +286,11 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) static __always_inline int bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), - iter_flags, subvolume, s); + BTREE_ITER_cached| + BTREE_ITER_with_updates, subvolume, s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found, trans->c, "missing subvolume %u", subvol); @@ -299,16 +299,15 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, bool inconsistent_if_not_found, - int iter_flags, struct bch_subvolume *s) { - return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, s); } int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol) { struct bch_subvolume s; - int ret = bch2_subvolume_get_inlined(trans, subvol, true, 0, &s); + int ret = bch2_subvolume_get_inlined(trans, subvol, true, &s); if (ret) return ret; @@ -328,7 +327,7 @@ int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, struct bch_snapshot snap; return bch2_snapshot_lookup(trans, snapshot, &snap) ?: - bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, subvol); } int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, @@ -388,7 +387,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, * structure of how snapshot subvolumes were created - the parent subvolume of * each snapshot subvolume. 
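(Illustrative aside, not from the patch: a hypothetical fsck call site for the str_hash helpers added above. bch2_dirent_hash_desc is the real dirent hash descriptor; the wrapper function itself is invented for the sketch.)

static int example_check_dirent_hash(struct btree_trans *trans,
				     struct snapshots_seen *s,
				     struct bch_hash_info *hash_info,
				     struct btree_iter *iter,
				     struct bkey_s_c k)
{
	/* the cheap type/hash comparison stays inline; only a mismatch
	 * falls through to __bch2_str_hash_check_key() in str_hash.c */
	return bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc,
				       hash_info, iter, k);
}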
* - * When a subvolume is deleted, we scan for child subvolumes and reparant them, + * When a subvolume is deleted, we scan for child subvolumes and reparent them, * to avoid dangling references: */ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) { struct bch_subvolume s; return lockrestart_do(trans, - bch2_subvolume_get(trans, subvolid_to_delete, true, - BTREE_ITER_cached, &s)) ?: + bch2_subvolume_get(trans, subvolid_to_delete, true, &s)) ?: for_each_btree_key_commit(trans, iter, BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, @@ -675,7 +673,7 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) /* set bi_subvol on root inode */ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) { - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw, + int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_fs_upgrade_for_subvolumes(trans)); bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f897d106e1426..910f6196700e3 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -5,12 +5,11 @@ #include "darray.h" #include "subvolume_types.h" -enum bch_validate_flags; - int bch2_check_subvols(struct bch_fs *); int bch2_check_subvol_children(struct bch_fs *); -int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_subvolume_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, @@ -25,7 +24,7 @@ int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, - bool, int, struct bch_subvolume *); + bool, struct bch_subvolume *); int __bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *, bool); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); @@ -34,7 +33,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos end, +bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, u32 subvolid, unsigned flags) { u32 snapshot; @@ -43,10 +42,10 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos return bkey_s_c_err(ret); bch2_btree_iter_set_snapshot(iter, snapshot); - return bch2_btree_iter_peek_upto_type(iter, end, flags); + return bch2_btree_iter_peek_max_type(iter, end, flags); } -#define for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ +#define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ _end, _subvolid, _flags, _k, _do) \ ({ \ struct bkey_s_c _k; \ int _ret3 = 0; \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_upto_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ _end, _subvolid, (_flags)); \ if (!(_k).k) \ break; \ @@ -67,14 +66,14 @@ bch2_btree_iter_peek_in_subvolume_upto_type(struct btree_iter *iter, struct bpos
_ret3; \ }) -#define for_each_btree_key_in_subvolume_upto(_trans, _iter, _btree_id, \ +#define for_each_btree_key_in_subvolume_max(_trans, _iter, _btree_id, \ _start, _end, _subvolid, _flags, _k, _do) \ ({ \ struct btree_iter _iter; \ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ \ - for_each_btree_key_in_subvolume_upto_continue(_trans, _iter, \ + for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ _end, _subvolid, _flags, _k, _do); \ }) diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index f2ec4277c2a50..1549d6daf7af5 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -9,13 +9,13 @@ typedef DARRAY(u32) snapshot_id_list; #define IS_ANCESTOR_BITMAP 128 struct snapshot_t { + bool live; u32 parent; u32 skip[3]; u32 depth; u32 children[2]; u32 subvol; /* Nonzero only if a subvolume points to this node: */ u32 tree; - u32 equiv; unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; }; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 7c71594f6a8bd..042dc2da35623 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -23,6 +23,7 @@ #include #include +#include static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { }; @@ -41,7 +42,7 @@ static const struct bch2_metadata_version bch2_metadata_versions[] = { #undef x }; -void bch2_version_to_text(struct printbuf *out, unsigned v) +void bch2_version_to_text(struct printbuf *out, enum bcachefs_metadata_version v) { const char *str = "(unknown version)"; @@ -54,7 +55,7 @@ void bch2_version_to_text(struct printbuf *out, unsigned v) prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); } -unsigned bch2_latest_compatible_version(unsigned v) +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version v) { if (!BCH_VERSION_MAJOR(v)) return v; @@ -68,6 +69,16 @@ unsigned bch2_latest_compatible_version(unsigned v) return v; } +void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +{ + mutex_lock(&c->sb_lock); + SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + const char * const bch2_sb_fields[] = { #define x(name, nr) #name, BCH_SB_FIELDS() @@ -368,6 +379,12 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_features; } + if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || + BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { + prt_printf(out, "Filesystem has incompatible version"); + return -BCH_ERR_invalid_sb_features; + } + block_size = le16_to_cpu(sb->block_size); if (block_size > PAGE_SECTORS) { @@ -406,11 +423,26 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_time_precision; } + /* old versions didn't know to downgrade this field */ + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(sb) > le16_to_cpu(sb->version)) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, le16_to_cpu(sb->version)); + + if (BCH_SB_VERSION_INCOMPAT(sb) > BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)) { + prt_printf(out, "Invalid version_incompat "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_str(out, " > incompat_allowed "); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + if (flags & BCH_VALIDATE_write) 
+ return -BCH_ERR_invalid_sb_version; + else + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); + } + if (!flags) { /* * Been seeing a bug where these are getting inexplicably * zeroed, so we're now validating them, but we have to be - * careful not to preven people's filesystems from mounting: + * careful not to prevent people's filesystems from mounting: */ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); @@ -428,6 +460,11 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); } +#ifdef __KERNEL__ + if (!BCH_SB_SHARD_INUMS_NBITS(sb)) + SET_BCH_SB_SHARD_INUMS_NBITS(sb, ilog2(roundup_pow_of_two(num_online_cpus()))); +#endif + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; @@ -519,6 +556,9 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.uuid = src->uuid; c->sb.user_uuid = src->user_uuid; c->sb.version = le16_to_cpu(src->version); + c->sb.version_incompat = BCH_SB_VERSION_INCOMPAT(src); + c->sb.version_incompat_allowed + = BCH_SB_VERSION_INCOMPAT_ALLOWED(src); c->sb.version_min = le16_to_cpu(src->version_min); c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); c->sb.nr_devices = src->nr_devices; @@ -676,7 +716,8 @@ static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf } enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb); - if (csum_type >= BCH_CSUM_NR) { + if (csum_type >= BCH_CSUM_NR || + bch2_csum_type_is_encryption(csum_type)) { prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); return -BCH_ERR_invalid_sb_csum_type; } @@ -878,7 +919,7 @@ static void write_super_endio(struct bio *bio) ? BCH_MEMBER_ERROR_write : BCH_MEMBER_ERROR_read, "superblock %s error: %s", - bio_data_dir(bio) ? 
"write" : "read", + str_write_read(bio_data_dir(bio)), bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; @@ -891,14 +932,15 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) struct bch_sb *sb = ca->disk_sb.sb; struct bio *bio = ca->disk_sb.bio; + memset(ca->sb_read_scratch, 0, BCH_SB_READ_SCRATCH_BUF_SIZE); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); bio->bi_end_io = write_super_endio; bio->bi_private = ca; - bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); + bch2_bio_map(bio, ca->sb_read_scratch, BCH_SB_READ_SCRATCH_BUF_SIZE); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], - bio_sectors(bio)); + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); closure_bio_submit(bio, &c->sb_write); @@ -1149,6 +1191,8 @@ bool bch2_check_version_downgrade(struct bch_fs *c) */ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + if (BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, bcachefs_metadata_version_current); if (c->sb.version > bcachefs_metadata_version_current) c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); if (c->sb.version_min > bcachefs_metadata_version_current) @@ -1157,7 +1201,7 @@ bool bch2_check_version_downgrade(struct bch_fs *c) return ret; } -void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) { lockdep_assert_held(&c->sb_lock); @@ -1167,6 +1211,10 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) c->disk_sb.sb->version = cpu_to_le16(new_version); c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + + if (incompat) + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, @@ -1331,6 +1379,14 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, bch2_version_to_text(out, le16_to_cpu(sb->version)); prt_newline(out); + prt_printf(out, "Incompatible features allowed:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT_ALLOWED(sb)); + prt_newline(out); + + prt_printf(out, "Incompatible features in use:\t"); + bch2_version_to_text(out, BCH_SB_VERSION_INCOMPAT(sb)); + prt_newline(out); + prt_printf(out, "Version upgrade complete:\t"); bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); prt_newline(out); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index fadd364e2802a..f1ab4f9437203 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -10,14 +10,29 @@ #include +#define BCH_SB_READ_SCRATCH_BUF_SIZE 4096 + static inline bool bch2_version_compatible(u16 version) { return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && version >= bcachefs_metadata_version_min; } -void bch2_version_to_text(struct printbuf *, unsigned); -unsigned bch2_latest_compatible_version(unsigned); +void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); +enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); + +void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); + +static inline 
bool bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) +{ + if (unlikely(version > c->sb.version_incompat)) { + if (version > c->sb.version_incompat_allowed) + return false; + bch2_set_version_incompat(c, version); + } + return true; +} static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) { @@ -92,7 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) } bool bch2_check_version_downgrade(struct bch_fs *); -void bch2_sb_upgrade(struct bch_fs *, unsigned); +void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a6ed9a0bf1c70..7e97c198efe2d 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -290,7 +290,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_fs_journal_stop(&c->journal); - bch_info(c, "%sshutdown complete, journal seq %llu", + bch_info(c, "%sclean shutdown complete, journal seq %llu", test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un", c->journal.seq_ondisk); @@ -441,6 +441,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) { int ret; + BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; @@ -584,7 +586,6 @@ static void __bch2_fs_free(struct bch_fs *c) #endif kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); - kfree(c->unused_inode_hints); if (c->write_ref_wq) destroy_workqueue(c->write_ref_wq); @@ -766,12 +767,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); + spin_lock_init(&c->recovery_pass_lock); sema_init(&c->online_fsck_mutex, 1); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - atomic_set(&c->journal_keys.ref, 1); - c->journal_keys.initial_ref_held = true; for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); @@ -781,6 +781,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_btree_iter_init_early(c); bch2_fs_btree_interior_update_init_early(c); + bch2_fs_journal_keys_init(c); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); bch2_fs_rebalance_init(c); @@ -809,9 +810,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->vfs_inodes_list); mutex_init(&c->vfs_inodes_lock); - c->copy_gc_enabled = 1; - c->rebalance.enabled = 1; - c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; @@ -873,8 +871,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", @@ -901,9 +897,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) !(c->online_reserved = alloc_percpu(u64)) || mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, 
c->opts.btree_node_size) || - mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || - !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, - sizeof(u64), GFP_KERNEL))) { + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { ret = -BCH_ERR_ENOMEM_fs_other_alloc; goto err; } @@ -1033,9 +1027,12 @@ int bch2_fs_start(struct bch_fs *c) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + c->recovery_task = current; ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ? bch2_fs_recovery(c) : bch2_fs_initialize(c); + c->recovery_task = NULL; + if (ret) goto err; @@ -1120,12 +1117,12 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_bdevname(&buf, fs->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time)); prt_newline(&buf); prt_bdevname(&buf, sb->bdev); prt_char(&buf, ' '); - bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time)); prt_newline(&buf); if (!opts->no_splitbrain_check) @@ -1198,7 +1195,7 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->io_done); bch2_dev_buckets_free(ca); - free_page((unsigned long) ca->sb_read_scratch); + kfree(ca->sb_read_scratch); bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); @@ -1309,8 +1306,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_completion(&ca->ref_completion); init_completion(&ca->io_ref_completion); - init_rwsem(&ca->bucket_lock); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_quantiles_init(&ca->io_latency[READ]); @@ -1337,7 +1332,7 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || + !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) goto err; @@ -1366,7 +1361,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) { struct bch_member member = bch2_sb_member_get(c->disk_sb.sb, dev_idx); struct bch_dev *ca = NULL; - int ret = 0; if (bch2_fs_init_fault("dev_alloc")) goto err; @@ -1378,10 +1372,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->fs = c; bch2_dev_attach(c, ca, dev_idx); - return ret; + return 0; err: - if (ca) - bch2_dev_free(ca); return -BCH_ERR_ENOMEM_dev_alloc; } @@ -1751,11 +1743,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err; - ret = bch2_dev_journal_alloc(ca, true); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err; - down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1806,11 +1793,14 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (ret) goto err_late; - ca->new_fs_bucket_idx = 0; - if (ca->mi.state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + up_write(&c->state_lock); return 0; diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index dada09331d2eb..fa6d522165108 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -34,16 +34,6 @@ void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); -/* - * Only for use in the recovery/fsck path: - */ -static inline void 
bch2_fs_lazy_rw(struct bch_fs *c) -{ - if (!test_bit(BCH_FS_rw, &c->flags) && - !test_bit(BCH_FS_was_rw, &c->flags)) - bch2_fs_read_write_early(c); -} - void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 03e59f86f360d..a7eb1f5114847 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -146,7 +146,7 @@ write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); write_attribute(trigger_freelist_wakeup); -rw_attribute(gc_gens_pos); +read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); @@ -203,7 +203,6 @@ read_attribute(disk_groups); read_attribute(has_data); read_attribute(alloc_debug); -read_attribute(accounting); read_attribute(usage_base); #define x(t, n, ...) read_attribute(t); @@ -211,12 +210,11 @@ BCH_PERSISTENT_COUNTERS() #undef x rw_attribute(discard); +read_attribute(state); rw_attribute(label); -rw_attribute(copy_gc_enabled); read_attribute(copy_gc_wait); -rw_attribute(rebalance_enabled); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); @@ -237,11 +235,6 @@ write_attribute(perf_test); BCH_TIME_STATS() #undef x -static struct attribute sysfs_state_rw = { - .name = "state", - .mode = 0444, -}; - static size_t bch2_btree_cache_size(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -302,7 +295,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) { - prt_printf(out, "%s: ", bch2_btree_id_str(c->gc_gens_btree)); + bch2_btree_id_to_text(out, c->gc_gens_btree); + prt_printf(out, ": "); bch2_bpos_to_text(out, c->gc_gens_pos); prt_printf(out, "\n"); } @@ -339,9 +333,6 @@ SHOW(bch2_fs) if (attr == &sysfs_gc_gens_pos) bch2_gc_gens_pos_to_text(out, c); - sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - - sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ if (attr == &sysfs_copy_gc_wait) @@ -405,9 +396,6 @@ SHOW(bch2_fs) if (attr == &sysfs_alloc_debug) bch2_fs_alloc_debug_to_text(out, c); - if (attr == &sysfs_accounting) - bch2_fs_accounting_to_text(out, c); - if (attr == &sysfs_usage_base) bch2_fs_usage_base_to_text(out, c); @@ -418,23 +406,6 @@ STORE(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); - if (attr == &sysfs_copy_gc_enabled) { - ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) - ?: (ssize_t) size; - - if (c->copygc_thread) - wake_up_process(c->copygc_thread); - return ret; - } - - if (attr == &sysfs_rebalance_enabled) { - ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) - ?: (ssize_t) size; - - rebalance_wakeup(c); - return ret; - } - sysfs_pd_controller_store(rebalance, &c->rebalance.pd); /* Debugging: */ @@ -534,15 +505,22 @@ SHOW(bch2_fs_counters) printbuf_tabstop_push(out, 32); - #define x(t, ...) \ + #define x(t, n, f, ...) \ if (attr == &sysfs_##t) { \ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + if (f & TYPE_SECTORS) { \ + counter <<= 9; \ + counter_since_mount <<= 9; \ + } \ + \ prt_printf(out, "since mount:\t"); \ + (f & TYPE_COUNTER) ? 
prt_u64(out, counter_since_mount) :\ prt_human_readable_u64(out, counter_since_mount); \ prt_newline(out); \ \ prt_printf(out, "since filesystem creation:\t"); \ + (f & TYPE_COUNTER) ? prt_u64(out, counter) : \ prt_human_readable_u64(out, counter); \ prt_newline(out); \ } @@ -610,10 +588,8 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_gc_gens_pos, - &sysfs_copy_gc_enabled, &sysfs_copy_gc_wait, - &sysfs_rebalance_enabled, sysfs_pd_controller_files(rebalance), &sysfs_moving_ctxts, @@ -622,7 +598,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_disk_groups, &sysfs_alloc_debug, - &sysfs_accounting, &sysfs_usage_base, NULL }; @@ -682,6 +657,13 @@ STORE(bch2_fs_opts_dir) (id == Opt_compression && !c->opts.background_compression))) bch2_set_rebalance_needs_scan(c, 0); + if (v && id == Opt_rebalance_enabled) + rebalance_wakeup(c); + + if (v && id == Opt_copygc_enabled && + c->copygc_thread) + wake_up_process(c->copygc_thread); + ret = size; err: bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); @@ -790,7 +772,7 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state_rw) { + if (attr == &sysfs_state) { prt_string_option(out, bch2_member_states, ca->mi.state); prt_char(out, '\n'); } @@ -870,7 +852,7 @@ struct attribute *bch2_dev_files[] = { /* settings: */ &sysfs_discard, - &sysfs_state_rw, + &sysfs_state, &sysfs_label, &sysfs_has_data, diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index fb5c1543e52f0..6c64698146375 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -131,7 +131,7 @@ static int test_iterate(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i++); @@ -186,7 +186,7 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i); @@ -242,7 +242,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(k.k->p.offset != i); @@ -259,7 +259,7 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_slots, k, ({ if (i >= nr * 2) @@ -302,7 +302,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, ({ BUG_ON(bkey_start_offset(k.k) != i + 8); @@ -320,7 +320,7 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) i = 0; ret = bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + for_each_btree_key_max(trans, iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), BTREE_ITER_slots, k, ({ if (i == nr) @@ -349,10 +349,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, 
POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -369,10 +369,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -488,7 +488,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)); + k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; @@ -726,7 +726,7 @@ static int seq_insert(struct bch_fs *c, u64 nr) static int seq_lookup(struct bch_fs *c, u64 nr) { return bch2_trans_run(c, - for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_max(trans, iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), POS(0, U64_MAX), 0, k, 0)); diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index dc6493f7bbabc..40ade9bf15d7a 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -12,7 +12,7 @@ * - sum of all event durations * - average event duration, standard and weighted * - standard deviation of event durations, standard and weighted - * and analagous statistics for the frequency of events + * and analogous statistics for the frequency of events * * We provide both mean and weighted mean (exponentially weighted), and standard * deviation and weighted standard deviation, to give an efficient-to-compute diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 5597b9d6297f1..9d40b7d4ea296 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,30 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* disk_accounting.c */ + +TRACE_EVENT(accounting_mem_insert, + TP_PROTO(struct bch_fs *c, const char *acc), + TP_ARGS(c, acc), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned, new_nr ) + __string(acc, acc ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->new_nr = c->accounting.k.nr; + __assign_str(acc); + ), + + TP_printk("%d,%d entries %u added %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->new_nr, + __get_str(acc)) +); + /* fs.c: */ TRACE_EVENT(bch2_sync_fs, TP_PROTO(struct super_block *sb, int wait), @@ -848,8 +872,8 @@ TRACE_EVENT(move_data, TRACE_EVENT(evacuate_bucket, TP_PROTO(struct bch_fs *c, struct bpos *bucket, unsigned sectors, unsigned bucket_size, - u64 
fragmentation, int ret), - TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), + int ret), + TP_ARGS(c, bucket, sectors, bucket_size, ret), TP_STRUCT__entry( __field(dev_t, dev ) @@ -857,7 +881,6 @@ TRACE_EVENT(evacuate_bucket, __field(u64, bucket ) __field(u32, sectors ) __field(u32, bucket_size ) - __field(u64, fragmentation ) __field(int, ret ) ), @@ -867,15 +890,14 @@ TRACE_EVENT(evacuate_bucket, __entry->bucket = bucket->offset; __entry->sectors = sectors; __entry->bucket_size = bucket_size; - __entry->fragmentation = fragmentation; __entry->ret = ret; ), - TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", + TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->member, __entry->bucket, __entry->sectors, __entry->bucket_size, - __entry->fragmentation, __entry->ret) + __entry->ret) ); TRACE_EVENT(copygc, @@ -1316,6 +1338,12 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced, __entry->new_u64s) ); +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + TRACE_EVENT(path_downgrade, TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, @@ -1352,10 +1380,21 @@ TRACE_EVENT(path_downgrade, __entry->pos_snapshot) ); -DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip), - TP_ARGS(trans, caller_ip) +TRACE_EVENT(key_cache_fill, + TP_PROTO(struct btree_trans *trans, const char *key), + TP_ARGS(trans, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %s", __entry->trans_fn, __get_str(key)) ); TRACE_EVENT(write_buffer_flush, @@ -1414,6 +1453,24 @@ TRACE_EVENT(write_buffer_flush_slowpath, TP_printk("%zu/%zu", __entry->slowpath, __entry->total) ); +TRACE_EVENT(write_buffer_maybe_flush, + TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *key), + TP_ARGS(trans, caller_ip, key), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __string(key, key ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __assign_str(key); + ), + + TP_printk("%s %pS %s", __entry->trans_fn, (void *) __entry->caller_ip, __get_str(key)) +); + DEFINE_EVENT(fs_str, rebalance_extent, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index fb02c1c360044..c292b9ce82406 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -317,6 +317,19 @@ do { \ _ptr ? 
container_of(_ptr, type, member) : NULL; \ }) +static inline struct list_head *list_pop(struct list_head *head) +{ + if (list_empty(head)) + return NULL; + + struct list_head *ret = head->next; + list_del_init(ret); + return ret; +} + +#define list_pop_entry(head, type, member) \ + container_of_or_null(list_pop(head), type, member) + /* Does linear interpolation between powers of two */ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) { @@ -696,4 +709,13 @@ static inline bool test_bit_le64(size_t bit, __le64 *addr) return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0; } +static inline void memcpy_swab(void *_dst, void *_src, size_t len) +{ + u8 *dst = _dst + len; + u8 *src = _src; + + while (len--) + *--dst = *src++; +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c index 6a78553d9b0cd..6620ecae26af3 100644 --- a/fs/bcachefs/varint.c +++ b/fs/bcachefs/varint.c @@ -9,6 +9,7 @@ #include <valgrind/memcheck.h> #endif +#include "errcode.h" #include "varint.h" /** @@ -53,7 +54,7 @@ int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) u64 v; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { __le64 v_le = 0; @@ -115,7 +116,7 @@ int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) unsigned bytes = ffz(*in) + 1; if (unlikely(in + bytes > end)) - return -1; + return -BCH_ERR_varint_decode_error; if (likely(bytes < 9)) { v >>= bytes; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 952aca400faf4..aed7c69841734 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -71,7 +71,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = { }; int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, - enum bch_validate_flags flags) + struct bkey_validate_context from) { struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); unsigned val_u64s = xattr_val_u64s(xattr.v->x_name_len, @@ -309,7 +309,7 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) u64 offset = 0, inum = inode->ei_inode.bi_inum; int ret = bch2_trans_run(c, - for_each_btree_key_in_subvolume_upto(trans, iter, BTREE_ID_xattrs, + for_each_btree_key_in_subvolume_max(trans, iter, BTREE_ID_xattrs, POS(inum, offset), POS(inum, U64_MAX), inode->ei_inum.subvol, 0, k, ({ @@ -565,13 +565,6 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); err: mutex_unlock(&inode->ei_update_lock); - - if (value && - (opt_id == Opt_background_target || - opt_id == Opt_background_compression || - (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) - bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); - err_class_exit: return bch2_err_class(ret); } @@ -609,7 +602,7 @@ static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { #endif /* NO_BCACHEFS_FS */ -const struct xattr_handler *bch2_xattr_handlers[] = { +const struct xattr_handler * const bch2_xattr_handlers[] = { &bch_xattr_user_handler, &bch_xattr_trusted_handler, &bch_xattr_security_handler, diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index c188a5ad64cef..132fbbd15a66d 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -6,7 +6,8 @@ extern const struct bch_hash_desc bch2_xattr_hash_desc; -int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, enum bch_validate_flags); +int bch2_xattr_validate(struct bch_fs *, struct bkey_s_c, + struct bkey_validate_context); void 
bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_xattr ((struct bkey_ops) { \ @@ -44,6 +45,6 @@ int bch2_xattr_set(struct btree_trans *, subvol_inum, ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -extern const struct xattr_handler *bch2_xattr_handlers[]; +extern const struct xattr_handler * const bch2_xattr_handlers[]; #endif /* _BCACHEFS_XATTR_H */ diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 24727ec34e5aa..6521e9a9d6ef5 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -13,7 +13,7 @@ #include <linux/namei.h> #include "internal.h" -static const struct constant_table bool_names[] = { +const struct constant_table bool_names[] = { { "0", false }, { "1", true }, { "false", false }, @@ -22,6 +22,7 @@ static const struct constant_table bool_names[] = { { "yes", true }, { }, }; +EXPORT_SYMBOL(bool_names); static const struct constant_table * __lookup_constant(const struct constant_table *tbl, const char *name) diff --git a/include/linux/fs_parser.h b/include/linux/fs_parser.h index 6cf713a7e6c6f..0974cd33bcba7 100644 --- a/include/linux/fs_parser.h +++ b/include/linux/fs_parser.h @@ -83,6 +83,8 @@ extern int fs_lookup_param(struct fs_context *fc, extern int lookup_constant(const struct constant_table tbl[], const char *name, int not_found); +extern const struct constant_table bool_names[]; + #ifdef CONFIG_VALIDATE_FS_PARSER extern bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, int low, int high, int special); diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h index 43a7b9dcf15e6..fe17b48281719 100644 --- a/include/linux/min_heap.h +++ b/include/linux/min_heap.h @@ -15,8 +15,8 @@ */ #define MIN_HEAP_PREALLOCATED(_type, _name, _nr) \ struct _name { \ - int nr; \ - int size; \ + size_t nr; \ + size_t size; \ _type *data; \ _type preallocated[_nr]; \ }
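---

Notes on the less obvious helpers above. Each sketch below is a standalone userspace model that compiles with plain gcc/clang; locally defined names are simplified stand-ins for the kernel types and helpers, not the real implementations.

The superblock field pair BCH_SB_VERSION_INCOMPAT (newest incompatible feature actually in use) and BCH_SB_VERSION_INCOMPAT_ALLOWED (newest the user has opted into) drives bch2_request_incompat_feature() in super-io.h: the common case is a single compare, and only the first use of a newer feature drops into bch2_set_version_incompat() to persist the ratchet. A minimal model of that logic, with a plain struct standing in for struct bch_fs and no persistence:

#include <stdbool.h>
#include <stdio.h>

struct sb_model {
	unsigned	version_incompat;	/* newest incompat feature in use */
	unsigned	version_incompat_allowed; /* newest the user opted into */
};

static bool request_incompat_feature(struct sb_model *sb, unsigned version)
{
	if (version > sb->version_incompat) {
		if (version > sb->version_incompat_allowed)
			return false;	/* denied: caller must emit the old format */
		/*
		 * First use of this feature: ratchet "in use" upwards.  The
		 * kernel helper persists this via bch2_set_version_incompat().
		 */
		sb->version_incompat = version;
	}
	return true;
}

int main(void)
{
	struct sb_model sb = {
		.version_incompat		= 10,
		.version_incompat_allowed	= 12,
	};

	printf("v11: %d\n", request_incompat_feature(&sb, 11));	/* 1: allowed, ratchets */
	printf("v13: %d\n", request_incompat_feature(&sb, 13));	/* 0: above allowed */
	printf("in use: %u\n", sb.version_incompat);		/* 11 */
	return 0;
}

bch2_check_version_downgrade() clamps VERSION_INCOMPAT_ALLOWED and bch2_sb_upgrade() raises it, so the gate above can only admit features the user's chosen version permits.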
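The BCH_SB_SHARD_INUMS_NBITS default replaces the old per-mount inode_shard_bits/unused_inode_hints allocation: the shard count used to spread inode number allocation across CPUs is now persisted in the superblock and defaulted from num_online_cpus() during validation. ilog2(roundup_pow_of_two(n)) is just ceil(log2(n)); a quick demonstration using userspace stand-ins for the two kernel helpers:

#include <stdio.h>

/* Userspace stand-ins for the kernel helpers of the same names (n >= 1) */
static unsigned long roundup_pow_of_two(unsigned long n)
{
	unsigned long r = 1;

	while (r < n)
		r <<= 1;
	return r;
}

static unsigned ilog2(unsigned long n)
{
	unsigned r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	/* e.g. 6 CPUs round up to 8 shards, i.e. 3 shard bits */
	for (unsigned long cpus = 1; cpus <= 12; cpus++)
		printf("%2lu cpus -> %u shard bits (%lu shards)\n",
		       cpus, ilog2(roundup_pow_of_two(cpus)),
		       roundup_pow_of_two(cpus));
	return 0;
}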
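list_pop()/list_pop_entry() in util.h give a NULL-returning dequeue of the first list element, pairing with the existing container_of_or_null(). A self-contained sketch; the list primitives here are minimal stand-ins for the kernel ones, while list_pop() and list_pop_entry() are copied from the patch:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del_init(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	entry->next = entry->prev = entry;
}

#define container_of(ptr, type, member) \
	((type *) ((char *) (ptr) - offsetof(type, member)))

/* Statement-expression form, as in util.h, so ptr is evaluated only once */
#define container_of_or_null(ptr, type, member)			\
({								\
	typeof(ptr) _ptr = (ptr);				\
	_ptr ? container_of(_ptr, type, member) : NULL;		\
})

static struct list_head *list_pop(struct list_head *head)
{
	if (list_empty(head))
		return NULL;

	struct list_head *ret = head->next;
	list_del_init(ret);
	return ret;
}

#define list_pop_entry(head, type, member) \
	container_of_or_null(list_pop(head), type, member)

struct item {
	int			v;
	struct list_head	list;
};

int main(void)
{
	struct list_head q = LIST_HEAD_INIT(q);
	struct item a = { .v = 1 }, b = { .v = 2 };
	struct item *it;

	list_add_tail(&a.list, &q);
	list_add_tail(&b.list, &q);

	while ((it = list_pop_entry(&q, struct item, list)))
		printf("%d\n", it->v);	/* prints 1, then 2, then stops */
	return 0;
}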
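memcpy_swab() copies len bytes while reversing their order, i.e. dst[len - 1 - i] = src[i], so a fixed-width big-endian field can be byte-swapped in one pass regardless of its size. A standalone demo; const is added to the source pointer here for the demo only, the helper in util.h takes plain void *:

#include <stdio.h>

/* Mirrors the helper added to util.h; buffers must not overlap */
static void memcpy_swab(void *_dst, const void *_src, size_t len)
{
	unsigned char *dst = (unsigned char *) _dst + len;
	const unsigned char *src = _src;

	while (len--)
		*--dst = *src++;
}

int main(void)
{
	const unsigned char be[4] = { 0x12, 0x34, 0x56, 0x78 };
	unsigned char out[4];

	memcpy_swab(out, be, sizeof(out));

	for (size_t i = 0; i < sizeof(out); i++)
		printf("%02x ", out[i]);	/* 78 56 34 12 */
	printf("\n");
	return 0;
}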
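Exporting bool_names lets modular filesystems (here, bcachefs option parsing) translate boolean strings through lookup_constant() instead of duplicating the table. A userspace model: the table entries match fs/fs_parser.c and the lookup_constant() signature matches include/linux/fs_parser.h, but the lookup body below is a simplified stand-in:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct constant_table {
	const char	*name;
	int		value;
};

/* Same entries as the now-exported table in fs/fs_parser.c */
static const struct constant_table bool_names[] = {
	{ "0",		false },
	{ "1",		true },
	{ "false",	false },
	{ "no",		false },
	{ "true",	true },
	{ "yes",	true },
	{ },
};

/* Simplified stand-in: linear scan until the NULL-name sentinel */
static int lookup_constant(const struct constant_table tbl[],
			   const char *name, int not_found)
{
	for (; tbl->name; tbl++)
		if (!strcmp(tbl->name, name))
			return tbl->value;
	return not_found;
}

int main(void)
{
	printf("%d\n", lookup_constant(bool_names, "yes", -1));		/* 1 */
	printf("%d\n", lookup_constant(bool_names, "0", -1));		/* 0 */
	printf("%d\n", lookup_constant(bool_names, "maybe", -1));	/* -1 */
	return 0;
}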