diff --git a/.bcachefs_revision b/.bcachefs_revision index 97936a157..e41bb4016 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -8c94740b1bf8645d3398170f41c9c88b78332252 +feaca6edbd240bbd98d261097a97037c56a09eec diff --git a/cmd_fsck.c b/cmd_fsck.c index 0954a83c0..f7dcae98e 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -97,11 +97,11 @@ int cmd_fsck(int argc, char *argv[]) exit(8); } - if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags)) { fprintf(stderr, "%s: errors fixed\n", c->name); ret |= 1; } - if (test_bit(BCH_FS_ERROR, &c->flags)) { + if (test_bit(BCH_FS_error, &c->flags)) { fprintf(stderr, "%s: still has errors\n", c->name); ret |= 4; } diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index 4c8bcf23b..8c66333bc 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -136,15 +136,30 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, - struct bkey_i_backpointer *bp_k, + struct bpos bucket, struct bch_backpointer bp, struct bkey_s_c orig_k, bool insert) { struct btree_iter bp_iter; struct bkey_s_c k; + struct bkey_i_backpointer *bp_k; int ret; + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp_k->k.p, BTREE_ITER_INTENT| diff --git a/libbcachefs/backpointers.h b/libbcachefs/backpointers.h index ab866feea..737e2396a 100644 --- a/libbcachefs/backpointers.h +++ b/libbcachefs/backpointers.h @@ -63,7 +63,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, return ret; } -int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket, struct bch_backpointer, struct bkey_s_c, bool); static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, @@ -72,28 +72,21 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_s_c orig_k, bool insert) { - struct bch_fs *c = trans->c; - struct bkey_i_backpointer *bp_k; - int ret; + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); - bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); - ret = PTR_ERR_OR_ZERO(bp_k); - if (ret) - return ret; + struct bkey_i_backpointer bp_k; - bkey_backpointer_init(&bp_k->k_i); - bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); - bp_k->v = bp; + bkey_backpointer_init(&bp_k.k_i); + bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset); + bp_k.v = bp; if (!insert) { - bp_k->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&bp_k->k, 0); + bp_k.k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k.k, 0); } - if (unlikely(bch2_backpointers_no_use_write_buffer)) - return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); - - return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i); } static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 295efeda1..2e9f4af3a 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -406,7 +406,6 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_max_in_flight) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ - x(blocked_write_buffer_full) \ x(nocow_lock_contended) enum bch_time_stats { @@ -567,32 +566,38 @@ struct bch_dev { struct io_count __percpu *io_done; }; -enum { - /* startup: */ - BCH_FS_STARTED, - BCH_FS_MAY_GO_RW, - BCH_FS_RW, - BCH_FS_WAS_RW, - - /* shutdown: */ - BCH_FS_STOPPING, - BCH_FS_EMERGENCY_RO, - BCH_FS_GOING_RO, - BCH_FS_WRITE_DISABLE_COMPLETE, - BCH_FS_CLEAN_SHUTDOWN, - - /* fsck passes: */ - BCH_FS_FSCK_DONE, - BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ - BCH_FS_NEED_ANOTHER_GC, - - BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, - - /* errors: */ - BCH_FS_ERROR, - BCH_FS_TOPOLOGY_ERROR, - BCH_FS_ERRORS_FIXED, - BCH_FS_ERRORS_NOT_FIXED, +/* + * fsck_done - kill? + * + * replace with something more general from enumated fsck passes/errors: + * initial_gc_unfixed + * error + * topology error + */ + +#define BCH_FS_FLAGS() \ + x(started) \ + x(may_go_rw) \ + x(rw) \ + x(was_rw) \ + x(stopping) \ + x(emergency_ro) \ + x(going_ro) \ + x(write_disable_complete) \ + x(clean_shutdown) \ + x(fsck_done) \ + x(initial_gc_unfixed) \ + x(need_another_gc) \ + x(need_delete_dead_snapshots) \ + x(error) \ + x(topology_error) \ + x(errors_fixed) \ + x(errors_not_fixed) + +enum bch_fs_flags { +#define x(n) BCH_FS_##n, + BCH_FS_FLAGS() +#undef x }; struct btree_debug { @@ -1068,20 +1073,10 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) #endif } -static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_GOING_RO, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget(&c->writes); -#endif -} - static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) { #ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_GOING_RO, &c->flags) && + return !test_bit(BCH_FS_going_ro, &c->flags) && atomic_long_inc_not_zero(&c->writes[ref]); #else return percpu_ref_tryget_live(&c->writes); @@ -1100,7 +1095,7 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) if (atomic_long_read(&c->writes[i])) return; - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); #else percpu_ref_put(&c->writes); diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 967780072..be0367f37 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1539,7 +1539,7 @@ struct bch_sb_field_disk_groups { x(move_extent_write, 36) \ x(move_extent_finish, 37) \ x(move_extent_fail, 38) \ - x(move_extent_alloc_mem_fail, 39) \ + x(move_extent_start_fail, 39) \ x(copygc, 40) \ x(copygc_wait, 41) \ x(gc_gens_end, 42) \ @@ -1576,7 +1576,9 @@ struct bch_sb_field_disk_groups { x(write_super, 73) \ x(trans_restart_would_deadlock_recursion_limit, 74) \ x(trans_restart_write_buffer_flush, 75) \ - x(trans_restart_split_race, 76) + x(trans_restart_split_race, 76) \ + x(write_buffer_flush_slowpath, 77) \ + x(write_buffer_flush_sync, 78) enum bch_persistent_counters { #define x(t, n, ...) BCH_COUNTER_##t, @@ -2135,8 +2137,7 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(clock, 7) \ x(dev_usage, 8) \ x(log, 9) \ - x(overwrite, 10) \ - x(write_buffer_keys, 11) + x(overwrite, 10) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 72dea90e1..47e7770d0 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -13,13 +13,6 @@ #include #include -#include - -#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ -do { \ - if (shrinker_counter) \ - bc->not_freed_##counter++; \ -} while (0) const char * const bch2_btree_node_flags[] = { #define x(f) #f, @@ -208,7 +201,7 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, * this version is for btree nodes that have already been freed (we're not * reaping a real btree node) */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) { struct btree_cache *bc = &c->btree_cache; int ret = 0; @@ -218,64 +211,38 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_dirty(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - else if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); - else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + if (!flush) return -BCH_ERR_ENOMEM_btree_node_reclaim; - } /* XXX: waiting on IO with btree cache lock held */ bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } - if (!six_trylock_intent(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); + if (!six_trylock_intent(&b->c.lock)) return -BCH_ERR_ENOMEM_btree_node_reclaim; - } - if (!six_trylock_write(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); + if (!six_trylock_write(&b->c.lock)) goto out_unlock_intent; - } /* recheck under lock */ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); - else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + if (!flush) goto out_unlock; - } six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); goto wait_on_io; } - if (btree_node_noevict(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(noevict); - goto out_unlock; - } - if (btree_node_write_blocked(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); - goto out_unlock; - } - if (btree_node_will_make_reachable(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); + if (btree_node_noevict(b) || + btree_node_write_blocked(b) || + btree_node_will_make_reachable(b)) goto out_unlock; - } if (btree_node_dirty(b)) { - if (!flush) { - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + if (!flush) goto out_unlock; - } /* * Using the underscore version because we don't want to compact * bsets after the write, since this node is about to be evicted @@ -305,14 +272,14 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, b goto out; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, false, shrinker_counter); + return __btree_node_reclaim(c, b, false); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true, false); + return __btree_node_reclaim(c, b, true); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -360,12 +327,11 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b, true)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(c, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); freed++; - bc->freed++; } } restart: @@ -374,11 +340,9 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (btree_node_accessed(b)) { clear_btree_node_accessed(b); - bc->not_freed_access_bit++; - } else if (!btree_node_reclaim(c, b, true)) { + } else if (!btree_node_reclaim(c, b)) { freed++; btree_node_data_free(c, b); - bc->freed++; bch2_btree_node_hash_remove(bc, b); six_unlock_write(&b->c.lock); @@ -428,17 +392,6 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, return btree_cache_can_free(bc); } -static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) -{ - struct bch_fs *c = shrink->private_data; - char *cbuf; - size_t buflen = seq_buf_get_buf(s, &cbuf); - struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); - - bch2_btree_cache_to_text(&out, &c->btree_cache); - seq_buf_commit(s, out.pos); -} - void bch2_fs_btree_cache_exit(struct bch_fs *c) { struct btree_cache *bc = &c->btree_cache; @@ -525,7 +478,6 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bc->shrink = shrink; shrink->count_objects = bch2_btree_cache_count; shrink->scan_objects = bch2_btree_cache_scan; - shrink->to_text = bch2_btree_cache_shrinker_to_text; shrink->seeks = 4; shrink->private_data = c; shrinker_register(shrink); @@ -599,7 +551,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) struct btree *b; list_for_each_entry_reverse(b, &bc->live, list) - if (!btree_node_reclaim(c, b, false)) + if (!btree_node_reclaim(c, b)) return b; while (1) { @@ -635,7 +587,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b, false)) { + if (!btree_node_reclaim(c, b)) { list_del_init(&b->list); goto got_node; } @@ -661,7 +613,7 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * the list. Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2, false)) { + if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); btree_node_to_freedlist(bc, b2); @@ -1257,21 +1209,9 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc stats.failed); } -void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) +void bch2_btree_cache_to_text(struct printbuf *out, const struct bch_fs *c) { - prt_printf(out, "nr nodes:\t\t%u\n", bc->used); - prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); - - prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed); - prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty); - prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight); - prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight); - prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent); - prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write); - prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit); - prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict); - prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked); - prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable); - + prt_printf(out, "nr nodes:\t\t%u\n", c->btree_cache.used); + prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&c->btree_cache.dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", c->btree_cache.alloc_lock); } diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index bfe1d7482..cfb80b201 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -126,6 +126,6 @@ static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) const char *bch2_btree_id_str(enum btree_id); void bch2_btree_pos_to_text(struct printbuf *, struct bch_fs *, const struct btree *); void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, const struct btree *); -void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); +void bch2_btree_cache_to_text(struct printbuf *, const struct bch_fs *); #endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 90f5bcfa3..70e478807 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -108,7 +108,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } } @@ -134,7 +134,7 @@ static int bch2_gc_check_topology(struct bch_fs *c, ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); goto err; } else { - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); } } @@ -619,7 +619,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id g->data_type = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -664,7 +664,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (data_type == BCH_DATA_btree) { g->data_type = data_type; - set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + set_bit(BCH_FS_need_another_gc, &c->flags); } else { do_update = true; } @@ -996,7 +996,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b /* Continue marking when opted to not * fix the error: */ ret = 0; - set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + set_bit(BCH_FS_initial_gc_unfixed, &c->flags); continue; } } else if (ret) { @@ -1847,7 +1847,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) #endif c->gc_count++; - if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + if (test_bit(BCH_FS_need_another_gc, &c->flags) || (!iter && bch2_test_restart_gc)) { if (iter++ > 2) { bch_info(c, "Unable to fix bucket gens, looping"); @@ -1859,7 +1859,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) * XXX: make sure gens we fixed got saved */ bch_info(c, "Second GC pass needed, restarting:"); - clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + clear_bit(BCH_FS_need_another_gc, &c->flags); __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); bch2_gc_stripes_reset(c, metadata_only); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index a52fd206f..bdc808087 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -781,7 +781,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat struct btree_node_iter node_iter = l->iter; struct bkey_packed *k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(path, path->level); @@ -816,7 +816,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p struct bch_fs *c = trans->c; struct bkey_s_c k; struct bkey_buf tmp; - unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + unsigned nr = test_bit(BCH_FS_started, &c->flags) ? (path->level > 1 ? 0 : 2) : (path->level > 1 ? 1 : 16); bool was_locked = btree_node_locked(path, path->level); diff --git a/libbcachefs/btree_journal_iter.c b/libbcachefs/btree_journal_iter.c index 7a5e0a893..4c084ce49 100644 --- a/libbcachefs/btree_journal_iter.c +++ b/libbcachefs/btree_journal_iter.c @@ -177,7 +177,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, struct journal_keys *keys = &c->journal_keys; size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); - BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && journal_key_cmp(&n, &keys->d[idx]) == 0) { diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c index e14e9b4cd..c64f8db06 100644 --- a/libbcachefs/btree_key_cache.c +++ b/libbcachefs/btree_key_cache.c @@ -13,7 +13,6 @@ #include "trace.h" #include -#include static inline bool btree_uses_pcpu_readers(enum btree_id id) { @@ -779,7 +778,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans, ck->valid = true; if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_bit(BKEY_CACHED_DIRTY, &ck->flags); atomic_long_inc(&c->btree_key_cache.nr_dirty); @@ -1008,7 +1007,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) if (atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal) && - test_bit(BCH_FS_WAS_RW, &c->flags)) + test_bit(BCH_FS_was_rw, &c->flags)) panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", atomic_long_read(&bc->nr_dirty)); @@ -1029,18 +1028,6 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) INIT_LIST_HEAD(&c->freed_nonpcpu); } -static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) -{ - struct bch_fs *c = shrink->private_data; - struct btree_key_cache *bc = &c->btree_key_cache; - char *cbuf; - size_t buflen = seq_buf_get_buf(s, &cbuf); - struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); - - bch2_btree_key_cache_to_text(&out, bc); - seq_buf_commit(s, out.pos); -} - int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) { struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); @@ -1064,7 +1051,6 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) shrink->seeks = 0; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; - shrink->to_text = bch2_btree_key_cache_shrinker_to_text; shrink->private_data = c; shrinker_register(shrink); return 0; diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 7210d5c22..336350bd9 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -287,7 +287,7 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, bch2_btree_add_journal_pin(c, b, journal_seq); if (unlikely(!btree_node_dirty(b))) { - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); set_btree_node_dirty_acct(c, b); } @@ -659,6 +659,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, i->k->k.needs_whiteout = false; } + if (trans->nr_wb_updates && + trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + /* * Don't get journal reservation until after we know insert will * succeed: @@ -693,6 +697,14 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) return -BCH_ERR_btree_insert_need_mark_replicas; + if (trans->nr_wb_updates) { + EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + h = trans->hooks; while (h) { ret = h->fn(trans, h); @@ -754,7 +766,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, trans_for_each_wb_update(trans, wb) { entry = bch2_journal_add_entry(j, &trans->journal_res, - BCH_JSET_ENTRY_write_buffer_keys, + BCH_JSET_ENTRY_btree_keys, wb->btree, 0, wb->k.k.u64s); bkey_copy((struct bkey_i *) entry->start, &wb->k); @@ -938,6 +950,30 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, ret = bch2_trans_relock(trans); break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_unlock(trans); + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&wb->flush_lock); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { + mutex_unlock(&wb->flush_lock); + ret = bch2_trans_relock(trans); + } + } + break; + } default: BUG_ON(ret >= 0); break; @@ -959,7 +995,7 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) int ret; if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) || - test_bit(BCH_FS_STARTED, &c->flags)) + test_bit(BCH_FS_started, &c->flags)) return -BCH_ERR_erofs_trans_commit; ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); @@ -1024,7 +1060,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) return ret; } - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; } @@ -1036,7 +1072,21 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; } - EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&c->btree_write_buffer.flush_lock); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + + EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); trans->journal_u64s = trans->extra_journal_entries.nr; trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 14983e778..2326bceb3 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -162,16 +162,6 @@ struct btree_cache { /* Number of elements in live + freeable lists */ unsigned used; unsigned reserve; - unsigned freed; - unsigned not_freed_lock_intent; - unsigned not_freed_lock_write; - unsigned not_freed_dirty; - unsigned not_freed_read_in_flight; - unsigned not_freed_write_in_flight; - unsigned not_freed_noevict; - unsigned not_freed_write_blocked; - unsigned not_freed_will_make_reachable; - unsigned not_freed_access_bit; atomic_t dirty; struct shrinker *shrink; diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 1837f8484..ba42f578f 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -693,20 +693,6 @@ int bch2_btree_delete_at(struct btree_trans *trans, return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); } -int bch2_btree_delete_at_buffered(struct btree_trans *trans, - enum btree_id btree, struct bpos pos) -{ - struct bkey_i *k; - - k = bch2_trans_kmalloc(trans, sizeof(*k)); - if (IS_ERR(k)) - return PTR_ERR(k); - - bkey_init(&k->k); - k->k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); -} - int bch2_btree_delete(struct btree_trans *trans, enum btree_id btree, struct bpos pos, unsigned update_flags) @@ -811,19 +797,13 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { - struct bkey_i *k; - int ret = 0; + struct bkey_i k; - k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); - ret = PTR_ERR_OR_ZERO(k); - if (unlikely(ret)) - return ret; - - bkey_init(&k->k); - k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; - k->k.p = pos; + bkey_init(&k.k); + k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k.k.p = pos; - return bch2_trans_update_buffered(trans, btree, k); + return bch2_trans_update_buffered(trans, btree, &k); } __printf(2, 0) diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 14a2315aa..fa19f3212 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -47,7 +47,6 @@ enum bch_trans_commit_flags { int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, unsigned, unsigned); int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned); int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, @@ -65,6 +64,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + return bch2_btree_bit_mod(trans, btree, pos, false); +} + int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, struct bpos, struct bpos); diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index bfe4d7975..68627061b 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1082,8 +1082,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, break; } + /* + * Always check for space for two keys, even if we won't have to + * split at prior level - it might have been a merge instead: + */ if (bch2_btree_node_insert_fits(c, path->l[update_level].b, - BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + BKEY_BTREE_PTR_U64s_MAX * 2)) break; split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); @@ -2052,7 +2056,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) a->seq = b->data->keys.seq; INIT_WORK(&a->work, async_btree_node_rewrite_work); - if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) { mutex_lock(&c->pending_node_rewrites_lock); list_add(&a->list, &c->pending_node_rewrites); mutex_unlock(&c->pending_node_rewrites_lock); @@ -2060,7 +2064,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) } if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { - if (test_bit(BCH_FS_STARTED, &c->flags)) { + if (test_bit(BCH_FS_started, &c->flags)) { bch_err(c, "%s: error getting c->writes ref", __func__); kfree(a); return; diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index d3c38d2c0..6a1915680 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -7,130 +7,43 @@ #include "btree_write_buffer.h" #include "error.h" #include "journal.h" -#include "journal_io.h" #include "journal_reclaim.h" -#include +#include static int bch2_btree_write_buffer_journal_flush(struct journal *, struct journal_entry_pin *, u64); -static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *); - -static inline bool __wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) -{ - return (cmp_int(l->hi, r->hi) ?: - cmp_int(l->mi, r->mi) ?: - cmp_int(l->lo, r->lo)) >= 0; -} - -static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r) -{ -#ifdef CONFIG_X86_64 - int cmp; - - asm("mov (%[l]), %%rax;" - "sub (%[r]), %%rax;" - "mov 8(%[l]), %%rax;" - "sbb 8(%[r]), %%rax;" - "mov 16(%[l]), %%rax;" - "sbb 16(%[r]), %%rax;" - : "=@ccae" (cmp) - : [l] "r" (l), [r] "r" (r) - : "rax", "cc"); - - EBUG_ON(cmp != __wb_key_cmp(l, r)); - return cmp; -#else - return __wb_key_cmp(l, r); -#endif -} - -/* Compare excluding idx, the low 24 bits: */ -static inline bool wb_key_eq(const void *_l, const void *_r) +static int btree_write_buffered_key_cmp(const void *_l, const void *_r) { - const struct wb_key_ref *l = _l; - const struct wb_key_ref *r = _r; + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; - return !((l->hi ^ r->hi)| - (l->mi ^ r->mi)| - ((l->lo >> 24) ^ (r->lo >> 24))); + return cmp_int(l->btree, r->btree) ?: + bpos_cmp(l->k.k.p, r->k.k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); } -static noinline void wb_sort(struct wb_key_ref *base, size_t num) +static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) { - size_t n = num, a = num / 2; - - if (!a) /* num < 2 || size == 0 */ - return; - - for (;;) { - size_t b, c, d; - - if (a) /* Building heap: sift down --a */ - --a; - else if (--n) /* Sorting: Extract root to --n */ - swap(base[0], base[n]); - else /* Sort complete */ - break; - - /* - * Sift element at "a" down into heap. This is the - * "bottom-up" variant, which significantly reduces - * calls to cmp_func(): we find the sift-down path all - * the way to the leaves (one compare per level), then - * backtrack to find where to insert the target element. - * - * Because elements tend to sift down close to the leaves, - * this uses fewer compares than doing two per level - * on the way down. (A bit more than half as many on - * average, 3/4 worst-case.) - */ - for (b = a; c = 2*b + 1, (d = c + 1) < n;) - b = wb_key_cmp(base + c, base + d) ? c : d; - if (d == n) /* Special case last leaf with no sibling */ - b = c; - - /* Now backtrack from "b" to the correct location for "a" */ - while (b != a && wb_key_cmp(base + a, base + b)) - b = (b - 1) / 2; - c = b; /* Where "a" belongs */ - while (b != a) { /* Shift it into place */ - b = (b - 1) / 2; - swap(base[b], base[c]); - } - } -} - -static noinline int wb_flush_one_slowpath(struct btree_trans *trans, - struct btree_iter *iter, - struct btree_write_buffered_key *wb) -{ - bch2_btree_node_unlock_write(trans, iter->path, iter->path->l[0].b); - - trans->journal_res.seq = wb->journal_seq; + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; - return bch2_trans_update(trans, iter, &wb->k, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: - bch2_trans_commit(trans, NULL, NULL, - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim); + return cmp_int(l->journal_seq, r->journal_seq); } -static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter, - struct btree_write_buffered_key *wb, - bool *write_locked, size_t *fast) +static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb, + unsigned commit_flags, + bool *write_locked, + size_t *fast) { struct bch_fs *c = trans->c; struct btree_path *path; int ret; - EBUG_ON(!wb->journal_seq); - EBUG_ON(!c->btree_write_buffer.flushing.pin.seq); - EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - ret = bch2_btree_iter_traverse(iter); if (ret) return ret; @@ -153,14 +66,46 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite *write_locked = true; } - if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) { + if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); *write_locked = false; - return wb_flush_one_slowpath(trans, iter, wb); + goto trans_commit; } bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; return 0; +trans_commit: + trans->journal_res.seq = wb->journal_seq; + + return bch2_trans_update(trans, iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + commit_flags| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim); +} + +static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) +{ + union btree_write_buffer_state old, new; + u64 v = READ_ONCE(wb->state.v); + + do { + old.v = new.v = v; + + new.nr = 0; + new.idx++; + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) + cpu_relax(); + + smp_mb(); + + return old; } /* @@ -192,79 +137,31 @@ btree_write_buffered_insert(struct btree_trans *trans, return ret; } -static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb) -{ - struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer); - struct journal *j = &c->journal; - - if (!wb->inc.keys.nr) - return; - - bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); - - darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr)); - darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) { - swap(wb->flushing.keys, wb->inc.keys); - goto out; - } - - size_t nr = min(darray_room(wb->flushing.keys), - wb->sorted.size - wb->flushing.keys.nr); - nr = min(nr, wb->inc.keys.nr); - - memcpy(&darray_top(wb->flushing.keys), - wb->inc.keys.data, - sizeof(wb->inc.keys.data[0]) * nr); - - memmove(wb->inc.keys.data, - wb->inc.keys.data + nr, - sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr)); - - wb->flushing.keys.nr += nr; - wb->inc.keys.nr -= nr; -out: - if (!wb->inc.keys.nr) - bch2_journal_pin_drop(j, &wb->inc.pin); - else - bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin, - bch2_btree_write_buffer_journal_flush); - - if (j->watermark) { - spin_lock(&j->lock); - bch2_journal_set_watermark(j); - spin_unlock(&j->lock); - } - - BUG_ON(wb->sorted.size < wb->flushing.keys.nr); -} - -static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) +int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct wb_key_ref *i; + struct journal_entry_pin pin; + struct btree_write_buffered_key *i, *keys; struct btree_iter iter = { NULL }; - size_t skipped = 0, fast = 0, slowpath = 0; + size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; bool write_locked = false; + union btree_write_buffer_state s; int ret = 0; - bch2_trans_unlock(trans); - bch2_trans_begin(trans); + memset(&pin, 0, sizeof(pin)); - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); - mutex_unlock(&wb->inc.lock); + bch2_journal_pin_copy(j, &pin, &wb->journal_pin, + bch2_btree_write_buffer_journal_flush); + bch2_journal_pin_drop(j, &wb->journal_pin); - for (size_t i = 0; i < wb->flushing.keys.nr; i++) { - wb->sorted.data[i].idx = i; - wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree; - memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos)); - } - wb->sorted.nr = wb->flushing.keys.nr; + s = btree_write_buffer_switch(wb); + keys = wb->keys[s.idx]; + nr = s.nr; + + if (race_fault()) + goto slowpath; /* * We first sort so that we can detect and skip redundant updates, and @@ -280,151 +177,111 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) * If that happens, simply skip the key so we can optimistically insert * as many keys as possible in the fast path. */ - wb_sort(wb->sorted.data, wb->sorted.nr); - - darray_for_each(wb->sorted, i) { - struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - - for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) - prefetch(&wb->flushing.keys.data[n->idx]); - - BUG_ON(!k->journal_seq); - - if (i + 1 < &darray_top(wb->sorted) && - wb_key_eq(i, i + 1)) { - struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx]; + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_key_cmp, NULL); + for (i = keys; i < keys + nr; i++) { + if (i + 1 < keys + nr && + i[0].btree == i[1].btree && + bpos_eq(i[0].k.k.p, i[1].k.k.p)) { skipped++; - n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq); - k->journal_seq = 0; + i->journal_seq = 0; continue; } if (write_locked && - (iter.path->btree_id != k->btree || - bpos_gt(k->k.k.p, iter.path->l[0].b->key.k.p))) { + (iter.path->btree_id != i->btree || + bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); write_locked = false; } - if (!iter.path || iter.path->btree_id != k->btree) { + if (!iter.path || iter.path->btree_id != i->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p, + bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } - bch2_btree_iter_set_pos(&iter, k->k.k.p); + bch2_btree_iter_set_pos(&iter, i->k.k.p); iter.path->preserve = false; do { - if (race_fault()) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; - } - - ret = wb_flush_one(trans, &iter, k, &write_locked, &fast); + ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, 0, + &write_locked, &fast); if (!write_locked) bch2_trans_begin(trans); } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); - if (!ret) { - k->journal_seq = 0; - } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { slowpath++; - ret = 0; - } else + continue; + } + if (ret) break; + + i->journal_seq = 0; } if (write_locked) bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; - - if (slowpath) { - /* - * Flush in the order they were present in the journal, so that - * we can release journal pins: - * The fastpath zapped the seq of keys that were successfully flushed so - * we can skip those here. - */ - trace_write_buffer_flush_slowpath(trans, slowpath, wb->flushing.keys.nr); - - struct btree_write_buffered_key *i; - darray_for_each(wb->flushing.keys, i) { - if (!i->journal_seq) - continue; - - bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin, - bch2_btree_write_buffer_journal_flush); - - bch2_trans_begin(trans); - - ret = commit_do(trans, NULL, NULL, - BCH_WATERMARK_reclaim| - BCH_TRANS_COMMIT_no_check_rw| - BCH_TRANS_COMMIT_no_enospc| - BCH_TRANS_COMMIT_no_journal_res| - BCH_TRANS_COMMIT_journal_reclaim, - btree_write_buffered_insert(trans, i)); - if (ret) - goto err; - } - } -err: + trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); + + if (slowpath) + goto slowpath; + bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); - trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0); - bch2_journal_pin_drop(j, &wb->flushing.pin); - wb->flushing.keys.nr = 0; +out: + bch2_journal_pin_drop(j, &pin); return ret; -} +slowpath: + trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, nr); -static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq) -{ - struct journal *j = &c->journal; - struct journal_buf *buf; - int ret = 0; + /* + * Now sort the rest by journal seq and bump the journal pin as we go. + * The slowpath zapped the seq of keys that were successfully flushed so + * we can skip those here. + */ + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_journal_cmp, + NULL); - mutex_lock(&j->buf_lock); - while ((buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) - if (bch2_journal_keys_to_write_buffer(c, buf)) { - ret = -ENOMEM; + for (i = keys; i < keys + nr; i++) { + if (!i->journal_seq) + continue; + + bch2_journal_pin_update(j, i->journal_seq, &pin, + bch2_btree_write_buffer_journal_flush); + + ret = commit_do(trans, NULL, NULL, + BCH_WATERMARK_reclaim| + BCH_TRANS_COMMIT_no_check_rw| + BCH_TRANS_COMMIT_no_enospc| + BCH_TRANS_COMMIT_no_journal_res| + BCH_TRANS_COMMIT_journal_reclaim, + btree_write_buffered_insert(trans, i)); + if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) break; - } - mutex_unlock(&j->buf_lock); + } - return ret; + goto out; } int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) { struct bch_fs *c = trans->c; - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret = 0, fetch_from_journal_err; - - trace_write_buffer_flush_sync(trans, _RET_IP_); -retry: - bch2_trans_unlock(trans); - bch2_journal_block_reservations(&c->journal); - fetch_from_journal_err = fetch_wb_keys_from_journal(c, U64_MAX); - bch2_journal_unblock(&c->journal); - - /* - * On memory allocation failure, bch2_btree_write_buffer_flush_locked() - * is not guaranteed to empty wb->inc: - */ - mutex_lock(&wb->flushing.lock); - while (!ret && - (wb->flushing.keys.nr || wb->inc.keys.nr)) - ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) + return -BCH_ERR_erofs_no_writes; - if (!ret && fetch_from_journal_err) - goto retry; + trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_); + bch2_trans_unlock(trans); + mutex_lock(&c->btree_write_buffer.flush_lock); + int ret = bch2_btree_write_buffer_flush_locked(trans); + mutex_unlock(&c->btree_write_buffer.flush_lock); + bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); return ret; } @@ -434,9 +291,9 @@ int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans) struct btree_write_buffer *wb = &c->btree_write_buffer; int ret = 0; - if (mutex_trylock(&wb->flushing.lock)) { + if (mutex_trylock(&wb->flush_lock)) { ret = bch2_btree_write_buffer_flush_locked(trans); - mutex_unlock(&wb->flushing.lock); + mutex_unlock(&wb->flush_lock); } return ret; @@ -459,195 +316,85 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret, fetch_from_journal_err; - do { - fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq); - - mutex_lock(&wb->flushing.lock); - ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - mutex_unlock(&wb->flushing.lock); - } while (!ret && - (fetch_from_journal_err || - (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq) || - (wb->inc.pin.seq && wb->inc.pin.seq <= seq))); + mutex_lock(&wb->flush_lock); + int ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); + mutex_unlock(&wb->flush_lock); return ret; } -static void bch2_btree_write_buffer_flush_work(struct work_struct *work) +static inline u64 btree_write_buffer_ref(int idx) { - struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work); - struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; - - mutex_lock(&wb->flushing.lock); - do { - ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans)); - } while (!ret && bch2_btree_write_buffer_should_flush(c)); - mutex_unlock(&wb->flushing.lock); - - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + return ((union btree_write_buffer_state) { + .ref0 = idx == 0, + .ref1 = idx == 1, + }).v; } -int __bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) +int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) { + struct bch_fs *c = trans->c; struct btree_write_buffer *wb = &c->btree_write_buffer; - int ret; -retry: - ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL); - if (!ret && dst->wb == &wb->flushing) - ret = darray_resize(&wb->sorted, wb->flushing.keys.size); - - if (unlikely(ret)) { - if (dst->wb == &c->btree_write_buffer.flushing) { - mutex_unlock(&dst->wb->lock); - dst->wb = &c->btree_write_buffer.inc; - bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin, - bch2_btree_write_buffer_journal_flush); - goto retry; - } - - return ret; - } - - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - BUG_ON(!dst->room); - BUG_ON(!dst->seq); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} + struct btree_write_buffered_key *i; + union btree_write_buffer_state old, new; + int ret = 0; + u64 v; -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; + trans_for_each_wb_update(trans, i) { + EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); - if (mutex_trylock(&wb->flushing.lock)) { - mutex_lock(&wb->inc.lock); - move_keys_from_inc_to_flushing(wb); + i->journal_seq = trans->journal_res.seq; + i->journal_offset = trans->journal_res.offset; + } - /* - * Attempt to skip wb->inc, and add keys directly to - * wb->flushing, saving us a copy later: - */ + preempt_disable(); + v = READ_ONCE(wb->state.v); + do { + old.v = new.v = v; - if (!wb->inc.keys.nr) { - dst->wb = &wb->flushing; - } else { - mutex_unlock(&wb->flushing.lock); - dst->wb = &wb->inc; + new.v += btree_write_buffer_ref(new.idx); + new.nr += trans->nr_wb_updates; + if (new.nr > wb->size) { + ret = -BCH_ERR_btree_insert_need_flush_buffer; + goto out; } - } else { - mutex_lock(&wb->inc.lock); - dst->wb = &wb->inc; - } + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); - dst->room = darray_room(dst->wb->keys); - if (dst->wb == &wb->flushing) - dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr); - dst->seq = seq; + memcpy(wb->keys[new.idx] + old.nr, + trans->wb_updates, + sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); - bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin, + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, bch2_btree_write_buffer_journal_flush); -} -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - if (!dst->wb->keys.nr) - bch2_journal_pin_drop(&c->journal, &dst->wb->pin); - - if (bch2_btree_write_buffer_should_flush(c) && - __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && - !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); - - if (dst->wb == &wb->flushing) - mutex_unlock(&wb->flushing.lock); - mutex_unlock(&wb->inc.lock); -} - -static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) -{ - struct journal_keys_to_wb dst; - struct jset_entry *entry; - struct bkey_i *k; - int ret = 0; - - bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); - - for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) { - jset_entry_for_each_key(entry, k) { - ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k); - if (ret) - goto out; - } - - entry->type = BCH_JSET_ENTRY_btree_keys; - } - - buf->need_flush_to_write_buffer = false; + atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); out: - bch2_journal_keys_to_write_buffer_end(c, &dst); - return ret; -} - -static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size) -{ - if (wb->keys.size >= new_size) - return 0; - - if (!mutex_trylock(&wb->lock)) - return -EINTR; - - int ret = darray_resize(&wb->keys, new_size); - mutex_unlock(&wb->lock); + preempt_enable(); return ret; } -int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb_keys_resize(&wb->flushing, new_size) ?: - wb_keys_resize(&wb->inc, new_size); -} - void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) && - !bch2_journal_error(&c->journal)); + BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); - darray_exit(&wb->sorted); - darray_exit(&wb->flushing.keys); - darray_exit(&wb->inc.keys); + kvfree(wb->keys[1]); + kvfree(wb->keys[0]); } int bch2_fs_btree_write_buffer_init(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; - mutex_init(&wb->inc.lock); - mutex_init(&wb->flushing.lock); - INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); + mutex_init(&wb->flush_lock); + wb->size = c->opts.btree_write_buffer_size; - /* Will be resized by journal as needed: */ - unsigned initial_size = 1 << 16; + wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); + wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); + if (!wb->keys[0] || !wb->keys[1]) + return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; - return darray_make_room(&wb->inc.keys, initial_size) ?: - darray_make_room(&wb->flushing.keys, initial_size) ?: - darray_make_room(&wb->sorted, initial_size); + return 0; } diff --git a/libbcachefs/btree_write_buffer.h b/libbcachefs/btree_write_buffer.h index 1f645f529..dec2c9a8b 100644 --- a/libbcachefs/btree_write_buffer.h +++ b/libbcachefs/btree_write_buffer.h @@ -2,59 +2,13 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H #define _BCACHEFS_BTREE_WRITE_BUFFER_H -#include "bkey.h" - -static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4; -} - -static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c) -{ - struct btree_write_buffer *wb = &c->btree_write_buffer; - - return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4; -} - -struct btree_trans; -int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +int bch2_btree_write_buffer_flush_locked(struct btree_trans *); int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *); +int bch2_btree_write_buffer_flush_sync(struct btree_trans *); int bch2_btree_write_buffer_tryflush(struct btree_trans *); -struct journal_keys_to_wb { - struct btree_write_buffer_keys *wb; - size_t room; - u64 seq; -}; - -int __bch2_journal_key_to_wb(struct bch_fs *, - struct journal_keys_to_wb *, - enum btree_id, struct bkey_i *); - -static inline int bch2_journal_key_to_wb(struct bch_fs *c, - struct journal_keys_to_wb *dst, - enum btree_id btree, struct bkey_i *k) -{ - EBUG_ON(!dst->seq); - - if (unlikely(!dst->room)) - return __bch2_journal_key_to_wb(c, dst, btree, k); - - struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys); - wb_k->journal_seq = dst->seq; - wb_k->btree = btree; - bkey_copy(&wb_k->k, k); - dst->wb->keys.nr++; - dst->room--; - return 0; -} - -void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64); -void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *); +int bch2_btree_insert_keys_write_buffer(struct btree_trans *); -int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); diff --git a/libbcachefs/btree_write_buffer_types.h b/libbcachefs/btree_write_buffer_types.h index 9b9433de9..99993ba77 100644 --- a/libbcachefs/btree_write_buffer_types.h +++ b/libbcachefs/btree_write_buffer_types.h @@ -2,56 +2,43 @@ #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -#include "darray.h" #include "journal_types.h" #define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 #define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) -struct wb_key_ref { -union { - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - unsigned idx:24; - u8 pos[sizeof(struct bpos)]; - enum btree_id btree:8; -#else - enum btree_id btree:8; - u8 pos[sizeof(struct bpos)]; - unsigned idx:24; -#endif - } __packed; - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - u64 lo; - u64 mi; - u64 hi; -#else - u64 hi; - u64 mi; - u64 lo; -#endif - }; -}; -}; - struct btree_write_buffered_key { - enum btree_id btree:8; - u64 journal_seq:56; + u64 journal_seq; + unsigned journal_offset; + enum btree_id btree; __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); }; -struct btree_write_buffer_keys { - DARRAY(struct btree_write_buffered_key) keys; - struct journal_entry_pin pin; - struct mutex lock; +union btree_write_buffer_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 nr:23; + u64 idx:1; + u64 ref0:20; + u64 ref1:20; + }; }; struct btree_write_buffer { - DARRAY(struct wb_key_ref) sorted; - struct btree_write_buffer_keys inc; - struct btree_write_buffer_keys flushing; - struct work_struct flush_work; + struct mutex flush_lock; + struct journal_entry_pin journal_pin; + + union btree_write_buffer_state state; + size_t size; + + struct btree_write_buffered_key *keys[2]; }; #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 50eb6ba2f..312bd0c86 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -334,7 +334,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, preempt_enable(); } -struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) +static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) { return (struct bch_alloc_v4) { .gen = b.gen, @@ -346,13 +346,12 @@ struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) } static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, - struct bucket old, struct bucket new, - u64 journal_seq, bool gc) + struct bucket old, struct bucket new) { bch2_dev_usage_update(c, ca, bucket_m_to_alloc(old), bucket_m_to_alloc(new), - journal_seq, gc); + 0, true); } static inline int __update_replicas(struct bch_fs *c, @@ -658,7 +657,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, err: bucket_unlock(g); if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, 0, true); + bch2_dev_usage_update_m(c, ca, old, new); percpu_up_read(&c->mark_lock); return ret; } @@ -773,7 +772,6 @@ static int mark_stripe_bucket(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - u64 journal_seq = trans->journal_res.seq; const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; unsigned nr_data = s->nr_blocks - s->nr_redundant; bool parity = ptr_idx >= nr_data; @@ -820,7 +818,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, err: bucket_unlock(g); if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + bch2_dev_usage_update_m(c, ca, old, new); percpu_up_read(&c->mark_lock); printbuf_exit(&buf); return ret; @@ -843,8 +841,12 @@ static int __mark_pointer(struct btree_trans *trans, return ret; *dst_sectors += sectors; - *bucket_data_type = *dirty_sectors || *cached_sectors - ? ptr_data_type : 0; + + if (!*dirty_sectors && !*cached_sectors) + *bucket_data_type = 0; + else if (*bucket_data_type != BCH_DATA_stripe) + *bucket_data_type = ptr_data_type; + return 0; } @@ -855,7 +857,6 @@ static int bch2_mark_pointer(struct btree_trans *trans, s64 sectors, unsigned flags) { - u64 journal_seq = trans->journal_res.seq; struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket old, new, *g; @@ -882,7 +883,7 @@ static int bch2_mark_pointer(struct btree_trans *trans, new = *g; bucket_unlock(g); if (!ret) - bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + bch2_dev_usage_update_m(c, ca, old, new); percpu_up_read(&c->mark_lock); return ret; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 118f0c0c4..ba0436ae6 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -418,7 +418,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, unsigned i; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) @@ -492,7 +492,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_dev *ca; unsigned i; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -533,7 +533,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, struct bch_dev *ca; int ret = 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -725,7 +725,7 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); } - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EINVAL; switch (cmd) { diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 2a02bf00b..bc8b556f1 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -1005,7 +1005,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) unsigned i, nr_data = v->nr_blocks - v->nr_redundant; int ret = 0; - ret = bch2_btree_write_buffer_flush_sync(trans); + ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); if (ret) goto err; @@ -1415,7 +1415,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans, if (ret) return ERR_PTR(ret); - if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + if (test_bit(BCH_FS_going_ro, &c->flags)) { h = ERR_PTR(-BCH_ERR_erofs_no_writes); goto found; } diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 4d35e5c6c..e3e2be792 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -150,6 +150,7 @@ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ x(0, backpointer_to_overwritten_btree_node) \ x(0, lock_fail_root_changed) \ x(0, journal_reclaim_would_deadlock) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 7b28d3792..655e3ba9b 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -7,7 +7,7 @@ bool bch2_inconsistent_error(struct bch_fs *c) { - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_error, &c->flags); switch (c->opts.errors) { case BCH_ON_ERROR_continue: @@ -26,8 +26,8 @@ bool bch2_inconsistent_error(struct bch_fs *c) void bch2_topology_error(struct bch_fs *c) { - set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + set_bit(BCH_FS_topology_error, &c->flags); + if (test_bit(BCH_FS_fsck_done, &c->flags)) bch2_inconsistent_error(c); } @@ -114,7 +114,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) { struct fsck_err_state *s; - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + if (test_bit(BCH_FS_fsck_done, &c->flags)) return NULL; list_for_each_entry(s, &c->fsck_error_msgs, list) @@ -193,7 +193,7 @@ int bch2_fsck_err(struct bch_fs *c, prt_printf(out, bch2_log_msg(c, "")); #endif - if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (test_bit(BCH_FS_fsck_done, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { prt_str(out, ", shutting down"); @@ -253,7 +253,7 @@ int bch2_fsck_err(struct bch_fs *c, if (print) bch2_print_string_as_lines(KERN_ERR, out->buf); - if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + if (!test_bit(BCH_FS_fsck_done, &c->flags) && (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)) bch_err(c, "Unable to continue, halting"); @@ -271,10 +271,10 @@ int bch2_fsck_err(struct bch_fs *c, bch2_inconsistent_error(c); if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } else { - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); - set_bit(BCH_FS_ERROR, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } return ret; diff --git a/libbcachefs/fs-io-buffered.c b/libbcachefs/fs-io-buffered.c index 52f0e7acd..637a83e4d 100644 --- a/libbcachefs/fs-io-buffered.c +++ b/libbcachefs/fs-io-buffered.c @@ -638,7 +638,7 @@ static int __bch2_writepage(struct folio *folio, /* Check for writing past i_size: */ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > round_up(i_size, block_bytes(c)) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + !test_bit(BCH_FS_emergency_ro, &c->flags), "writing past i_size: %llu > %llu (unrounded %llu)\n", bio_end_sector(&w->io->op.wbio.bio) << 9, round_up(i_size, block_bytes(c)), diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index b1c89a482..8cf4bcf9b 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -448,7 +448,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, bch2_btree_id_str(btree_id), pos.inode, pos.offset, i->id, n.id, n.equiv); - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); } } diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index a2c96f7c1..b861ab2e2 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -1173,7 +1173,7 @@ int bch2_delete_dead_inodes(struct bch_fs *c) break; if (ret) { - if (!test_bit(BCH_FS_RW, &c->flags)) { + if (!test_bit(BCH_FS_rw, &c->flags)) { bch2_trans_unlock(trans); bch2_fs_lazy_rw(c); } diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 86b148d9b..d5540c856 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -10,7 +10,6 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" -#include "btree_write_buffer.h" #include "buckets.h" #include "error.h" #include "journal.h" @@ -148,7 +147,6 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) bch2_journal_reclaim_fast(j); if (write) closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); - wake_up(&j->wait); } /* @@ -332,7 +330,6 @@ static int journal_entry_open(struct journal *j) buf->must_flush = false; buf->separate_flush = false; buf->flush_time = 0; - buf->need_flush_to_write_buffer = true; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -769,75 +766,6 @@ void bch2_journal_block(struct journal *j) journal_quiesce(j); } -/* - * XXX: ideally this would not be closing the current journal entry, but - * otherwise we do not have a way to avoid racing with res_get() - j->blocked - * will race. - */ -static bool journal_reservations_stopped(struct journal *j) -{ - union journal_res_state s; - - journal_entry_close(j); - - s.v = atomic64_read_acquire(&j->reservations.counter); - - return s.buf0_count == 0 && - s.buf1_count == 0 && - s.buf2_count == 0 && - s.buf3_count == 0; -} - -void bch2_journal_block_reservations(struct journal *j) -{ - spin_lock(&j->lock); - j->blocked++; - spin_unlock(&j->lock); - - wait_event(j->wait, journal_reservations_stopped(j)); -} - -static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) -{ - spin_lock(&j->lock); - max_seq = min(max_seq, journal_cur_seq(j)); - - for (u64 seq = journal_last_unwritten_seq(j); - seq <= max_seq; - seq++) { - unsigned idx = seq & JOURNAL_BUF_MASK; - struct journal_buf *buf = j->buf + idx; - union journal_res_state s; - - if (!buf->need_flush_to_write_buffer) - continue; - - if (seq == journal_cur_seq(j)) - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); - - s.v = atomic64_read_acquire(&j->reservations.counter); - - if (journal_state_count(s, idx)) { - spin_unlock(&j->lock); - return ERR_PTR(-EAGAIN); - } - - spin_unlock(&j->lock); - return buf; - } - - spin_unlock(&j->lock); - return NULL; -} - -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq) -{ - struct journal_buf *ret; - - wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN)); - return ret; -} - /* allocate journal on a device: */ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, @@ -1289,7 +1217,6 @@ int bch2_fs_journal_init(struct journal *j) static struct lock_class_key res_key; unsigned i; - mutex_init(&j->buf_lock); spin_lock_init(&j->lock); spin_lock_init(&j->err_lock); init_waitqueue_head(&j->wait); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index b5185f97a..c85d01cf4 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -259,7 +259,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u { union journal_res_state s; - s.v = atomic64_sub_return_release(((union journal_res_state) { + s.v = atomic64_sub_return(((union journal_res_state) { .buf0_count = idx == 0, .buf1_count = idx == 1, .buf2_count = idx == 2, @@ -427,8 +427,6 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -void bch2_journal_block_reservations(struct journal *); -struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq); void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index c2a655235..fe4565fcd 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -4,7 +4,6 @@ #include "alloc_foreground.h" #include "btree_io.h" #include "btree_update_interior.h" -#include "btree_write_buffer.h" #include "buckets.h" #include "checksum.h" #include "disk_groups.h" @@ -722,22 +721,6 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } -static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, - struct jset *jset, - struct jset_entry *entry, - unsigned version, int big_endian, - enum bkey_invalid_flags flags) -{ - return journal_entry_btree_keys_validate(c, jset, entry, - version, big_endian, READ); -} - -static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c, - struct jset_entry *entry) -{ - journal_entry_btree_keys_to_text(out, c, entry); -} - struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -1518,8 +1501,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - /* we aren't holding j->lock: */ unsigned new_size = READ_ONCE(j->buf_size_want); void *new_buf; @@ -1527,11 +1508,6 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (buf->buf_size >= new_size) return; - size_t btree_write_buffer_size = new_size / 64; - - if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) - return; - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1621,7 +1597,6 @@ static CLOSURE_CALLBACK(journal_write_done) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - bch2_journal_reclaim_fast(j); bch2_journal_space_available(j); track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], @@ -1725,11 +1700,9 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) struct bch_fs *c = container_of(j, struct bch_fs, journal); struct jset_entry *start, *end, *i, *next, *prev = NULL; struct jset *jset = w->data; - struct journal_keys_to_wb wb = { NULL }; unsigned sectors, bytes, u64s; - unsigned long btree_roots_have = 0; bool validate_before_checksum = false; - u64 seq = le64_to_cpu(jset->seq); + unsigned long btree_roots_have = 0; int ret; /* @@ -1757,28 +1730,9 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) * to c->btree_roots we have to get any missing btree roots and * add them to this journal entry: */ - switch (i->type) { - case BCH_JSET_ENTRY_btree_root: + if (i->type == BCH_JSET_ENTRY_btree_root) { bch2_journal_entry_to_btree_root(c, i); __set_bit(i->btree_id, &btree_roots_have); - break; - case BCH_JSET_ENTRY_write_buffer_keys: - EBUG_ON(!w->need_flush_to_write_buffer); - - if (!wb.wb) - bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - - struct bkey_i *k; - jset_entry_for_each_key(i, k) { - ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); - if (ret) { - bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer"); - bch2_journal_keys_to_write_buffer_end(c, &wb); - return ret; - } - } - i->type = BCH_JSET_ENTRY_btree_keys; - break; } /* Can we merge with previous entry? */ @@ -1801,10 +1755,6 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) memmove_u64s_down(prev, i, jset_u64s(u64s)); } - if (wb.wb) - bch2_journal_keys_to_write_buffer_end(c, &wb); - w->need_flush_to_write_buffer = false; - prev = prev ? vstruct_next(prev) : jset->start; jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); @@ -1812,7 +1762,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); - bch2_journal_super_entries_add_common(c, &end, seq); + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1835,7 +1786,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) - j->last_empty_seq = seq; + j->last_empty_seq = le64_to_cpu(jset->seq); if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) validate_before_checksum = true; @@ -1931,11 +1882,9 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; - mutex_lock(&j->buf_lock); journal_buf_realloc(j, w); ret = bch2_journal_write_prep(j, w); - mutex_unlock(&j->buf_lock); if (ret) goto err; diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 2aa4c0c6b..658aaa2c3 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -3,7 +3,6 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" -#include "btree_write_buffer.h" #include "buckets.h" #include "errcode.h" #include "error.h" @@ -51,23 +50,20 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j, return available; } -void bch2_journal_set_watermark(struct journal *j) +static inline void journal_set_watermark(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); bool low_on_space = j->space[journal_space_clean].total * 4 <= j->space[journal_space_total].total; bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4; - bool low_on_wb = bch2_btree_write_buffer_must_wait(c); - unsigned watermark = low_on_space || low_on_pin || low_on_wb + unsigned watermark = low_on_space || low_on_pin ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], &j->low_on_space_start, low_on_space) || track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], - &j->write_buffer_full_start, low_on_wb)) + &j->low_on_pin_start, low_on_pin)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); @@ -234,7 +230,7 @@ void bch2_journal_space_available(struct journal *j) else clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); - bch2_journal_set_watermark(j); + journal_set_watermark(j); out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -307,7 +303,6 @@ void bch2_journal_reclaim_fast(struct journal *j) * all btree nodes got written out */ while (!fifo_empty(&j->pin) && - j->pin.front <= j->seq_ondisk && !atomic_read(&fifo_peek_front(&j->pin).count)) { j->pin.front++; popped = true; diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index ec84c3345..7b15d682a 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -16,7 +16,6 @@ static inline void journal_reclaim_kick(struct journal *j) unsigned bch2_journal_dev_buckets_available(struct journal *, struct journal_device *, enum journal_space_from); -void bch2_journal_set_watermark(struct journal *); void bch2_journal_space_available(struct journal *); static inline bool journal_pin_active(struct journal_entry_pin *pin) diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index f9d9aa95b..0200e299c 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -267,7 +267,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work) while (!(ret = PTR_ERR_OR_ZERO(b)) && b && - !test_bit(BCH_FS_STOPPING, &c->flags)) + !test_bit(BCH_FS_stopping, &c->flags)) b = bch2_btree_iter_next_node(&iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 85c543af6..2427cce64 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -36,7 +36,6 @@ struct journal_buf { bool noflush; /* write has already been kicked off, and was noflush */ bool must_flush; /* something wants a flush */ bool separate_flush; - bool need_flush_to_write_buffer; }; /* @@ -182,12 +181,6 @@ struct journal { */ darray_u64 early_journal_entries; - /* - * Protects journal_buf->data, when accessing without a jorunal - * reservation: for synchronization between the btree write buffer code - * and the journal write path: - */ - struct mutex buf_lock; /* * Two journal entries -- one is currently open for new entries, the * other is possibly being written out. @@ -278,7 +271,6 @@ struct journal { u64 low_on_space_start; u64 low_on_pin_start; u64 max_in_flight_start; - u64 write_buffer_full_start; struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index 5340f2d0e..82c08a987 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -123,9 +123,11 @@ int bch2_check_lru_key(struct btree_trans *trans, if (lru_k.k->type != KEY_TYPE_set || lru_pos_time(lru_k.k->p) != idx) { if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { - *last_flushed_pos = lru_k.k->p; - ret = bch2_btree_write_buffer_flush_sync(trans) ?: - -BCH_ERR_transaction_restart_write_buffer_flush; + ret = bch2_btree_write_buffer_flush_sync(trans); + if (!ret) { + *last_flushed_pos = lru_k.k->p; + ret = -BCH_ERR_transaction_restart_write_buffer_flush; + } goto out; } diff --git a/libbcachefs/move.c b/libbcachefs/move.c index c5518a866..db14ec376 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -56,17 +56,6 @@ static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) } } -static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k) -{ - if (trace_move_extent_alloc_mem_fail_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_alloc_mem_fail(c, buf.buf); - printbuf_exit(&buf); - } -} - struct moving_io { struct list_head read_list; struct list_head io_list; @@ -356,8 +345,16 @@ int bch2_move_extent(struct moving_context *ctxt, if (ret == -BCH_ERR_data_update_done) return 0; - this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); - trace_move_extent_alloc_mem_fail2(c, k); + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]); + if (trace_move_extent_start_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, ": "); + prt_str(&buf, bch2_err_str(ret)); + trace_move_extent_start_fail(c, buf.buf); + printbuf_exit(&buf); + } return ret; } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index b7f9990c5..8526f1774 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -233,6 +233,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(btree_write_buffer_size, u32, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(16, (1U << 20) - 1), \ + BCH2_NO_SB_OPT, 1U << 13, \ + NULL, "Number of btree write buffer entries") \ x(gc_reserve_percent, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(5, 21), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 5f4f76e67..3f8c3ba10 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -530,7 +530,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); keys->gap = keys->nr; - set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_may_go_rw, &c->flags); if (keys->nr) return bch2_fs_read_write_early(c); return 0; @@ -876,13 +876,13 @@ int bch2_fs_recovery(struct bch_fs *c) /* If we fixed errors, verify that fs is actually clean now: */ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && - !test_bit(BCH_FS_ERROR, &c->flags)) { + test_bit(BCH_FS_errors_fixed, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags) && + !test_bit(BCH_FS_error, &c->flags)) { bch2_flush_fsck_errs(c); bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); - clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); + clear_bit(BCH_FS_errors_fixed, &c->flags); c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; @@ -890,13 +890,13 @@ int bch2_fs_recovery(struct bch_fs *c) if (ret) goto err; - if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || - test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + if (test_bit(BCH_FS_errors_fixed, &c->flags) || + test_bit(BCH_FS_errors_not_fixed, &c->flags)) { bch_err(c, "Second fsck run was not clean"); - set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_errors_not_fixed, &c->flags); } - set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + set_bit(BCH_FS_errors_fixed, &c->flags); } if (enabled_qtypes(c)) { @@ -913,14 +913,14 @@ int bch2_fs_recovery(struct bch_fs *c) write_sb = true; } - if (!test_bit(BCH_FS_ERROR, &c->flags)) { + if (!test_bit(BCH_FS_error, &c->flags)) { c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); write_sb = true; } if (c->opts.fsck && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_errors_not_fixed, &c->flags)) { SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); write_sb = true; @@ -954,7 +954,7 @@ int bch2_fs_recovery(struct bch_fs *c) ret = 0; out: - set_bit(BCH_FS_FSCK_DONE, &c->flags); + set_bit(BCH_FS_fsck_done, &c->flags); bch2_flush_fsck_errs(c); if (!c->opts.keep_journal && @@ -962,7 +962,7 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_journal_keys_put_initial(c); kfree(clean); - if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) { + if (!ret && test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) { bch2_fs_read_write_early(c); bch2_delete_dead_snapshots_async(c); } @@ -1001,8 +1001,8 @@ int bch2_fs_initialize(struct bch_fs *c) mutex_unlock(&c->sb_lock); c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); - set_bit(BCH_FS_MAY_GO_RW, &c->flags); - set_bit(BCH_FS_FSCK_DONE, &c->flags); + set_bit(BCH_FS_may_go_rw, &c->flags); + set_bit(BCH_FS_fsck_done, &c->flags); for (i = 0; i < BTREE_ID_NR; i++) bch2_btree_root_alloc(c, i); diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index b23550b44..e473c788f 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -318,7 +318,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, __set_is_ancestor_bitmap(c, id); if (BCH_SNAPSHOT_DELETED(s.v)) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } @@ -1376,10 +1376,10 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) u32 *i, id; int ret = 0; - if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) return 0; - if (!test_bit(BCH_FS_STARTED, &c->flags)) { + if (!test_bit(BCH_FS_started, &c->flags)) { ret = bch2_fs_read_write_early(c); if (ret) { bch_err_msg(c, ret, "deleting dead snapshots: error going rw"); @@ -1680,7 +1680,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct if (BCH_SNAPSHOT_DELETED(snap.v) || bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset || (ret = bch2_snapshot_needs_delete(trans, k)) > 0) { - set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags); + set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); return 0; } diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 512d56657..136c01403 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -916,9 +916,9 @@ int bch2_write_super(struct bch_fs *c) le64_add_cpu(&c->disk_sb.sb->seq, 1); - if (test_bit(BCH_FS_ERROR, &c->flags)) + if (test_bit(BCH_FS_error, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); - if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) + if (test_bit(BCH_FS_topology_error, &c->flags)) SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 552d55dd9..e7f186b45 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -73,6 +73,13 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet "); MODULE_DESCRIPTION("bcachefs filesystem"); +const char * const bch2_fs_flag_strs[] = { +#define x(n) #n, + BCH_FS_FLAGS() +#undef x + NULL +}; + #define KTYPE(type) \ static const struct attribute_group type ## _group = { \ .attrs = type ## _files \ @@ -240,8 +247,8 @@ static void __bch2_fs_read_only(struct bch_fs *c) journal_cur_seq(&c->journal)); if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) - set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + !test_bit(BCH_FS_emergency_ro, &c->flags)) + set_bit(BCH_FS_clean_shutdown, &c->flags); bch2_fs_journal_stop(&c->journal); /* @@ -256,19 +263,19 @@ static void bch2_writes_disabled(struct percpu_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); - set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } #endif void bch2_fs_read_only(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags)) { + if (!test_bit(BCH_FS_rw, &c->flags)) { bch2_journal_reclaim_stop(&c->journal); return; } - BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags)); bch_verbose(c, "going read-only"); @@ -276,7 +283,7 @@ void bch2_fs_read_only(struct bch_fs *c) * Block new foreground-end write operations from starting - any new * writes will return -EROFS: */ - set_bit(BCH_FS_GOING_RO, &c->flags); + set_bit(BCH_FS_going_ro, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_kill(&c->writes); #else @@ -296,36 +303,35 @@ void bch2_fs_read_only(struct bch_fs *c) * that going RO is complete: */ wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || - test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags) || + test_bit(BCH_FS_emergency_ro, &c->flags)); - bool writes_disabled = test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags); if (writes_disabled) bch_verbose(c, "finished waiting for writes to stop"); __bch2_fs_read_only(c); wait_event(bch2_read_only_wait, - test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + test_bit(BCH_FS_write_disable_complete, &c->flags)); if (!writes_disabled) bch_verbose(c, "finished waiting for writes to stop"); - clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); - clear_bit(BCH_FS_GOING_RO, &c->flags); - clear_bit(BCH_FS_RW, &c->flags); + clear_bit(BCH_FS_write_disable_complete, &c->flags); + clear_bit(BCH_FS_going_ro, &c->flags); + clear_bit(BCH_FS_rw, &c->flags); if (!bch2_journal_error(&c->journal) && - !test_bit(BCH_FS_ERROR, &c->flags) && - !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && - test_bit(BCH_FS_STARTED, &c->flags) && - test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && + !test_bit(BCH_FS_error, &c->flags) && + !test_bit(BCH_FS_emergency_ro, &c->flags) && + test_bit(BCH_FS_started, &c->flags) && + test_bit(BCH_FS_clean_shutdown, &c->flags) && !c->opts.norecovery) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_read(&c->btree_cache.dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); - BUG_ON(c->btree_write_buffer.inc.keys.nr); - BUG_ON(c->btree_write_buffer.flushing.keys.nr); + BUG_ON(c->btree_write_buffer.state.nr); bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); @@ -351,7 +357,7 @@ static void bch2_fs_read_only_async(struct bch_fs *c) bool bch2_fs_emergency_read_only(struct bch_fs *c) { - bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); bch2_journal_halt(&c->journal); bch2_fs_read_only_async(c); @@ -392,12 +398,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) unsigned i; int ret; - if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; } - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) return 0; if (c->opts.norecovery) @@ -420,7 +426,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; - clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + clear_bit(BCH_FS_clean_shutdown, &c->flags); /* * First journal write must be a flush write: after a clean shutdown we @@ -434,8 +440,8 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - set_bit(BCH_FS_RW, &c->flags); - set_bit(BCH_FS_WAS_RW, &c->flags); + set_bit(BCH_FS_rw, &c->flags); + set_bit(BCH_FS_was_rw, &c->flags); #ifndef BCH_WRITE_REF_DEBUG percpu_ref_reinit(&c->writes); @@ -468,7 +474,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_do_pending_node_rewrites(c); return 0; err: - if (test_bit(BCH_FS_RW, &c->flags)) + if (test_bit(BCH_FS_rw, &c->flags)) bch2_fs_read_only(c); else __bch2_fs_read_only(c); @@ -568,7 +574,7 @@ void __bch2_fs_stop(struct bch_fs *c) bch_verbose(c, "shutting down"); - set_bit(BCH_FS_STOPPING, &c->flags); + set_bit(BCH_FS_stopping, &c->flags); cancel_work_sync(&c->journal_seq_blacklist_gc_work); @@ -960,7 +966,7 @@ int bch2_fs_start(struct bch_fs *c) down_write(&c->state_lock); - BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); + BUG_ON(test_bit(BCH_FS_started, &c->flags)); mutex_lock(&c->sb_lock); @@ -995,12 +1001,12 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - set_bit(BCH_FS_STARTED, &c->flags); + set_bit(BCH_FS_started, &c->flags); if (c->opts.read_only || c->opts.nochanges) { bch2_fs_read_only(c); } else { - ret = !test_bit(BCH_FS_RW, &c->flags) + ret = !test_bit(BCH_FS_rw, &c->flags) ? bch2_fs_read_write(c) : bch2_fs_read_write_late(c); if (ret) diff --git a/libbcachefs/super.h b/libbcachefs/super.h index bf762df18..dada09331 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -8,6 +8,8 @@ #include +extern const char * const bch2_fs_flag_strs[]; + struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); @@ -37,8 +39,8 @@ int bch2_fs_read_write_early(struct bch_fs *); */ static inline void bch2_fs_lazy_rw(struct bch_fs *c) { - if (!test_bit(BCH_FS_RW, &c->flags) && - !test_bit(BCH_FS_WAS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags) && + !test_bit(BCH_FS_was_rw, &c->flags)) bch2_fs_read_write_early(c); } diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 7223418d3..4a7c93bcf 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -145,6 +145,7 @@ rw_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); +read_attribute(flags); read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); @@ -268,7 +269,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c memset(s, 0, sizeof(s)); - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; trans = bch2_trans_get(c); @@ -384,6 +385,9 @@ SHOW(bch2_fs) sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + if (attr == &sysfs_flags) + prt_bitflags(out, bch2_fs_flag_strs, c->flags); + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); if (attr == &sysfs_btree_write_stats) @@ -416,7 +420,7 @@ SHOW(bch2_fs) bch2_btree_updates_to_text(out, c); if (attr == &sysfs_btree_cache) - bch2_btree_cache_to_text(out, &c->btree_cache); + bch2_btree_cache_to_text(out, c); if (attr == &sysfs_btree_key_cache) bch2_btree_key_cache_to_text(out, &c->btree_key_cache); @@ -497,12 +501,12 @@ STORE(bch2_fs) /* Debugging: */ - if (!test_bit(BCH_FS_STARTED, &c->flags)) + if (!test_bit(BCH_FS_started, &c->flags)) return -EPERM; /* Debugging: */ - if (!test_bit(BCH_FS_RW, &c->flags)) + if (!test_bit(BCH_FS_rw, &c->flags)) return -EROFS; if (attr == &sysfs_prune_cache) { @@ -634,6 +638,7 @@ STORE(bch2_fs_internal) SYSFS_OPS(bch2_fs_internal); struct attribute *bch2_fs_internal_files[] = { + &sysfs_flags, &sysfs_journal_debug, &sysfs_btree_updates, &sysfs_btree_cache, diff --git a/libbcachefs/trace.h b/libbcachefs/trace.h index 4980cfdd1..6eced95ce 100644 --- a/libbcachefs/trace.h +++ b/libbcachefs/trace.h @@ -775,9 +775,9 @@ TRACE_EVENT(move_extent_fail, TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) ); -DEFINE_EVENT(str, move_extent_alloc_mem_fail, - TP_PROTO(struct bch_fs *c, const char *k), - TP_ARGS(c, k) +DEFINE_EVENT(str, move_extent_start_fail, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); TRACE_EVENT(move_data,