From d0f753ea3986caf4d21d82629a542f0897427ae6 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 5 Dec 2024 03:10:29 +0500 Subject: [PATCH] Mav feedback In progress --- include/sys/zvol_impl.h | 3 + module/os/linux/zfs/zvol_os.c | 79 +++++++++++++----------- module/zfs/zvol.c | 109 +++++++++++++++++++++++++++++++++- 3 files changed, 156 insertions(+), 35 deletions(-) diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index 3cd0d78c353d..55021a080076 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -83,6 +83,9 @@ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len); void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit); +void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, + uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, + size_t nbps); int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio); int zvol_init_impl(void); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 08f3237b6be5..cdbdcd0658c3 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -23,7 +23,11 @@ * Copyright (c) 2024, Rob Norris * Copyright (c) 2024, Klara, Inc. */ - +/* + * TODO Mav feedback: driver_private correction, zvol_setup_copy_offload() + * why setup, offload, lock. + * Review again: suspend_lock, zil, rangelock, end_bio. + */ #include #include #include @@ -519,7 +523,7 @@ static void zvol_setup_copy_offload(zv_request_t *zvr) blkptr_t *bps; size_t maxblocks; uint64_t inoff, outoff, len = 0; - int error = 0, seg = 1; + int error = EINVAL, seg = 1; memset(&uio_src, 0, sizeof (zfs_uio_t)); memset(&uio_dst, 0, sizeof (zfs_uio_t)); @@ -534,8 +538,7 @@ static void zvol_setup_copy_offload(zv_request_t *zvr) bio->bi_private; zfs_uio_bvec_init(&uio_src, bio, NULL); if (len != bio->bi_iter.bi_size) { - rw_exit(&zv_src->zv_suspend_lock); - zvol_end_io(bio, req, -SET_ERROR(error)); + zvol_end_io(NULL, req, -SET_ERROR(error)); return; } if (offload_io && offload_io->driver_private) @@ -548,12 +551,28 @@ static void zvol_setup_copy_offload(zv_request_t *zvr) } if (!zv_src || !zv_dst) { - rw_exit(&zv_src->zv_suspend_lock); - zvol_end_io(bio, req, -SET_ERROR(error)); + zvol_end_io(NULL, req, -SET_ERROR(error)); return; } + + rw_enter(&zv_dst->zv_suspend_lock, RW_READER); + /* + * ZIL Lock + */ + if (zv_dst->zv_zilog == NULL) { + rw_exit(&zv_dst->zv_suspend_lock); + rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); + if (zv_dst->zv_zilog == NULL) { + zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, + zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); + zv_dst->zv_flags |= ZVOL_WRITTEN_TO; + VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & + ZIL_REPLAY_NEEDED)); + } + rw_downgrade(&zv_dst->zv_suspend_lock); + } if (zv_src != zv_dst) - rw_enter(&zv_dst->zv_suspend_lock, RW_READER); + rw_enter(&zv_src->zv_suspend_lock, RW_READER); inoff = uio_src.uio_loffset; outoff = uio_dst.uio_loffset; @@ -632,30 +651,25 @@ static void zvol_setup_copy_offload(zv_request_t *zvr) goto out; } - /* - * ZIL Lock - */ - if (zv_dst->zv_zilog == NULL) { - rw_exit(&zv_dst->zv_suspend_lock); - rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); - if (zv_dst->zv_zilog == NULL) { - zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, - zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); - zv_dst->zv_flags |= ZVOL_WRITTEN_TO; - VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & - ZIL_REPLAY_NEEDED)); - } - rw_downgrade(&zv_dst->zv_suspend_lock); - } - zilog_dst = zv_dst->zv_zilog; maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / sizeof (bps[0]); bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); - inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, - RL_READER); - outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, - RL_WRITER); + /* + * Maintain predictable lock order. + */ + if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) { + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + } else { + outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, + RL_WRITER); + inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, + RL_READER); + } + while (len > 0) { uint64_t size, last_synced_txg; size_t nbps = maxblocks; @@ -693,7 +707,7 @@ static void zvol_setup_copy_offload(zv_request_t *zvr) dmu_tx_commit(tx); break; } - zfs_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, NULL, outoff, + zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff, size, zv_src->zv_volblocksize, bps, nbps); dmu_tx_commit(tx); inoff += size; @@ -708,10 +722,9 @@ static void zvol_setup_copy_offload(zv_request_t *zvr) } out: if (zv_src != zv_dst) - rw_exit(&zv_dst->zv_suspend_lock); - - rw_exit(&zv_src->zv_suspend_lock); - zvol_end_io(bio, req, -SET_ERROR(error)); + rw_exit(&zv_src->zv_suspend_lock); + rw_exit(&zv_dst->zv_suspend_lock); + zvol_end_io(NULL, req, -SET_ERROR(error)); } static void @@ -787,8 +800,6 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, zvol_end_io(bio, rq, -SET_ERROR(EROFS)); goto out; } - rw_enter(&zv->zv_suspend_lock, RW_READER); - if (force_sync) { zvol_setup_copy_offload(&zvr); } else { diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 78bf714170d2..3718ec3a81e9 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -516,6 +516,67 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } +/* + * Replay a TX_CLONE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_clone_range_t *lr = arg2; + objset_t *os = zv->zv_objset; + dmu_tx_t *tx; + int error; + uint64_t blksz; + uint64_t off; + uint64_t len; + + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(spa_feature_is_enabled(dmu_objset_spa(os), + SPA_FEATURE_BLOCK_CLONING)); + + off= lr->lr_offset; + len= lr->lr_length; + blksz = lr->lr_blksz; + + if ((off % blksz) != 0) { + return (SET_ERROR(EINVAL)); + } + + tx = dmu_tx_create(os); + error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); + if (error != 0 || !zv->zv_dn) { + dmu_tx_abort(tx); + return (error); + } + dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + goto out; + } + dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len, + tx, lr->lr_bps, lr->lr_nbps); + if (error != 0) { + dmu_tx_commit(tx); + goto out; + } + zil_replaying(zv->zv_zilog, tx); + dmu_tx_commit(tx); + +out: + dnode_rele(zv->zv_dn, zv); + zv->zv_dn = NULL; + return (error); +} + static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -540,7 +601,9 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL_V0 */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ @@ -550,7 +613,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_err, /* TX_CLONE_RANGE */ + zvol_replay_clone_range, /* TX_CLONE_RANGE */ }; /* @@ -625,6 +688,50 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, } } +/* + * Handles TX_CLONE_RANGE transactions. + */ +void +zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, + uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) +{ + itx_t *itx; + lr_clone_range_t *lr; + uint64_t partlen, max_log_data; + size_t partnbps; + + if (zil_replaying(zilog, tx)) + return; + + max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); + + while (nbps > 0) { + partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); + partlen = partnbps * blksz; + ASSERT3U(partlen, <, len + blksz); + partlen = MIN(partlen, len); + + itx = zil_itx_create(txtype, + sizeof (*lr) + sizeof (bps[0]) * partnbps); + lr = (lr_clone_range_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = partlen; + lr->lr_blksz = blksz; + lr->lr_nbps = partnbps; + memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); + + zil_itx_assign(zilog, itx, tx); + + bps += partnbps; + ASSERT3U(nbps, >=, partnbps); + nbps -= partnbps; + off += partlen; + ASSERT3U(len, >=, partlen); + len -= partlen; + } +} + /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */