diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 1676020d04d3..89ee30014c6c 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -378,6 +378,7 @@ typedef enum { VDEV_PROP_TRIM_SUPPORT, VDEV_PROP_TRIM_ERRORS, VDEV_PROP_SLOW_IOS, + VDEV_PROP_QUEUE_IO, VDEV_NUM_PROPS } vdev_prop_t; diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index abd66b8abc96..dbb98680e1bb 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -467,6 +467,7 @@ struct vdev { uint64_t vdev_io_t; uint64_t vdev_slow_io_n; uint64_t vdev_slow_io_t; + uint64_t vdev_queue_io; }; #define VDEV_PAD_SIZE (8 << 10) diff --git a/include/sys/zio.h b/include/sys/zio.h index 46f5d68aed4a..6fe01b5a555e 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -209,24 +209,25 @@ typedef uint64_t zio_flag_t; #define ZIO_FLAG_TRYHARD (1ULL << 17) #define ZIO_FLAG_OPTIONAL (1ULL << 18) #define ZIO_FLAG_DIO_READ (1ULL << 19) +#define ZIO_FLAG_DIO_WRITE (1ULL << 20) #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. 
*/ -#define ZIO_FLAG_DONT_QUEUE (1ULL << 20) /* must be first for INHERIT */ -#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 21) -#define ZIO_FLAG_IO_BYPASS (1ULL << 22) -#define ZIO_FLAG_IO_REWRITE (1ULL << 23) -#define ZIO_FLAG_RAW_COMPRESS (1ULL << 24) -#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 25) -#define ZIO_FLAG_GANG_CHILD (1ULL << 26) -#define ZIO_FLAG_DDT_CHILD (1ULL << 27) -#define ZIO_FLAG_GODFATHER (1ULL << 28) -#define ZIO_FLAG_NOPWRITE (1ULL << 29) -#define ZIO_FLAG_REEXECUTED (1ULL << 30) -#define ZIO_FLAG_DELEGATED (1ULL << 31) -#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32) +#define ZIO_FLAG_DONT_QUEUE (1ULL << 21) /* must be first for INHERIT */ +#define ZIO_FLAG_DONT_PROPAGATE (1ULL << 22) +#define ZIO_FLAG_IO_BYPASS (1ULL << 23) +#define ZIO_FLAG_IO_REWRITE (1ULL << 24) +#define ZIO_FLAG_RAW_COMPRESS (1ULL << 25) +#define ZIO_FLAG_RAW_ENCRYPT (1ULL << 26) +#define ZIO_FLAG_GANG_CHILD (1ULL << 27) +#define ZIO_FLAG_DDT_CHILD (1ULL << 28) +#define ZIO_FLAG_GODFATHER (1ULL << 29) +#define ZIO_FLAG_NOPWRITE (1ULL << 30) +#define ZIO_FLAG_REEXECUTED (1ULL << 31) +#define ZIO_FLAG_DELEGATED (1ULL << 32) +#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 33) #define ZIO_ALLOCATOR_NONE (-1) #define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE) diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index ac9ae233c72d..5317417f7f57 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -5916,7 +5916,8 @@ - + + diff --git a/man/man7/vdevprops.7 b/man/man7/vdevprops.7 index 34d4026b1009..7595c5f2aa19 100644 --- a/man/man7/vdevprops.7 +++ b/man/man7/vdevprops.7 @@ -156,6 +156,9 @@ If this device should perform new allocations, used to disable a device when it is scheduled for later removal. See .Xr zpool-remove 8 . +.It Sy queue_io +Whether I/O issued to this vdev is added to the vdev queue. +When set to off, direct I/O reads and writes bypass the queue, which can sometimes improve performance.
.El .Ss User Properties In addition to the standard native properties, ZFS supports arbitrary user diff --git a/module/zcommon/zfs_valstr.c b/module/zcommon/zfs_valstr.c index 43bccea14a85..ba5ed5336e42 100644 --- a/module/zcommon/zfs_valstr.c +++ b/module/zcommon/zfs_valstr.c @@ -207,6 +207,7 @@ _VALSTR_BITFIELD_IMPL(zio_flag, { '.', "TH", "TRYHARD" }, { '.', "OP", "OPTIONAL" }, { '.', "RD", "DIO_READ" }, + { '.', "WD", "DIO_WRITE" }, { '.', "DQ", "DONT_QUEUE" }, { '.', "DP", "DONT_PROPAGATE" }, { '.', "BY", "IO_BYPASS" }, diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index d3355730ba3d..c8831753dff9 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -466,6 +466,9 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_QUEUE_IO, "queue_io", 1, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "QUEUE_IO", + boolean_table, sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 40b78b519f49..a12b0422de43 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -195,7 +195,8 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) zio_t *zio = zio_write(pio, os->os_spa, txg, bp, data, db->db.db_size, db->db.db_size, &zp, dmu_write_direct_ready, NULL, dmu_write_direct_done, dsa, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DIO_WRITE, + &zb); if (pio == NULL) return (zio_wait(zio)); diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 983f444d79b0..f648babf7ae7 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -704,6 +704,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_slow_io_n = 
vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N); vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T); + vd->vdev_queue_io = vdev_prop_default_numeric(VDEV_PROP_QUEUE_IO); + list_link_init(&vd->vdev_config_dirty_node); list_link_init(&vd->vdev_state_dirty_node); list_link_init(&vd->vdev_initialize_node); @@ -6053,6 +6055,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_slow_io_t = intval; break; + case VDEV_PROP_QUEUE_IO: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + if (vd->vdev_ops->vdev_op_leaf) { + vd->vdev_queue_io = intval; + } + break; default: /* Most processing is done in vdev_props_set_sync */ break; @@ -6416,6 +6427,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) case VDEV_PROP_IO_T: case VDEV_PROP_SLOW_IO_N: case VDEV_PROP_SLOW_IO_T: + case VDEV_PROP_QUEUE_IO: err = vdev_prop_get_int(vd, prop, &intval); if (err && err != ENOENT) break; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 092b3f375be0..d2d29106fa26 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -946,6 +946,12 @@ vdev_queue_io(zio_t *zio) zio->io_flags |= ZIO_FLAG_DONT_QUEUE; zio->io_timestamp = gethrtime(); + if (!zio->io_vd->vdev_queue_io && + zio->io_flags & (ZIO_FLAG_DIO_READ | ZIO_FLAG_DIO_WRITE)) { + zio->io_queue_state = ZIO_QS_NONE; + return (zio); + } + mutex_enter(&vq->vq_lock); vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq); @@ -978,6 +984,10 @@ vdev_queue_io_done(zio_t *zio) vq->vq_io_complete_ts = now; vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; + if (zio->io_queue_state == ZIO_QS_NONE) { + return; + } + mutex_enter(&vq->vq_lock); vdev_queue_pending_remove(vq, zio); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index fc4adc42d00a..3460b30fd4e9 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -544,7 +544,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub'] 
[tests/functional/cli_root/zpool_set] tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', 'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos', - 'user_property_001_pos', 'user_property_002_neg'] + 'user_property_001_pos', 'user_property_002_neg', 'vdev_set_queue_io'] tags = ['functional', 'cli_root', 'zpool_set'] [tests/functional/cli_root/zpool_split] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 7d1551a63f0d..56cceb179179 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1233,6 +1233,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_set/setup.ksh \ functional/cli_root/zpool/setup.ksh \ functional/cli_root/zpool_set/vdev_set_001_pos.ksh \ + functional/cli_root/zpool_set/vdev_set_queue_io.ksh \ functional/cli_root/zpool_set/zpool_set_common.kshlib \ functional/cli_root/zpool_set/zpool_set_001_pos.ksh \ functional/cli_root/zpool_set/zpool_set_002_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg index 6cfa7eaf7514..0650cebb6b4a 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg @@ -75,4 +75,5 @@ typeset -a properties=( trim_support trim_errors slow_ios + queue_io ) diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_queue_io.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_queue_io.ksh new file mode 100755 index 000000000000..6f7d967fce13 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_set/vdev_set_queue_io.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. 
+# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Triad National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Toggling vdev queue_io property while reading from vdev should not cause panic. +# +# STRATEGY: +# 1. Create a zpool +# 2. Write a file to the pool, while also toggling the queue_io property on / off. +# 3. Start reading from file, while also toggling the queue_io property on / off. +# + +verify_runnable "global" + +command -v fio > /dev/null || log_unsupported "fio missing" +log_must save_tunable DIO_ENABLED +log_must set_tunable32 DIO_ENABLED 1 + +function toggle_queue_io +{ + zpool set queue_io=off $TESTPOOL1 $FILEDEV + sleep 0.1 + zpool set queue_io=on $TESTPOOL1 $FILEDEV + sleep 0.1 +} + +function cleanup +{ + log_must destroy_pool $TESTPOOL1 + rm -f $FILEDEV + log_must restore_tunable DIO_ENABLED +} + +log_assert "Toggling vdev queue_io property while reading from vdev should not cause panic" +log_onexit cleanup + +# 1. Create a pool + +FILEDEV="$TEST_BASE_DIR/filedev.$$" +log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV +log_must create_pool $TESTPOOL1 $FILEDEV + +mntpnt=$(get_prop mountpoint $TESTPOOL1) + +# 2. Write a file to the pool, while also toggling the queue_io property on / off.
+ +log_must eval "fio --filename=$mntpnt/foobar --name=write-file \ + --rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \ + --ioengine=sync --runtime=10 &" + +ITERATIONS=30 + +for i in $(seq $ITERATIONS); do + log_must toggle_queue_io +done; +wait + +# 3. Start reading from file, while also toggling the queue_io property on / off. + +log_must eval "fio --filename=$mntpnt/foobar --name=read-file \ + --rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \ + --ioengine=sync --time_based --runtime=10 &" + +for i in $(seq $ITERATIONS); do + log_must toggle_queue_io +done; +wait + +log_pass "Toggling vdev queue_io property while reading from vdev does not cause panic"