diff --git a/meson.build b/meson.build index 0a9f6901cd..74eedf5ff9 100644 --- a/meson.build +++ b/meson.build @@ -24,10 +24,12 @@ mandir = join_paths(prefixdir, get_option('mandir')) sbindir = join_paths(prefixdir, get_option('sbindir')) sysconfdir = join_paths(prefixdir, get_option('sysconfdir')) -udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir')) -dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir')) -systemddir = join_paths(prefixdir, get_option('systemddir')) -rundir = join_paths(prefixdir, get_option('rundir')) +udevrulesdir = join_paths(prefixdir, get_option('udevrulesdir')) +dracutrulesdir = join_paths(prefixdir, get_option('dracutrulesdir')) +dracutmodulesdir = join_paths(prefixdir, get_option('dracutmodulesdir')) +systemddir = join_paths(prefixdir, get_option('systemddir')) +rundir = join_paths(prefixdir, get_option('rundir')) +networkmanagerdir = join_paths(prefixdir, get_option('networkmanagerdir')) ############################################################################### conf = configuration_data() @@ -199,7 +201,8 @@ substs.set('NAME', meson.project_name()) substs.set('VERSION', meson.project_version()) substs.set('LICENSE', meson.project_license()[0]) substs.set('UDEVRULESDIR', udevrulesdir) -substs.set('DRACUTRILESDIR', dracutrulesdir) +substs.set('DRACUTRULESDIR', dracutrulesdir) +substs.set('DRACUTMODULESDIR', dracutmodulesdir) substs.set('REQUIRES', requires) substs.set('DATADIR', datadir) substs.set('MANDIR', mandir) @@ -207,6 +210,7 @@ substs.set('RUNDIR', rundir) substs.set('SBINDIR', sbindir) substs.set('SYSCONFDIR', sysconfdir) substs.set('SYSTEMDDIR', systemddir) +substs.set('NETWORKMANAGERDIR', networkmanagerdir) substs.set('SYSTEMCTL', get_option('systemctl')) configure_file( @@ -221,11 +225,11 @@ disc = configure_file( configuration: substs, ) -dracut_files = [ +dracut_conf_files = [ '70-nvmf-autoconnect.conf', ] -foreach file : dracut_files +foreach file : dracut_conf_files configure_file( input: 
'nvmf-autoconnect/dracut-conf/' + file + '.in', output: file, @@ -233,6 +237,47 @@ foreach file : dracut_files ) endforeach +want_dracut_module = get_option('dracut-module') +if want_dracut_module + dracut_nbft_files = [ + 'module-setup.sh', + 'nbft-boot-pre.service', + 'nbft-boot-connect.service' + ] + + foreach file : dracut_nbft_files + configure_file( + input: 'nvmf-autoconnect/dracut-95nbft/' + file + '.in', + output: file, + configuration: substs, + ) + endforeach + + networkmanager_conf_files = [ + '99-nvme-nbft-no-ignore-carrier.conf' + ] + + foreach file : networkmanager_conf_files + configure_file( + input: 'nvmf-autoconnect/NetworkManager/' + file + '.in', + output: file, + configuration: substs, + ) + endforeach + + networkmanager_dispatcher_files = [ + '99-nvme-nbft-connect.sh' + ] + + foreach file : networkmanager_dispatcher_files + configure_file( + input: 'nvmf-autoconnect/NetworkManager/' + file + '.in', + output: file, + configuration: substs, + ) + endforeach +endif + systemd_files = [ 'nvmefc-boot-connections.service', 'nvmf-autoconnect.service', @@ -315,11 +360,28 @@ install_data('completions/bash-nvme-completion.sh', install_data('completions/_nvme', install_dir: datadir + '/zsh/site-functions') -foreach file : dracut_files +foreach file : dracut_conf_files install_data(meson.current_build_dir() + '/' + file, install_dir: dracutrulesdir) endforeach +if want_dracut_module + foreach file : dracut_nbft_files + install_data(meson.current_build_dir() + '/' + file, + install_dir: join_paths(dracutmodulesdir, '95nbft')) + endforeach + + foreach file : networkmanager_conf_files + install_data(meson.current_build_dir() + '/' + file, + install_dir: join_paths(networkmanagerdir, 'conf.d')) + endforeach + + foreach file : networkmanager_dispatcher_files + install_data(meson.current_build_dir() + '/' + file, + install_dir: join_paths(networkmanagerdir, 'dispatcher.d')) + endforeach +endif + foreach file : systemd_files install_data(meson.current_build_dir() + '/' + file, install_dir: 
systemddir) @@ -343,8 +405,10 @@ if meson.version().version_compare('>=0.53.0') 'mandir': mandir, 'udevrulesdir': udevrulesdir, 'dracutrulesdir': dracutrulesdir, + 'dracutmodulesdir': dracutmodulesdir, 'rundir': rundir, 'systemddir': systemddir, + 'networkmanagerdir': networkmanagerdir, 'build location': meson.current_build_dir(), } summary(path_dict, section: 'Paths') @@ -353,8 +417,9 @@ if meson.version().version_compare('>=0.53.0') } summary(dep_dict, section: 'Dependencies') conf_dict = { - 'git version': conf.get('GIT_VERSION'), - 'pdc enabled': get_option('pdc-enabled'), + 'git version': conf.get('GIT_VERSION'), + 'pdc enabled': get_option('pdc-enabled'), + 'dracut module enabled': want_dracut_module } summary(conf_dict, section: 'Configuration') endif diff --git a/meson_options.txt b/meson_options.txt index c61dae0fff..21ee75db49 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -17,6 +17,12 @@ option( value : 'lib/dracut/dracut.conf.d/', description : 'directory for dracut rules files' ) +option( + 'dracutmodulesdir', + type : 'string', + value : 'lib/dracut/modules.d/', + description : 'dracut modules directory' +) option( 'htmldir', type : 'string', @@ -70,3 +76,15 @@ option( type : 'string', description : 'override the git version string' ) +option( + 'dracut-module', + type : 'boolean', + value : false, + description : 'Enable the 95nbft dracut module' +) +option( + 'networkmanagerdir', + type : 'string', + value : 'lib/NetworkManager/', + description : 'NetworkManager lib directory' +) diff --git a/nvme.spec.in b/nvme.spec.in index 43fc9303f3..284571492d 100644 --- a/nvme.spec.in +++ b/nvme.spec.in @@ -33,7 +33,7 @@ touch %{buildroot}@SYSCONFDIR@/nvme/hostid @UDEVRULESDIR@/65-persistent-net-nbft.rules @UDEVRULESDIR@/70-nvmf-autoconnect.rules @UDEVRULESDIR@/71-nvmf-netapp.rules -@DRACUTRILESDIR@/70-nvmf-autoconnect.conf +@DRACUTRULESDIR@/70-nvmf-autoconnect.conf @SYSTEMDDIR@/nvmf-connect@.service @SYSTEMDDIR@/nvmefc-boot-connections.service 
@SYSTEMDDIR@/nvmf-connect-nbft.service diff --git a/nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in b/nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in new file mode 100755 index 0000000000..1dc03243d0 --- /dev/null +++ b/nvmf-autoconnect/NetworkManager/99-nvme-nbft-connect.sh.in @@ -0,0 +1,5 @@ +#!/bin/bash + +if [[ "$1" == nbft* ]] && [[ "$2" == "up" ]]; then + systemctl start nvmf-connect-nbft.service +fi diff --git a/nvmf-autoconnect/NetworkManager/99-nvme-nbft-no-ignore-carrier.conf.in b/nvmf-autoconnect/NetworkManager/99-nvme-nbft-no-ignore-carrier.conf.in new file mode 100644 index 0000000000..4f1951589e --- /dev/null +++ b/nvmf-autoconnect/NetworkManager/99-nvme-nbft-no-ignore-carrier.conf.in @@ -0,0 +1,15 @@ +# Boot from NVMe over TCP (NBFT) +# +# For NVMe/TCP connections that provide namespaces containing rootfs +# it is crucial to react on carrier events and reconnect any missing +# NVMe/TCP connections as defined in the ACPI NBFT table. A custom +# /usr/lib/NetworkManager/dispatcher.d/99-nvme-nbft-connect.sh hook +# will respawn nvmf-connect-nbft.service on such occasion. + +[device-nbft-no-ignore-carrier] + +# only affects nbft0, nbft1, ... interfaces +match-device=interface-name:nbft* + +# react on link up/down events +ignore-carrier=no diff --git a/nvmf-autoconnect/dracut-95nbft/README.md b/nvmf-autoconnect/dracut-95nbft/README.md new file mode 100644 index 0000000000..57ef070a15 --- /dev/null +++ b/nvmf-autoconnect/dracut-95nbft/README.md @@ -0,0 +1,125 @@ +# The dracut 95nbft module + +Focused solely on providing the Boot from NVMe over TCP functionality, intended +to replace parts of the existing `95nvmf` dracut module. At the moment this all +depends on the recently added NetworkManager NBFT support, though the desire is +to support more network management frameworks in the future. 
+ +Related nvme-cli meson configure options: +* `-Ddracut-module` (default=false) - enables the 95nbft dracut module +* `-Ddracutmodulesdir` (default=`$prefix/lib/dracut/modules.d/`) +* `-Dnetworkmanagerdir` (default=`$prefix/lib/NetworkManager/`) + + +# The design + +(see [dracut.bootup(7)](https://man7.org/linux/man-pages/man7/dracut.bootup.7.html) +for the overall boot process flow) + +The boot process looks roughly as follows: +* `nbft-boot-pre.service` is run, creates udev network link files and tells + dracut to activate networking +* dracut runs `nm-initrd-generator` and starts the NetworkManager daemon +* `systemd-udev-trigger.service` renames the network interfaces +* `nm-wait-online-initrd.service` finishes, indicating networking is up and ready +* `nbft-boot-connect.service` initiates actual NVMe connections +* the dracut initqueue is waiting for specific block devices (rootfs) to appear + +Two major packages are responsible for this: the new nvme-cli dracut module and +the added NBFT support in NetworkManager. + +## The new dracut 95nbft module + +The dracut `module-setup.sh` only installs two systemd unit files sandwiched +between specific dracut phases, nothing else. By default the module is always +included in the initramfs unless _hostonly_ is requested, in which case the system +is tested for the presence of ACPI NBFT tables and the module is only included in +such a case. + +The systemd unit files are only run when the ACPI NBFT tables are present and +no `rd.nvmf.nonbft` kernel commandline argument was provided that would otherwise +instruct the boot process to skip the NBFT machinery. + +## nbft-boot-pre.service + +Calls the nvme-cli nbft plugin to generate network link files for each interface +found in all NBFT tables. The interface naming in the form of `nbftXhY` consists +of an ACPI NBFT table index (defaults to 0) and the specified HFI index. +In a typical scenario only `nbft0h1`, `nbft0h2`, `nbft1h1`, ... 
interfaces are +present, however it's up to the pre-OS driver to supply arbitrary indexes, +possibly leading to interface names skipping the order to something like +`nbft0h100` and `nbft99h123`. Compared to the old `95nvmf` dracut module +ordering, this new naming scheme is geared towards (semi-)stable predictable +network interface names. Keep in mind that the contents of the NBFT tables +are generated from scratch upon every system start and are not always persistent +between reboots. + +The network link files are then picked up by udev on trigger via +`systemd-udev-trigger.service` to apply the new interface names. + +For simplicity and for the time being this systemd unit replaces the traditional +dracut cmdline hook and adds the `rd.neednet=1` `cmdline.d` argument. + +## nm-initrd-generator NBFT support + +https://gitlab.freedesktop.org/NetworkManager/NetworkManager/-/merge_requests/2077 + +Executed before the NetworkManager daemon starts, the added NBFT support parses +the ACPI NBFT tables available and generates system connections. Only +referenced by MAC addresses, relying on udev to perform actual interface +renaming. + +The `nm-initrd-generator` doesn't link to `libnvme.so.1` but opens it through +`dlopen()` at runtime. This allows for smaller hostonly initramfs images in case +the NBFT tables are not present in the system. The library is being pulled in +indirectly through the dracut module's requirement of nvme-cli. The +`rd.nvmf.nonbft` kernel commandline argument is respected as well. + +## nbft-boot-connect.service + +Modprobes required modules (`nvme-fabrics`) first. + +Performs actual NVMe connections by calling `nvme connect-all --nbft`. The +nvme-cli code has been modified to return a non-zero return code in case one +or more SSNS records fail to connect (except those marked as _'unavailable'_ +by the pre-OS driver), resulting in the service startup failure with a defined +respawn interval of 10 seconds (TBD). 
This ensures multiple connection attempts while +NetworkManager reacts to link events in the background and the dracut initqueue +eagerly waits for new block devices to appear, to be scanned and mounted. Once +the required block device appears, the wait cycle is ended and the system +continues booting, stopping any queued `nbft-boot-connect.service` respawns +seamlessly. + +The difference from the old dracut `95nvmf` module is that the nvme connection +attempts are not driven by network link up events but have a fixed respawn +interval. This may potentially help the cases where the NIC is slow to +initialize, reports link up yet it takes another 5+ seconds before it's fully +able to send/receive packets. We've seen this issue with some 25Gb NICs. + + +# The post-switchroot boot flow + +## nvmf-connect-nbft.service + +This unit is supposed to run once the `network-online.target` has been reached +and calls `nvme connect-all --nbft` again. This ensures an additional connection +attempt for records that failed to connect in the initramfs phase. As long as +this call matches existing connections and skips SSNS records that have been +already connected, in an ideal case this would result in a no-op. This is +mostly a one-shot service run in NetworkManager based distros since the target +typically stays reached until reboot. + +## NetworkManager dispatcher hooks + +The nvme-cli package installs a custom NetworkManager dispatcher service hook +(`99-nvme-nbft-connect.sh`) that just restarts `nvmf-connect-nbft.service` on +_link up_ events on `nbft*` interfaces. At the time the hook runs the interface +in question has been fully configured by NetworkManager. This ensures further +reconnection attempts in multipath scenarios where a network interface just came +alive. This is designed as a secondary measure with the kernel nvme host driver +connection recovery being the primary mechanism. 
+ +In order to make link events work properly the `nbft*` interfaces need to be set +not to ignore carrier events. This is done through a custom override snippet +(`99-nvme-nbft-no-ignore-carrier.conf`) as some distributions may opt to follow +legacy server networking behaviour (see the `NetworkManager-config-server` package). diff --git a/nvmf-autoconnect/dracut-95nbft/module-setup.sh.in b/nvmf-autoconnect/dracut-95nbft/module-setup.sh.in new file mode 100755 index 0000000000..32431052f3 --- /dev/null +++ b/nvmf-autoconnect/dracut-95nbft/module-setup.sh.in @@ -0,0 +1,49 @@ +#!/usr/bin/bash + +has_nbft() { + local f found= + for f in /sys/firmware/acpi/tables/NBFT*; do + [ -f "$f" ] || continue + found=1 + break + done + [[ $found ]] +} + +# called by dracut +check() { + require_binaries nvme || return 1 + + [[ $hostonly ]] || [[ $mount_needs ]] && { + if ! has_nbft; then + echo "No ACPI NBFT tables present in the system" + return 255 + fi + } + return 0 +} + +# called by dracut +depends() { + echo bash rootfs-block network + return 0 +} + +# called by dracut +installkernel() { + hostonly="" instmods nvme_tcp nvme_fabrics 8021q +} + +# called by dracut +install() { + inst_multiple nvme + + # TODO: /etc/nvme/hostnqn + + for i in \ + nbft-boot-pre.service \ + nbft-boot-connect.service; do + inst_simple "${moddir}/$i" "${systemdsystemunitdir}/$i" + $SYSTEMCTL -q --root "$initdir" enable $i + done +} diff --git a/nvmf-autoconnect/dracut-95nbft/nbft-boot-connect.service.in b/nvmf-autoconnect/dracut-95nbft/nbft-boot-connect.service.in new file mode 100644 index 0000000000..dbdf2bd17c --- /dev/null +++ b/nvmf-autoconnect/dracut-95nbft/nbft-boot-connect.service.in @@ -0,0 +1,26 @@ +# This unit parses the ACPI NBFT table and performs actual NVMe/TCP connections +[Unit] +# Prevent deadlock by avoiding systemd adding dependencies +# on unreachable targets. 
+DefaultDependencies=no
+# Specify 'rd.nvmf.nonbft' to disable NBFT boot
+ConditionKernelCommandLine=!rd.nvmf.nonbft
+ConditionPathExistsGlob=/sys/firmware/acpi/tables/NBFT*
+Wants=network-online.target
+After=network-online.target
+Before=remote-fs-pre.target
+StartLimitIntervalSec=0
+StartLimitBurst=0
+
+[Service]
+RestrictAddressFamilies=AF_INET AF_INET6
+Type=oneshot
+# Connection attempt recovery
+Restart=on-failure
+RestartSec=10
+# No 'modprobe@.service' available in the initramfs
+ExecStartPre=-/sbin/modprobe nvme-fabrics
+ExecStart=/usr/sbin/nvme connect-all --nbft
+
+[Install]
+WantedBy=sysinit.target
diff --git a/nvmf-autoconnect/dracut-95nbft/nbft-boot-pre.service.in b/nvmf-autoconnect/dracut-95nbft/nbft-boot-pre.service.in
new file mode 100644
index 0000000000..e2d334f4f6
--- /dev/null
+++ b/nvmf-autoconnect/dracut-95nbft/nbft-boot-pre.service.in
@@ -0,0 +1,20 @@
+# This unit checks for the ACPI NBFT table presence and tells
+# dracut to activate networking.
+[Unit]
+DefaultDependencies=no
+# Specify 'rd.nvmf.nonbft' to disable NBFT boot
+ConditionKernelCommandLine=!rd.nvmf.nonbft
+ConditionPathExistsGlob=/sys/firmware/acpi/tables/NBFT*
+Before=dracut-cmdline.service
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+ExecStartPre=-/usr/bin/mkdir -p /etc/systemd/network
+# generate udev link files
+ExecStart=/usr/sbin/nvme nbft gen-udev-link-files /etc/systemd/network/
+# tell dracut to start the network
+ExecStart=bash -c 'echo rd.neednet=1 > /etc/cmdline.d/95nbft-args.conf'
+
+[Install]
+WantedBy=sysinit.target
diff --git a/plugins/nbft/nbft-plugin.c b/plugins/nbft/nbft-plugin.c
index f25941a5db..32f4a3468b 100644
--- a/plugins/nbft/nbft-plugin.c
+++ b/plugins/nbft/nbft-plugin.c
@@ -571,3 +571,125 @@ int show_nbft(int argc, char **argv, struct command *cmd, struct plugin *plugin)
 	}
 	return ret;
 }
+
+/* refer to systemd.link(5) */
+#define LINK_FILE_NAME_FORMAT "10-nbft-iface-%s.link"
+
+/*
+ * Write a systemd.link(5) file into 'path' that matches the HFI by its MAC
+ * address and names the interface nbftXhY, X being derived from the NBFT
+ * table file name 'table_filename' and Y being the HFI index.
+ * Returns nothing, failures are not propagated to the caller.
+ */
+static void write_network_link_file(struct nbft_info_hfi *hfi, char *table_filename, char *path)
+{
+	_cleanup_free_ char *bname = NULL;
+	char *table_id;
+	char *s;
+	_cleanup_free_ char *ifname = NULL;
+	_cleanup_free_ char *link_fname = NULL;
+	FILE *f;
+
+	/* basename() may modify its argument, work on a private copy */
+	bname = strdup(table_filename);
+	if (!bname)
+		return;
+	table_id = basename(bname);
+	if (!table_id || !strstr(table_id, "NBFT")) {
+		fprintf(stderr, "Warning: unable to determine NBFT table suffix: %s\n",
+			table_filename);
+		return;
+	}
+
+	/* Proposed predictable interface naming (nbftXhY):
+	 *   'NBFT'    --> 'nbft0'
+	 *   'NBFT1'   --> 'nbft1'
+	 *   'NBFT999' --> 'nbft999'
+	 *   'NBFT0'   --> 'nbft00'
+	 *   'NBFT01'  --> 'nbft001'
+	 *   'NBFT099' --> 'nbft0099'
+	 */
+	s = strstr(table_id, "NBFT") + 4;
+	if (*s == '\0') {
+		if (asprintf(&ifname, "nbft0h%d", hfi->index) < 0)
+			return;
+	} else if (*s == '0') {
+		if (asprintf(&ifname, "nbft0%sh%d", s, hfi->index) < 0)
+			return;
+	} else {
+		if (asprintf(&ifname, "nbft%sh%d", s, hfi->index) < 0)
+			return;
+	}
+
+	/* write the file */
+	if (asprintf(&link_fname, "%s/" LINK_FILE_NAME_FORMAT, path, ifname) < 0)
+		return;
+	f = fopen(link_fname, "w");
+	if (!f) {
+		fprintf(stderr, "Error writing file %s: %m\n", link_fname);
+		return;
+	}
+	fprintf(f, "# Generated by nvme-cli nbft gen-udev-link-files\n");
+	fprintf(f, "[Match]\n");
+	fprintf(f, "MACAddress=%02x:%02x:%02x:%02x:%02x:%02x\n",
+		hfi->tcp_info.mac_addr[0],
+		hfi->tcp_info.mac_addr[1],
+		hfi->tcp_info.mac_addr[2],
+		hfi->tcp_info.mac_addr[3],
+		hfi->tcp_info.mac_addr[4],
+		hfi->tcp_info.mac_addr[5]);
+	fprintf(f, "\n[Link]\n");
+	fprintf(f, "Name=%s\n", ifname);
+	/* buffered write errors surface when the stream is closed */
+	if (fclose(f))
+		fprintf(stderr, "Error writing file %s: %m\n", link_fname);
+}
+
+/*
+ * Handler of the 'nvme nbft gen-udev-link-files' command: reads the NBFT
+ * tables from --nbft-path (NBFT_SYSFS_PATH by default) and writes a link file
+ * for every HFI into the directory given as the first non-option argument.
+ * Returns -1 if that argument is missing, the parse/read result otherwise.
+ */
+int gen_udev_link_files(int argc, char **argv, struct command *cmd, struct plugin *plugin)
+{
+	const char *desc = "Generate udev network link files.";
+	char *nbft_path = NBFT_SYSFS_PATH;
+	unsigned int verbose = 0;
+	int ret;
+	struct list_head nbft_list;
+	struct nbft_file_entry *entry = NULL;
+	struct nbft_info_hfi **hfi;
+
+	OPT_ARGS(opts) = {
+		OPT_STRING("nbft-path", 0, "STR", &nbft_path, "user-defined path for NBFT tables"),
+		OPT_INCR("verbose", 'v', &verbose, "Increase logging verbosity"),
+		OPT_END()
+	};
+
+	/* TODO: find a way to avoid adding '' option */
+	ret = argconfig_parse(argc, argv, desc, opts);
+	if (ret)
+		return ret;
+
+	log_level = map_log_level(verbose, false /* quiet */);
+	nvme_init_default_logging(stderr, log_level, false, false);
+
+	if (optind >= argc) {
+		fprintf(stderr, "Fatal: missing target udev network link dir\n\n");
+		fprintf(stderr, "Usage: nvme nbft gen-udev-link-files [OPTIONS] TARGET_DIR\n");
+		return -1;
+	}
+
+	list_head_init(&nbft_list);
+	ret = read_nbft_files(&nbft_list, nbft_path);
+	if (!ret) {
+		list_for_each(&nbft_list, entry, node) {
+			for (hfi = entry->nbft->hfi_list; hfi && *hfi; hfi++)
+				write_network_link_file(*hfi, entry->nbft->filename, argv[optind]);
+		}
+	}
+	free_nbfts(&nbft_list);
+
+	return ret;
+}
diff --git a/plugins/nbft/nbft-plugin.h b/plugins/nbft/nbft-plugin.h
index 018349d961..2c3c97a17a 100644
--- a/plugins/nbft/nbft-plugin.h
+++ b/plugins/nbft/nbft-plugin.h
@@ -10,6 +10,8 @@
 PLUGIN(NAME("nbft", "ACPI NBFT table extensions", NVME_VERSION),
 	COMMAND_LIST(
 		ENTRY("show", "Show contents of ACPI NBFT tables", show_nbft)
+		ENTRY("gen-udev-link-files", "Generate udev network link files",
+		      gen_udev_link_files)
 	)
 );
 