diff --git a/docs/release-notes.md b/docs/release-notes.md index e53bdf8d2..cc78bad85 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -17,6 +17,7 @@ nav_order: 9 ### Bug fixes +- Prevent races with udev after disk editing ## Ignition 2.16.2 (2023-07-12) diff --git a/internal/exec/stages/disks/disks.go b/internal/exec/stages/disks/disks.go index a5307fc7a..880fb3b60 100644 --- a/internal/exec/stages/disks/disks.go +++ b/internal/exec/stages/disks/disks.go @@ -23,6 +23,7 @@ import ( "errors" "fmt" "os/exec" + "path/filepath" "github.com/coreos/ignition/v2/config/v3_5_experimental/types" "github.com/coreos/ignition/v2/internal/distro" @@ -106,35 +107,39 @@ func (s stage) Run(config types.Config) error { return fmt.Errorf("failed to create filesystems: %v", err) } - // udevd registers an IN_CLOSE_WRITE inotify watch on block device - // nodes, and synthesizes udev "change" events when the watch fires. - // mkfs.btrfs triggers multiple such events, the first of which - // occurs while there is no recognizable filesystem on the - // partition. Thus, if an existing partition is reformatted as - // btrfs while keeping the same filesystem label, there will be a - // synthesized uevent that deletes the /dev/disk/by-label symlink - // and a second one that restores it. If we didn't account for this, - // a systemd unit that depended on the by-label symlink (e.g. - // systemd-fsck-root.service) could have the symlink deleted out - // from under it. - // - // There's no way to fix this completely. We can't wait for the - // restoring uevent to propagate, since we can't determine which - // specific uevents were triggered by the mkfs. We can wait for - // udev to settle, though it's conceivable that the deleting uevent - // has already been processed and the restoring uevent is still - // sitting in the inotify queue. In practice the uevent queue will - // be the slow one, so this should be good enough. - // - // Test case: boot failure in coreos.ignition.*.btrfsroot kola test. - // - // Additionally, partitioning (and possibly creating raid) suffers - // the same problem. To be safe, always settle. - if _, err := s.Logger.LogCmd( - exec.Command(distro.UdevadmCmd(), "settle"), - "waiting for udev to settle", - ); err != nil { - return fmt.Errorf("udevadm settle failed: %v", err) + return nil +} + +// waitForUdev triggers a tagged event and waits for it to bubble up +// again. This ensures that udev processed the device changes. +// The requirement is that the used device path exists and itself is +// not recreated by udev seeing the changes done. Thus, resolve a +// /dev/disk/by-something/X symlink before performing the device +// changes (i.e., pass /run/ignition/dev_aliases/X) and, e.g., don't +// call it for a partition but the full disk if you modified the +// partition table. +func (s stage) waitForUdev(dev string) error { + // Resolve the original /dev/ABC entry because udevadm wants + // this as argument instead of a symlink like + // /run/ignition/dev_aliases/X. + devPath, err := filepath.EvalSymlinks(dev) + if err != nil { + return fmt.Errorf("failed to resolve device alias %q: %v", dev, err) + } + // By triggering our own event and waiting for it we know that udev + // will have processed the device changes, a bare "udevadm settle" + // is prone to races with the inotify queue. We expect the /dev/DISK + // entry to exist because this function is either called for the full + // disk and only the /dev/DISKpX partition entries will change, or the + // function is called for a partition where the contents changed and + // nothing causes the kernel/udev to reread the partition table and + // recreate the /dev/DISKpX entries. If that was the case best we could + // do here is to add a retry loop (and relax the function comment). + _, err = s.Logger.LogCmd( + exec.Command(distro.UdevadmCmd(), "trigger", "--settle", + devPath), "waiting for triggered uevent") + if err != nil { + return fmt.Errorf("udevadm trigger failed: %v", err) } return nil diff --git a/internal/exec/stages/disks/filesystems.go b/internal/exec/stages/disks/filesystems.go index 6ba872e99..d33c11537 100644 --- a/internal/exec/stages/disks/filesystems.go +++ b/internal/exec/stages/disks/filesystems.go @@ -210,6 +210,29 @@ func (s stage) createFilesystem(fs types.Filesystem) error { return fmt.Errorf("mkfs failed: %v", err) } + // udevd registers an IN_CLOSE_WRITE inotify watch on block device + // nodes, and synthesizes udev "change" events when the watch fires. + // mkfs.btrfs triggers multiple such events, the first of which + // occurs while there is no recognizable filesystem on the + // partition. Thus, if an existing partition is reformatted as + // btrfs while keeping the same filesystem label, there will be a + // synthesized uevent that deletes the /dev/disk/by-label symlink + // and a second one that restores it. If we didn't account for this, + // a systemd unit that depended on the by-label symlink (e.g. + // systemd-fsck-root.service) could have the symlink deleted out + // from under it. + // Trigger a tagged uevent and wait for it because a bare "udevadm + // settle" does not guarantee that all changes were processed + // because it's conceivable that only the deleting uevent has + // already been processed (or none!) while the restoring uevent + // is still sitting in the inotify queue. By triggering our own + // event and waiting for it we know that udev will have processed + // the device changes. + // Test case: boot failure in coreos.ignition.*.btrfsroot kola test. + if err := s.waitForUdev(devAlias); err != nil { + return fmt.Errorf("failed to wait for udev on %q after formatting: %v", devAlias, err) + } + return nil } diff --git a/internal/exec/stages/disks/luks.go b/internal/exec/stages/disks/luks.go index 42f930419..5117234e3 100644 --- a/internal/exec/stages/disks/luks.go +++ b/internal/exec/stages/disks/luks.go @@ -368,6 +368,14 @@ func (s *stage) createLuks(config types.Config) error { } delete(s.State.LuksPersistKeyFiles, luks.Name) } + + // It's best to wait here for the /dev/disk/by-*/X entries to be + // (re)created, not only for other parts of the initramfs but + // also because s.waitOnDevices() can still race with udev's + // disk entry recreation. + if err := s.waitForUdev(devAlias); err != nil { + return fmt.Errorf("failed to wait for udev on %q after LUKS: %v", devAlias, err) + } } return nil diff --git a/internal/exec/stages/disks/partitions.go b/internal/exec/stages/disks/partitions.go index 839536dbf..a21fc0477 100644 --- a/internal/exec/stages/disks/partitions.go +++ b/internal/exec/stages/disks/partitions.go @@ -394,5 +394,14 @@ func (s stage) partitionDisk(dev types.Disk, devAlias string) error { if err := op.Commit(); err != nil { return fmt.Errorf("commit failure: %v", err) } + + // It's best to wait here for the /dev/ABC entries to be + // (re)created, not only for other parts of the initramfs but + // also because s.waitOnDevices() can still race with udev's + // partition entry recreation. + if err := s.waitForUdev(devAlias); err != nil { + return fmt.Errorf("failed to wait for udev on %q after partitioning: %v", devAlias, err) + } + return nil } diff --git a/internal/exec/stages/disks/raid.go b/internal/exec/stages/disks/raid.go index fad26e883..795dd42bd 100644 --- a/internal/exec/stages/disks/raid.go +++ b/internal/exec/stages/disks/raid.go @@ -22,6 +22,7 @@ package disks import ( "fmt" "os/exec" + "strings" "github.com/coreos/ignition/v2/config/v3_5_experimental/types" "github.com/coreos/ignition/v2/internal/distro" @@ -78,6 +79,17 @@ func (s stage) createRaids(config types.Config) error { ); err != nil { return fmt.Errorf("mdadm failed: %v", err) } + + devName := md.Name + if !strings.HasPrefix(devName, "/dev") { + devName = "/dev/md/" + md.Name + } + // Wait for the created device node to show up, no udev + // race prevention required because this node did not + // exist before. + if err := s.waitOnDevices([]string{devName}, "raids"); err != nil { + return err + } } return nil