From 4e76bb98d1c29c99832388e132987d8f5ea598ce Mon Sep 17 00:00:00 2001 From: Swarit Pandey Date: Wed, 6 May 2026 03:21:44 +0530 Subject: [PATCH 1/3] fix(linux/mac): lock contention race and console user edge case Signed-off-by: Swarit Pandey --- cmd/stepsecurity-dev-machine-guard/main.go | 12 ++++++ internal/device/device_test.go | 45 ++++++++++++++++++++++ internal/executor/executor.go | 21 +++++++--- internal/executor/mock.go | 19 +++++++++ internal/systemd/systemd.go | 29 +++++++++++++- internal/systemd/systemd_test.go | 39 +++++++++++++++++++ internal/telemetry/telemetry.go | 27 ++++++++----- 7 files changed, 175 insertions(+), 17 deletions(-) create mode 100644 internal/systemd/systemd_test.go diff --git a/cmd/stepsecurity-dev-machine-guard/main.go b/cmd/stepsecurity-dev-machine-guard/main.go index c512f2a..0d34cdf 100644 --- a/cmd/stepsecurity-dev-machine-guard/main.go +++ b/cmd/stepsecurity-dev-machine-guard/main.go @@ -150,6 +150,18 @@ func main() { os.Exit(1) } + // On Linux, systemd.Install enabled the timer but did not start it. + // Start it now that the inline scan above has released the singleton + // lock, so the timer's first (Persistent=true catch-up) firing does + // not race with that scan (issue #62). Best-effort: a failure here + // means scheduled scans won't run until the next user-systemd + // reload, but the install is otherwise complete. + if runtime.GOOS == "linux" { + if err := systemd.StartTimer(exec, log); err != nil { + log.Progress("warning: timer start failed (%v) — scheduled scans will resume after the next user-systemd reload", err) + } + } + case "uninstall": _, _ = fmt.Fprintf(os.Stdout, "StepSecurity Dev Machine Guard v%s\n\n", buildinfo.Version) switch runtime.GOOS { diff --git a/internal/device/device_test.go b/internal/device/device_test.go index 76832b0..fad1658 100644 --- a/internal/device/device_test.go +++ b/internal/device/device_test.go @@ -2,6 +2,7 @@ package device import ( "context" + "errors" "testing" "github.com/step-security/dev-machine-guard/internal/executor" @@ -161,6 +162,50 @@ func TestGather_LinuxDistroOnly(t *testing.T) { } } +// TestGather_NoConsoleUser_DoesNotLeakRoot covers issue #63: when running as +// a daemon on macOS and LoggedInUser() reports an error (no GUI console +// user), getDeveloperIdentity must not silently fall back to the process's +// own user (which under a LaunchDaemon is "root"). Identity should resolve +// to "unknown" so the telemetry payload's no_user_logged_in flag fires. +func TestGather_NoConsoleUser_DoesNotLeakRoot(t *testing.T) { + mock := executor.NewMock() + mock.SetGOOS("darwin") + mock.SetIsRoot(true) + mock.SetUsername("root") // process-owner identity, the trap + mock.SetHostname("mac") + mock.SetCommand("SERIAL\n \"IOPlatformSerialNumber\" = \"SERIAL\"\n", "", 0, "ioreg", "-l") + mock.SetCommand("15.1\n", "", 0, "sw_vers", "-productVersion") + mock.SetLoggedInUserError(errors.New("no GUI console user (owner=\"_windowserver\")")) + + dev := Gather(context.Background(), mock) + + if dev.UserIdentity != "unknown" { + t.Errorf("user_identity: expected 'unknown' (no console user), got %q — root leak from CurrentUser fallback", dev.UserIdentity) + } +} + +// TestGather_NoConsoleUser_EnvVarRescues asserts that the env-var path in +// getDeveloperIdentity still works when LoggedInUser would error. Operators +// who populate USER_EMAIL/DEVELOPER_EMAIL in the daemon plist should still +// get correct identity attribution even on no-console-user hosts. +func TestGather_NoConsoleUser_EnvVarRescues(t *testing.T) { + mock := executor.NewMock() + mock.SetGOOS("darwin") + mock.SetIsRoot(true) + mock.SetUsername("root") + mock.SetHostname("mac") + mock.SetCommand("SERIAL\n \"IOPlatformSerialNumber\" = \"SERIAL\"\n", "", 0, "ioreg", "-l") + mock.SetCommand("15.1\n", "", 0, "sw_vers", "-productVersion") + mock.SetLoggedInUserError(errors.New("no console user")) + mock.SetEnv("USER_EMAIL", "operator@example.com") + + dev := Gather(context.Background(), mock) + + if dev.UserIdentity != "operator@example.com" { + t.Errorf("user_identity: expected USER_EMAIL value, got %q", dev.UserIdentity) + } +} + func TestGather_Windows(t *testing.T) { mock := executor.NewMock() mock.SetGOOS("windows") diff --git a/internal/executor/executor.go b/internal/executor/executor.go index 0d4ffe7..65c6cfd 100644 --- a/internal/executor/executor.go +++ b/internal/executor/executor.go @@ -186,23 +186,32 @@ func (r *Real) LoggedInUser() (*user.User, error) { return r.CurrentUser() } - // On macOS running as root, detect the console user. - // This mirrors the bash script's get_logged_in_user_info() which uses - // stat -f%Su /dev/console to find who is actually logged in. + // On macOS running as root, detect the console user via + // stat -f%Su /dev/console (mirrors the bash script's + // get_logged_in_user_info()). + // + // When the lookup can't yield a real GUI user — stat errored, console + // is owned by root or _windowserver, or the resolved name doesn't + // exist in Directory Services — we must NOT fall back to + // r.CurrentUser(). Under a LaunchDaemon that returns the root user + // with err == nil, callers treat that as "root is the developer", and + // the telemetry pipeline ships user_identity="root" with + // no_user_logged_in=false (issue #63). Surface the absence as an + // error so callers can flag the run as no-user instead. ctx := context.Background() stdout, _, _, err := r.Run(ctx, "stat", "-f%Su", "/dev/console") if err != nil { - return r.CurrentUser() + return nil, fmt.Errorf("stat /dev/console failed: %w", err) } username := strings.TrimSpace(stdout) if username == "" || username == "root" || username == "_windowserver" { - return r.CurrentUser() + return nil, fmt.Errorf("no GUI console user (owner=%q)", username) } u, err := user.Lookup(username) if err != nil { - return r.CurrentUser() + return nil, fmt.Errorf("console user %q not in directory services: %w", username, err) } return u, nil diff --git a/internal/executor/mock.go b/internal/executor/mock.go index df31836..cc44cbe 100644 --- a/internal/executor/mock.go +++ b/internal/executor/mock.go @@ -43,6 +43,11 @@ type Mock struct { // macOS Command Line Tools presence (false simulates a Mac without CLT // installed, where /usr/bin/python3 etc. are install-prompt shims). appleCLTInstalled bool + + // LoggedInUser override — when set, LoggedInUser returns (nil, err) + // instead of falling through to CurrentUser. Used by tests covering + // the macOS+root "no console user" branch (issue #63). + loggedInUserErr error } type cmdResult struct { @@ -177,6 +182,14 @@ func (m *Mock) SetAppleCLTInstalled(installed bool) { m.appleCLTInstalled = installed } +// SetLoggedInUserError makes LoggedInUser return (nil, err). Pass nil to +// reset to default behavior (delegating to CurrentUser). +func (m *Mock) SetLoggedInUserError(err error) { + m.mu.Lock() + defer m.mu.Unlock() + m.loggedInUserErr = err +} + // --- Executor interface --- func (m *Mock) Run(_ context.Context, name string, args ...string) (string, string, int, error) { @@ -298,6 +311,12 @@ func (m *Mock) Glob(pattern string) ([]string, error) { } func (m *Mock) LoggedInUser() (*user.User, error) { + m.mu.RLock() + err := m.loggedInUserErr + m.mu.RUnlock() + if err != nil { + return nil, err + } return m.CurrentUser() } diff --git a/internal/systemd/systemd.go b/internal/systemd/systemd.go index 42dc132..3a15d37 100644 --- a/internal/systemd/systemd.go +++ b/internal/systemd/systemd.go @@ -78,7 +78,13 @@ func Install(exec executor.Executor, log *progress.Logger) error { return fmt.Errorf("daemon-reload failed (exit code %d): %s", daemonExitCode, daemonStderr) } - _, stderr, exitCode, err := exec.Run(ctx, "systemctl", "--user", "enable", "--now", unitName+".timer") + // Enable (without --now) so the unit is loaded across reboots. Activating + // the timer in this session is deferred to StartTimer, which the install + // command calls only after its inline post-install telemetry has released + // the singleton lock. If we used --now here, the timer's Persistent=true + + // already-elapsed OnBootSec would fire the service immediately and race + // with that inline run on the lockfile (issue #62). + _, stderr, exitCode, err := exec.Run(ctx, "systemctl", "--user", "enable", unitName+".timer") if err != nil { return fmt.Errorf("failed to enable timer: %w", err) } @@ -96,6 +102,27 @@ func Install(exec executor.Executor, log *progress.Logger) error { return nil } +// StartTimer activates the timer that Install enabled. Split out from Install +// so the install command can run its inline post-install telemetry first +// (and release the singleton lock) before the timer is allowed to fire its +// own first invocation. With Persistent=true on the timer unit, this start +// will trigger one immediate catch-up scan via the service unit — that's +// fine because the inline scan has already completed. +func StartTimer(exec executor.Executor, log *progress.Logger) error { + ctx := context.Background() + + _, stderr, exitCode, err := exec.Run(ctx, "systemctl", "--user", "start", unitName+".timer") + if err != nil { + return fmt.Errorf("failed to start timer: %w", err) + } + if exitCode != 0 { + return fmt.Errorf("failed to start timer (exit code %d): %s", exitCode, stderr) + } + + log.Progress("Timer started") + return nil +} + // Uninstall removes the systemd user timer and service units. func Uninstall(exec executor.Executor, log *progress.Logger) error { ctx := context.Background() diff --git a/internal/systemd/systemd_test.go b/internal/systemd/systemd_test.go new file mode 100644 index 0000000..59a7d38 --- /dev/null +++ b/internal/systemd/systemd_test.go @@ -0,0 +1,39 @@ +package systemd + +import ( + "strings" + "testing" + + "github.com/step-security/dev-machine-guard/internal/executor" + "github.com/step-security/dev-machine-guard/internal/progress" +) + +// TestStartTimer_IssuesPlainStart asserts the timer-activation command does +// not carry --now (which would be redundant for `start`) and targets the +// expected unit name. Deliberately separate from Install so the install-time +// race fix (issue #62) — enable without --now, then StartTimer after the +// inline scan — stays locked in. +func TestStartTimer_IssuesPlainStart(t *testing.T) { + mock := executor.NewMock() + mock.SetCommand("", "", 0, "systemctl", "--user", "start", unitName+".timer") + + if err := StartTimer(mock, progress.NewLogger(progress.LevelInfo)); err != nil { + t.Fatalf("StartTimer returned error: %v", err) + } +} + +// TestStartTimer_PropagatesFailure asserts the function surfaces a non-zero +// systemctl exit so the install command can warn the operator. +func TestStartTimer_PropagatesFailure(t *testing.T) { + mock := executor.NewMock() + mock.SetCommand("", "Failed to start: Unit not found", 1, + "systemctl", "--user", "start", unitName+".timer") + + err := StartTimer(mock, progress.NewLogger(progress.LevelInfo)) + if err == nil { + t.Fatal("expected error from non-zero systemctl exit, got nil") + } + if !strings.Contains(err.Error(), "exit code 1") { + t.Errorf("error should reference the exit code: %v", err) + } +} diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 82a9e72..7bfa423 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -514,16 +514,23 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err // Build payload payload := &Payload{ - CustomerID: config.CustomerID, - DeviceID: dev.SerialNumber, - SerialNumber: dev.SerialNumber, - UserIdentity: dev.UserIdentity, - Hostname: dev.Hostname, - Platform: dev.Platform, - OSVersion: dev.OSVersion, - AgentVersion: buildinfo.Version, - CollectedAt: endTime.Unix(), - NoUserLoggedIn: dev.UserIdentity == "" || dev.UserIdentity == "unknown", + CustomerID: config.CustomerID, + DeviceID: dev.SerialNumber, + SerialNumber: dev.SerialNumber, + UserIdentity: dev.UserIdentity, + Hostname: dev.Hostname, + Platform: dev.Platform, + OSVersion: dev.OSVersion, + AgentVersion: buildinfo.Version, + CollectedAt: endTime.Unix(), + // Treat a daemon-context UserIdentity == "root" as "no real user" + // too. Even with the executor.LoggedInUser() fix (issue #63), this + // catches any other code path that ends up shipping "root" from + // under a LaunchDaemon (e.g., env-var path returning literal "root", + // future regressions in detection). + NoUserLoggedIn: dev.UserIdentity == "" || + dev.UserIdentity == "unknown" || + (dev.UserIdentity == "root" && exec.IsRoot()), IDEExtensions: extensions, IDEInstallations: ides, From 30b0e7c1728ccba2dde8d7a0ef848cdc0e4214d3 Mon Sep 17 00:00:00 2001 From: Swarit Pandey Date: Thu, 14 May 2026 08:03:24 +0530 Subject: [PATCH 2/3] chore: address comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - telemetry.go: hoist the no-user predicate into a local so the warning, the "Developer:" progress line, and the NoUserLoggedIn payload field agree (previously the warning fired only for empty/"unknown", letting "Developer: root" still print while NoUserLoggedIn was true). - main.go (install): start the systemd timer regardless of telemetry.Run outcome — the install itself succeeded, so the schedule should activate. Demoted the timer-start failure log from Progress to Warn. - launchd / scan / aicli / telemetry: callers using LoggedInUser only to obtain a HomeDir now fall back to CurrentUser when LoggedInUser returns an error (the post-#63 behavior). Identity-attribution callers (device.go, the sudo-delegation gate in telemetry.go) keep the strict no-fallback behavior. Signed-off-by: Swarit Pandey --- cmd/stepsecurity-dev-machine-guard/main.go | 18 +++++++----- internal/detector/aicli.go | 13 ++++++--- internal/launchd/launchd.go | 12 +++++--- internal/scan/scanner.go | 8 +++-- internal/telemetry/telemetry.go | 34 +++++++++++++--------- 5 files changed, 53 insertions(+), 32 deletions(-) diff --git a/cmd/stepsecurity-dev-machine-guard/main.go b/cmd/stepsecurity-dev-machine-guard/main.go index 0d34cdf..f20f68a 100644 --- a/cmd/stepsecurity-dev-machine-guard/main.go +++ b/cmd/stepsecurity-dev-machine-guard/main.go @@ -145,23 +145,25 @@ func main() { } log.Progress("Sending initial telemetry...") fmt.Println() - if err := telemetry.Run(exec, log, cfg); err != nil { - log.Error("%v", err) - os.Exit(1) - } + telemetryErr := telemetry.Run(exec, log, cfg) // On Linux, systemd.Install enabled the timer but did not start it. // Start it now that the inline scan above has released the singleton // lock, so the timer's first (Persistent=true catch-up) firing does - // not race with that scan (issue #62). Best-effort: a failure here - // means scheduled scans won't run until the next user-systemd - // reload, but the install is otherwise complete. + // not race with that scan (issue #62). Run regardless of the + // telemetry result — the install itself succeeded and the schedule + // should activate; a failed initial telemetry run does not undo it. if runtime.GOOS == "linux" { if err := systemd.StartTimer(exec, log); err != nil { - log.Progress("warning: timer start failed (%v) — scheduled scans will resume after the next user-systemd reload", err) + log.Warn("timer start failed (%v) — scheduled scans will resume after the next user-systemd reload", err) } } + if telemetryErr != nil { + log.Error("%v", telemetryErr) + os.Exit(1) + } + case "uninstall": _, _ = fmt.Fprintf(os.Stdout, "StepSecurity Dev Machine Guard v%s\n\n", buildinfo.Version) switch runtime.GOOS { diff --git a/internal/detector/aicli.go b/internal/detector/aicli.go index 06f0e84..32a6bab 100644 --- a/internal/detector/aicli.go +++ b/internal/detector/aicli.go @@ -355,11 +355,16 @@ func expandTilde(path, homeDir string) string { } func getHomeDir(exec executor.Executor) string { - u, err := exec.LoggedInUser() - if err != nil { - return os.TempDir() + if u, err := exec.LoggedInUser(); err == nil { + return u.HomeDir + } + // No console user (issue #63) — fall back to the current user's home + // before giving up to TempDir, otherwise file-path expansion in this + // detector would lose any chance of hitting the right files. + if u, err := exec.CurrentUser(); err == nil { + return u.HomeDir } - return u.HomeDir + return os.TempDir() } // resolveEnvPath replaces %ENVVAR% patterns in Windows-style paths using the executor. diff --git a/internal/launchd/launchd.go b/internal/launchd/launchd.go index 74a5c8d..42233a4 100644 --- a/internal/launchd/launchd.go +++ b/internal/launchd/launchd.go @@ -68,14 +68,18 @@ func Install(exec executor.Executor, log *progress.Logger) error { } } - // Resolve the real user's home directory for the plist. - // When running as root (LaunchDaemon), launchd provides a minimal environment - // without HOME, so os.UserHomeDir() would fail at runtime. We detect the - // logged-in console user now and bake their HOME into the plist. + // Resolve a usable home directory for the plist. + // When running as root (LaunchDaemon), launchd provides a minimal + // environment without HOME, so os.UserHomeDir() would fail at runtime. + // Prefer the GUI console user's HOME; if that lookup fails (no console + // user — LoggedInUser returns an error per issue #63) fall back to + // CurrentUser so the plist still has a value to bake in. userHome := "" if exec.IsRoot() { if u, err := exec.LoggedInUser(); err == nil { userHome = u.HomeDir + } else if u, err := exec.CurrentUser(); err == nil { + userHome = u.HomeDir } } diff --git a/internal/scan/scanner.go b/internal/scan/scanner.go index 0220169..bc7bf4d 100644 --- a/internal/scan/scanner.go +++ b/internal/scan/scanner.go @@ -307,8 +307,12 @@ func resolveSearchDirs(exec executor.Executor, dirs []string) []string { resolved := make([]string, 0, len(dirs)) for _, d := range dirs { if d == "$HOME" { - u, err := exec.LoggedInUser() - if err == nil { + if u, err := exec.LoggedInUser(); err == nil { + d = u.HomeDir + } else if u, err := exec.CurrentUser(); err == nil { + // No console user (issue #63): we still need *some* home + // to expand $HOME against, otherwise the literal "$HOME" + // goes downstream and search misses everything. d = u.HomeDir } } diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 7bfa423..6258182 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -185,15 +185,25 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err log.Progress("Gathering device information...") dev := device.Gather(ctx, exec) deviceID = dev.SerialNumber + // Single source of truth for "is this a real developer or a daemon + // context?" — same predicate the payload uses below, so the warning, + // the Developer: line, and the telemetry field always agree. + noUserLoggedIn := dev.UserIdentity == "" || + dev.UserIdentity == "unknown" || + (dev.UserIdentity == "root" && exec.IsRoot()) log.Progress("Device ID (Serial): %s", dev.SerialNumber) log.Progress("OS Version: %s", dev.OSVersion) - log.Progress("Developer: %s", dev.UserIdentity) - log.Debug("device gathered: hostname=%q platform=%q serial=%q user_identity=%q", dev.Hostname, dev.Platform, dev.SerialNumber, dev.UserIdentity) + if noUserLoggedIn { + log.Progress("Developer: (no user logged in)") + } else { + log.Progress("Developer: %s", dev.UserIdentity) + } + log.Debug("device gathered: hostname=%q platform=%q serial=%q user_identity=%q no_user=%v", dev.Hostname, dev.Platform, dev.SerialNumber, dev.UserIdentity, noUserLoggedIn) if dev.SerialNumber == "" { log.Warn("device serial number could not be determined — telemetry will upload with empty device_id") } - if dev.UserIdentity == "" || dev.UserIdentity == "unknown" { - log.Warn("user identity could not be determined — telemetry will be marked no_user_logged_in") + if noUserLoggedIn { + log.Warn("no real developer identity (UserIdentity=%q, root=%v) — telemetry will be marked no_user_logged_in", dev.UserIdentity, exec.IsRoot()) } // Report "started" now that we have a device_id. Fire-and-forget. @@ -523,14 +533,7 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err OSVersion: dev.OSVersion, AgentVersion: buildinfo.Version, CollectedAt: endTime.Unix(), - // Treat a daemon-context UserIdentity == "root" as "no real user" - // too. Even with the executor.LoggedInUser() fix (issue #63), this - // catches any other code path that ends up shipping "root" from - // under a LaunchDaemon (e.g., env-var path returning literal "root", - // future regressions in detection). - NoUserLoggedIn: dev.UserIdentity == "" || - dev.UserIdentity == "unknown" || - (dev.UserIdentity == "root" && exec.IsRoot()), + NoUserLoggedIn: noUserLoggedIn, IDEExtensions: extensions, IDEInstallations: ides, @@ -768,8 +771,11 @@ func resolveSearchDirs(exec executor.Executor, dirs []string) []string { resolved := make([]string, 0, len(dirs)) for _, d := range dirs { if d == "$HOME" { - u, err := exec.LoggedInUser() - if err == nil { + if u, err := exec.LoggedInUser(); err == nil { + d = u.HomeDir + } else if u, err := exec.CurrentUser(); err == nil { + // No console user (issue #63): still expand to *some* home + // or the literal "$HOME" would propagate downstream. d = u.HomeDir } } From ab705147833e961df1a2afeca1b01244fd8887cb Mon Sep 17 00:00:00 2001 From: Swarit Pandey Date: Thu, 14 May 2026 13:55:57 +0530 Subject: [PATCH 3/3] chore: gofmt struct alignment in telemetry payload --- internal/telemetry/telemetry.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 6258182..4506275 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -524,15 +524,15 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err // Build payload payload := &Payload{ - CustomerID: config.CustomerID, - DeviceID: dev.SerialNumber, - SerialNumber: dev.SerialNumber, - UserIdentity: dev.UserIdentity, - Hostname: dev.Hostname, - Platform: dev.Platform, - OSVersion: dev.OSVersion, - AgentVersion: buildinfo.Version, - CollectedAt: endTime.Unix(), + CustomerID: config.CustomerID, + DeviceID: dev.SerialNumber, + SerialNumber: dev.SerialNumber, + UserIdentity: dev.UserIdentity, + Hostname: dev.Hostname, + Platform: dev.Platform, + OSVersion: dev.OSVersion, + AgentVersion: buildinfo.Version, + CollectedAt: endTime.Unix(), NoUserLoggedIn: noUserLoggedIn, IDEExtensions: extensions,