Skip to content

Commit adbd15d

Browse files
committed
fix(ssi): ensure found datadog-installer has necessary commands
1 parent 2e9ccff commit adbd15d

3 files changed

Lines changed: 273 additions & 5 deletions

File tree

pkg/fleet/installer/packages/apminject/systemd.go

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ import (
1313
_ "embed"
1414
"fmt"
1515
"os"
16+
"os/exec"
1617
"path/filepath"
18+
"time"
1719

1820
"github.com/DataDog/datadog-agent/pkg/fleet/installer/packages/service/systemd"
1921
"github.com/DataDog/datadog-agent/pkg/fleet/installer/telemetry"
@@ -56,8 +58,17 @@ type SystemdServiceManager struct {
5658
// - Regular OCI agent install: datadog-agent is installed before
5759
// datadog-apm-inject, so the agent's embedded installer is in place.
5860
// - Legacy deb/rpm: /usr/bin/datadog-installer ships with the package.
61+
//
62+
// Candidates are additionally verified to support the `apm instrument-start`/
63+
// `instrument-stop` subcommands the unit invokes. This matters on upgrade from
64+
// an older agent: a stale installer (e.g. the previous version's embedded
65+
// binary still pointed to by a `stable` symlink, or an old standalone installer
66+
// at a higher-priority candidate path) would otherwise be baked into ExecStart
67+
// and fail on every boot, silently breaking host injection. resolveInstallerPath
68+
// skips such binaries and falls through to a candidate that actually supports
69+
// the subcommands (e.g. the freshly-copied datadog-installer-ssi).
5970
func NewSystemdServiceManager() *SystemdServiceManager {
60-
installerPath, err := resolveInstallerPath(installerPathCandidates)
71+
installerPath, err := resolveInstallerPath(installerPathCandidates, supportsInstrumentSubcommands)
6172
if err != nil {
6273
log.Warnf("could not resolve datadog-installer path for APM inject service: %v; writeServiceFile will refuse to render the unit", err)
6374
}
@@ -144,7 +155,30 @@ func (s *SystemdServiceManager) writeServiceFile() error {
144155
return nil
145156
}
146157

147-
func resolveInstallerPath(candidates []string) (string, error) {
158+
// installerVerifier reports whether the datadog-installer binary at path
159+
// supports the subcommands the apm-inject unit invokes. It is a parameter of
160+
// resolveInstallerPath so tests can exercise pure path resolution without an
161+
// executable installer on disk.
162+
type installerVerifier func(path string) bool
163+
164+
// supportsInstrumentSubcommands reports whether the installer at path exposes
165+
// the `apm instrument-start` and `apm instrument-stop` subcommands. Installers
166+
// shipped before the systemd-managed preload feature have an `apm` command
167+
// without them, so `apm --help` lists neither. Inspecting the help output is
168+
// side-effect free, unlike actually running instrument-start.
169+
func supportsInstrumentSubcommands(path string) bool {
170+
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
171+
defer cancel()
172+
out, err := exec.CommandContext(ctx, path, "apm", "--help").CombinedOutput()
173+
if err != nil {
174+
log.Debugf("installer candidate %s does not support instrument subcommands: %v", path, err)
175+
return false
176+
}
177+
return bytes.Contains(out, []byte("instrument-start")) && bytes.Contains(out, []byte("instrument-stop"))
178+
}
179+
180+
func resolveInstallerPath(candidates []string, verify installerVerifier) (string, error) {
181+
var skippedUnsupported []string
148182
for _, p := range candidates {
149183
info, err := os.Stat(p)
150184
// Skip if doesn't exist.
@@ -155,7 +189,16 @@ func resolveInstallerPath(candidates []string) (string, error) {
155189
if info.IsDir() || info.Mode()&0111 == 0 {
156190
continue
157191
}
192+
// Skip binaries too old to support the subcommands the unit invokes,
193+
// so we never bake a doomed ExecStart into the service file.
194+
if verify != nil && !verify(p) {
195+
skippedUnsupported = append(skippedUnsupported, p)
196+
continue
197+
}
158198
return p, nil
159199
}
200+
if len(skippedUnsupported) > 0 {
201+
return "", fmt.Errorf("found datadog-installer binaries but none support `apm instrument-start` (too old): %v", skippedUnsupported)
202+
}
160203
return "", fmt.Errorf("no datadog-installer binary found among %v", candidates)
161204
}

pkg/fleet/installer/packages/apminject/systemd_test.go

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func TestResolveInstallerPath(t *testing.T) {
126126
executable := filepath.Join(tmpDir, "executable")
127127
require.NoError(t, os.WriteFile(executable, []byte("#!/bin/sh\n"), 0755))
128128

129-
got, err := resolveInstallerPath([]string{missing, nonExec, executable})
129+
got, err := resolveInstallerPath([]string{missing, nonExec, executable}, alwaysSupported)
130130
require.NoError(t, err)
131131
assert.Equal(t, executable, got)
132132
}
@@ -136,7 +136,7 @@ func TestResolveInstallerPath_AllMissing(t *testing.T) {
136136
_, err := resolveInstallerPath([]string{
137137
filepath.Join(tmpDir, "a"),
138138
filepath.Join(tmpDir, "b"),
139-
})
139+
}, alwaysSupported)
140140
assert.Error(t, err)
141141
}
142142

@@ -147,11 +147,41 @@ func TestResolveInstallerPath_SkipsDirectory(t *testing.T) {
147147
exe := filepath.Join(tmpDir, "candidate-exe")
148148
require.NoError(t, os.WriteFile(exe, []byte{}, 0755))
149149

150-
got, err := resolveInstallerPath([]string{dir, exe})
150+
got, err := resolveInstallerPath([]string{dir, exe}, alwaysSupported)
151151
require.NoError(t, err)
152152
assert.Equal(t, exe, got)
153153
}
154154

155+
// TestResolveInstallerPath_SkipsUnsupportedInstaller guards the upgrade case:
156+
// a higher-priority candidate that is on disk and executable but too old to
157+
// support `apm instrument-start` must be skipped in favor of a newer candidate.
158+
func TestResolveInstallerPath_SkipsUnsupportedInstaller(t *testing.T) {
159+
tmpDir := t.TempDir()
160+
stale := filepath.Join(tmpDir, "stale-installer")
161+
require.NoError(t, os.WriteFile(stale, []byte{}, 0755))
162+
fresh := filepath.Join(tmpDir, "fresh-installer")
163+
require.NoError(t, os.WriteFile(fresh, []byte{}, 0755))
164+
165+
verify := func(p string) bool { return p == fresh }
166+
167+
got, err := resolveInstallerPath([]string{stale, fresh}, verify)
168+
require.NoError(t, err)
169+
assert.Equal(t, fresh, got, "stale installer (no instrument-start) must be skipped")
170+
}
171+
172+
// TestResolveInstallerPath_AllUnsupported asserts we fail loudly rather than
173+
// return a stale installer that would produce a unit doomed to fail on boot.
174+
func TestResolveInstallerPath_AllUnsupported(t *testing.T) {
175+
tmpDir := t.TempDir()
176+
exe := filepath.Join(tmpDir, "old-installer")
177+
require.NoError(t, os.WriteFile(exe, []byte{}, 0755))
178+
179+
_, err := resolveInstallerPath([]string{exe}, func(string) bool { return false })
180+
assert.ErrorContains(t, err, "too old")
181+
}
182+
183+
func alwaysSupported(string) bool { return true }
184+
155185
func TestSystemdServiceManager_Uninstall(t *testing.T) {
156186
tmpDir := t.TempDir()
157187
testServicePath := filepath.Join(tmpDir, "test-service.service")
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
// Unless explicitly stated otherwise all files in this repository are licensed
2+
// under the Apache License Version 2.0.
3+
// This product includes software developed at Datadog (https://www.datadoghq.com/).
4+
// Copyright 2016-present Datadog, Inc.
5+
6+
package installscript
7+
8+
import (
9+
"fmt"
10+
"os"
11+
"regexp"
12+
"strings"
13+
"testing"
14+
"time"
15+
16+
"github.com/stretchr/testify/assert"
17+
"github.com/stretchr/testify/require"
18+
19+
e2eos "github.com/DataDog/datadog-agent/test/e2e-framework/components/os"
20+
"github.com/DataDog/datadog-agent/test/e2e-framework/scenarios/aws/ec2"
21+
"github.com/DataDog/datadog-agent/test/e2e-framework/testing/e2e"
22+
awshost "github.com/DataDog/datadog-agent/test/e2e-framework/testing/provisioners/aws/host"
23+
installer "github.com/DataDog/datadog-agent/test/new-e2e/tests/installer/unix"
24+
)
25+
26+
const (
27+
// apmInjectServicePath is where the apminject package writes its systemd unit.
28+
// Mirrors systemd.UserUnitsPath ("/etc/systemd/system") + the unit name.
29+
apmInjectServicePath = "/etc/systemd/system/datadog-apm-inject.service"
30+
apmInjectServiceName = "datadog-apm-inject.service"
31+
// launcherPreloadPath is the line the apm-inject service must keep in
32+
// /etc/ld.so.preload for host instrumentation to be effective.
33+
launcherPreloadPath = "/opt/datadog-packages/datadog-apm-inject/stable/inject/launcher.preload.so"
34+
// productionSSIScriptURL installs the current GA SSI stack. That stack predates
35+
// the systemd-managed ld.so.preload feature and ships an installer whose
36+
// `apm` command has no `instrument-start`/`instrument-stop` subcommands.
37+
productionSSIScriptURL = "https://install.datadoghq.com/scripts/install-ssi.sh"
38+
)
39+
40+
// execStartInstallerRe extracts the installer binary path baked into the unit's
41+
// ExecStart line, e.g. "ExecStart=/opt/.../installer apm instrument-start host".
42+
var execStartInstallerRe = regexp.MustCompile(`(?m)^ExecStart=(\S+) apm instrument-start`)
43+
44+
// ssiUpgradeSuite verifies that upgrading a host that already has an older SSI
45+
// stack (whose datadog-installer lacks the `apm instrument-start`/`instrument-stop`
46+
// subcommands) to a build that enables systemd-managed ld.so.preload produces a
47+
// datadog-apm-inject.service that points at a *working* installer — and that host
48+
// instrumentation survives a reboot, when only the service (not the install-time
49+
// direct fallback) is responsible for populating /etc/ld.so.preload.
50+
type ssiUpgradeSuite struct {
51+
installerScriptBaseSuite
52+
}
53+
54+
// TestSSIUpgrade provisions a single host and runs the upgrade suite. The upgrade
55+
// target is the current pipeline build, so a pipeline id (or commit sha) is required.
56+
func TestSSIUpgrade(t *testing.T) {
57+
requirePipeline(t)
58+
59+
flavor := e2eos.Ubuntu2404
60+
flavor.Architecture = e2eos.AMD64Arch
61+
62+
suite := &ssiUpgradeSuite{
63+
installerScriptBaseSuite: newInstallerScriptSuite(
64+
"installer-ssi-upgrade", flavor, flavor.Architecture,
65+
awshost.WithRunOptions(ec2.WithoutFakeIntake()),
66+
awshost.WithRunOptions(ec2.WithoutAgent()),
67+
),
68+
}
69+
70+
opts := []awshost.ProvisionerOption{
71+
awshost.WithRunOptions(
72+
ec2.WithEC2InstanceOptions(ec2.WithOSArch(flavor, flavor.Architecture)),
73+
ec2.WithoutAgent(),
74+
),
75+
}
76+
opts = append(opts, suite.ProvisionerOptions()...)
77+
78+
e2e.Run(t, suite,
79+
e2e.WithProvisioner(awshost.Provisioner(opts...)),
80+
e2e.WithStackName(suite.Name()),
81+
)
82+
}
83+
84+
func (s *ssiUpgradeSuite) TestUpgradePreservesHostInjection() {
85+
defer s.Purge()
86+
87+
// 1. Install the current GA SSI stack from production. This lands an older
88+
// datadog-installer on disk — one whose `apm` command has no
89+
// instrument-start/instrument-stop subcommands.
90+
s.installProductionSSI()
91+
s.host.AssertPackageInstalledByInstaller("datadog-apm-inject")
92+
s.assertLDPreloadInstrumented("after fresh GA install")
93+
94+
// 2. Upgrade to the pipeline build and opt into systemd-managed preload.
95+
s.RunInstallScript(
96+
s.scriptURLPrefix+"install-ssi.sh",
97+
"DD_SITE=datadoghq.com",
98+
"DD_APM_INSTRUMENTATION_LIBRARIES=python:4",
99+
"DD_APM_INSTRUMENTATION_ENABLED=host",
100+
"DD_APM_INSTRUMENTATION_PRELOAD_MODE=systemd",
101+
"DD_NO_AGENT_INSTALL=true",
102+
"DD_INSTALLER_REGISTRY_URL_AGENT_PACKAGE=installtesting.datad0g.com.internal.dda-testing.com",
103+
)
104+
105+
// 3. The unit must exist, be enabled, and — crucially — reference an installer
106+
// that actually supports the subcommand it invokes. A stale installer
107+
// resolved from a higher-priority candidate path would silently break SSI
108+
// on the next boot.
109+
state := s.host.State()
110+
state.AssertFileExists(apmInjectServicePath, 0644, "root", "root")
111+
state.AssertUnitsLoaded(apmInjectServiceName)
112+
state.AssertUnitsEnabled(apmInjectServiceName)
113+
114+
installerPath := s.execStartInstallerPath()
115+
_, err := s.Env().RemoteHost.Execute(fmt.Sprintf("sudo %s apm instrument-start --help", installerPath))
116+
require.NoErrorf(s.T(), err,
117+
"datadog-apm-inject.service ExecStart references %q, which does not support `apm instrument-start`; "+
118+
"the service would fail on every boot and host injection would silently break", installerPath)
119+
120+
// Right after the upgrade, ld.so.preload is correct regardless of the service
121+
// because Instrument() also writes it directly. The reboot below is what proves
122+
// the *service* (and thus the resolved installer) is healthy.
123+
s.assertLDPreloadInstrumented("after upgrade, before reboot")
124+
125+
// 4. Reboot. Now only datadog-apm-inject.service's ExecStart can populate
126+
// /etc/ld.so.preload — the install-time direct write does not run again.
127+
s.reboot()
128+
129+
require.EventuallyWithT(s.T(), func(c *assert.CollectT) {
130+
out, err := s.Env().RemoteHost.Execute("systemctl is-active " + apmInjectServiceName)
131+
assert.NoErrorf(c, err, "apm-inject service not active after reboot (status: %s)", strings.TrimSpace(out))
132+
}, 2*time.Minute, 5*time.Second)
133+
s.assertLDPreloadInstrumented("after reboot")
134+
}
135+
136+
// installProductionSSI installs the GA SSI stack directly from production, without
137+
// any pipeline/testing registry overrides, so a genuinely older installer ends up
138+
// on disk.
139+
func (s *ssiUpgradeSuite) installProductionSSI() {
140+
s.Env().RemoteHost.MustExecute(fmt.Sprintf("curl -L %s > install_ssi_prod", productionSSIScriptURL))
141+
params := []string{
142+
"DD_API_KEY=" + installer.GetAPIKey(),
143+
"DD_SITE=datadoghq.com",
144+
"DD_APM_INSTRUMENTATION_LIBRARIES=python:4",
145+
"DD_APM_INSTRUMENTATION_ENABLED=host",
146+
"DD_NO_AGENT_INSTALL=true",
147+
}
148+
_, err := s.Env().RemoteHost.Execute(strings.Join(params, " ") + " bash install_ssi_prod")
149+
require.NoError(s.T(), err, "failed to install production SSI stack")
150+
}
151+
152+
// execStartInstallerPath returns the installer binary path baked into the unit's
153+
// ExecStart line.
154+
func (s *ssiUpgradeSuite) execStartInstallerPath() string {
155+
content, err := s.host.ReadFile(apmInjectServicePath)
156+
require.NoError(s.T(), err)
157+
m := execStartInstallerRe.FindStringSubmatch(string(content))
158+
require.Lenf(s.T(), m, 2, "could not find ExecStart installer path in unit:\n%s", string(content))
159+
return m[1]
160+
}
161+
162+
// assertLDPreloadInstrumented asserts the injector launcher is present in
163+
// /etc/ld.so.preload.
164+
func (s *ssiUpgradeSuite) assertLDPreloadInstrumented(when string) {
165+
out := s.Env().RemoteHost.MustExecute("cat /etc/ld.so.preload || true")
166+
require.Containsf(s.T(), out, launcherPreloadPath,
167+
"injector launcher missing from /etc/ld.so.preload (%s); contents:\n%s", when, out)
168+
}
169+
170+
// reboot reboots the host and waits for it to come back with a new boot id.
171+
func (s *ssiUpgradeSuite) reboot() {
172+
before := strings.TrimSpace(s.Env().RemoteHost.MustExecute("cat /proc/sys/kernel/random/boot_id"))
173+
// The connection drops as the host goes down; ignore the error.
174+
_, _ = s.Env().RemoteHost.Execute("sudo reboot")
175+
require.EventuallyWithT(s.T(), func(c *assert.CollectT) {
176+
if err := s.Env().RemoteHost.Reconnect(); err != nil {
177+
assert.NoError(c, err)
178+
return
179+
}
180+
out, err := s.Env().RemoteHost.Execute("cat /proc/sys/kernel/random/boot_id")
181+
if !assert.NoError(c, err) {
182+
return
183+
}
184+
assert.NotEqualf(c, before, strings.TrimSpace(out), "host has not rebooted yet")
185+
}, 5*time.Minute, 10*time.Second)
186+
}
187+
188+
func requirePipeline(t *testing.T) {
189+
t.Helper()
190+
_, hasPipeline := os.LookupEnv("E2E_PIPELINE_ID")
191+
_, hasSHA := os.LookupEnv("CI_COMMIT_SHA")
192+
if !hasPipeline && !hasSHA {
193+
t.Skip("E2E_PIPELINE_ID / CI_COMMIT_SHA not set; this test upgrades to the current pipeline build")
194+
}
195+
}

0 commit comments

Comments
 (0)