Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions pkg/network/protocols/http/etw_http_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -1402,6 +1402,14 @@ func etwHttpServiceSummary() {
//nolint:revive // TODO(WKIT) Fix revive linter
func (hei *EtwInterface) OnEvent(eventInfo *etw.DDEventRecord) {

// Signal that ETW is ready on the first event received. This confirms the
// provider is live and producing, and lets waiters proceed without relying
// on a fixed sleep.
hei.etwReadyOnce.Do(func() {
close(hei.etwReady)
log.Infof("ETW HttpService tracing is now ready (first event received)")
})

// Total number of bytes transferred to kernel from HTTP.sys driver. 0x68 is ETW header size
transferedETWBytesTotal += (uint64(eventInfo.UserDataLength) + 0x68)
transferedETWBytesPayload += uint64(eventInfo.UserDataLength)
Expand Down
15 changes: 15 additions & 0 deletions pkg/network/protocols/http/etw_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@ type EtwInterface struct {
captureHTTPS bool
requestSize int64

// etwReady is closed once the ETW provider delivers its first event,
// signaling that the session is live and producing. It replaces the
// unreliable time.Sleep() approach for waiting on ETW startup.
etwReady chan struct{}
etwReadyOnce sync.Once

// ETW component
httpguid windows.GUID
session etw.Session
Expand All @@ -43,6 +49,7 @@ func NewEtwInterface(c *config.Config) (*EtwInterface, error) {
captureHTTPS: c.EnableNativeTLSMonitoring,
captureHTTP: c.EnableHTTPMonitoring,
requestSize: c.HTTPMaxRequestFragment,
etwReady: make(chan struct{}),
}
etwSessionName := "SystemProbeUSM_ETW"
etwcomp, err := etwimpl.NewEtw()
Expand Down Expand Up @@ -156,6 +163,14 @@ func (hei *EtwInterface) StartReadingHttpFlows() {
}()
}

// ETWReady returns a channel that is closed once the ETW provider has
// delivered its first event, indicating the session is live and producing
// events. Callers can select on it (with a timeout) instead of sleeping for a
// fixed duration to wait for ETW startup.
func (hei *EtwInterface) ETWReady() <-chan struct{} {
return hei.etwReady
}

//nolint:revive // TODO(WKIT) Fix revive linter
func (hei *EtwInterface) Close() {
hei.OnStop()
Expand Down
76 changes: 69 additions & 7 deletions pkg/network/protocols/http/etw_interface_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,21 @@ func executeRequestForTest(t *testing.T, etw *EtwInterface, test testDef) ([]Win
go func() {
defer wg.Done()
for {
trans, tok := <-etw.DataChannel
var trans []WinHttpTransaction
var tok bool
// Idle timeout: fail fast if no transaction arrives within the budget
// rather than blocking forever and letting the whole test binary hit
// the global 5-minute timeout (WINA-2079). The timer is recreated each
// iteration, so it acts as a per-batch idle timeout and does not cut
// off long-running tests (e.g. the flood test) that keep making
// progress. Use t.Errorf (not require/FailNow) since we're in a
// goroutine other than the test's.
select {
case trans, tok = <-etw.DataChannel:
case <-time.After(60 * time.Second):
t.Errorf("timed out waiting for ETW transaction for %q: received %d of %d expected", test.name, len(txns), test.count)
return
}
// there is spurious other traffic from other powershell/.net things going on
// skip transactions we're sure aren't generated by this test.
for _, tx := range trans {
Expand Down Expand Up @@ -250,6 +264,55 @@ func executeRequestForTest(t *testing.T, etw *EtwInterface, test testDef) ([]Win

}

// waitForETWReady drives warmup HTTP traffic to the local IIS site until the
// ETW provider delivers its first event (signaled via etw.ETWReady), then
// drains any warmup transactions so they don't bleed into the first subtest.
//
// We can't assume this host has any incidental HTTP traffic, so we generate our
// own to confirm the provider is live before firing the real test requests
// (WINA-2079). This replaces the previous unreliable fixed time.Sleep().
func waitForETWReady(t *testing.T, etw *EtwInterface) {
stopWarmup := make(chan struct{})
var warmupWG sync.WaitGroup
warmupWG.Add(1)
go func() {
defer warmupWG.Done()
for {
// Hit the default site; ignore errors (the provider being up is what
// matters, not the response).
if resp, err := nethttp.Get("http://127.0.0.1:80/"); err == nil {
_ = resp.Body.Close()
}
select {
case <-stopWarmup:
return
case <-time.After(500 * time.Millisecond):
}
}
}()

select {
case <-etw.ETWReady():
t.Log("ETW provider is ready (first event received)")
case <-time.After(30 * time.Second):
t.Log("timed out waiting for ETW readiness signal; proceeding anyway")
}

close(stopWarmup)
warmupWG.Wait()

// Drain the warmup transactions so they aren't counted toward the first
// subtest. No new traffic is generated once warmup stops, so an idle gap
// longer than the poller's interval (3s) means the channel is clear.
for {
select {
case <-etw.DataChannel:
case <-time.After(5 * time.Second):
return
}
}
}

func TestEtwTransactions(t *testing.T) {
ebpftest.LogLevel(t, "info")
cfg := config.New()
Expand All @@ -262,12 +325,11 @@ func TestEtwTransactions(t *testing.T) {

etw.StartReadingHttpFlows()

/*
* This is a bit kludgy, but we need to wait for the ETW provider to start. Empirically, it
* takes "some time" for the provider to start sending messages, which leads to some raciness
* if we're looking for very specific messages.
*/
time.Sleep(10 * time.Second)
// Wait for the ETW provider to start producing events before firing the real
// test requests. Issuing requests before the provider is live causes their
// events to be missed, which previously hung the whole suite (WINA-2079).
waitForETWReady(t, etw)

for _, test := range setupTests() {

t.Run(test.name, func(t *testing.T) {
Expand Down
Loading