1// Copyright 2023 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package testtrace
6
7import (
8	"errors"
9	"fmt"
10	"internal/trace"
11	"slices"
12	"strings"
13)
14
15// Validator is a type used for validating a stream of trace.Events.
16type Validator struct {
17	lastTs   trace.Time
18	gs       map[trace.GoID]*goState
19	ps       map[trace.ProcID]*procState
20	ms       map[trace.ThreadID]*schedContext
21	ranges   map[trace.ResourceID][]string
22	tasks    map[trace.TaskID]string
23	seenSync bool
24	Go121    bool
25}
26
27type schedContext struct {
28	M trace.ThreadID
29	P trace.ProcID
30	G trace.GoID
31}
32
33type goState struct {
34	state   trace.GoState
35	binding *schedContext
36}
37
38type procState struct {
39	state   trace.ProcState
40	binding *schedContext
41}
42
43// NewValidator creates a new Validator.
44func NewValidator() *Validator {
45	return &Validator{
46		gs:     make(map[trace.GoID]*goState),
47		ps:     make(map[trace.ProcID]*procState),
48		ms:     make(map[trace.ThreadID]*schedContext),
49		ranges: make(map[trace.ResourceID][]string),
50		tasks:  make(map[trace.TaskID]string),
51	}
52}
53
54// Event validates ev as the next event in a stream of trace.Events.
55//
56// Returns an error if validation fails.
57func (v *Validator) Event(ev trace.Event) error {
58	e := new(errAccumulator)
59
60	// Validate timestamp order.
61	if v.lastTs != 0 {
62		if ev.Time() <= v.lastTs {
63			e.Errorf("timestamp out-of-order for %+v", ev)
64		} else {
65			v.lastTs = ev.Time()
66		}
67	} else {
68		v.lastTs = ev.Time()
69	}
70
71	// Validate event stack.
72	checkStack(e, ev.Stack())
73
74	switch ev.Kind() {
75	case trace.EventSync:
76		// Just record that we've seen a Sync at some point.
77		v.seenSync = true
78	case trace.EventMetric:
79		m := ev.Metric()
80		if !strings.Contains(m.Name, ":") {
81			// Should have a ":" as per runtime/metrics convention.
82			e.Errorf("invalid metric name %q", m.Name)
83		}
84		// Make sure the value is OK.
85		if m.Value.Kind() == trace.ValueBad {
86			e.Errorf("invalid value")
87		}
88		switch m.Value.Kind() {
89		case trace.ValueUint64:
90			// Just make sure it doesn't panic.
91			_ = m.Value.Uint64()
92		}
93	case trace.EventLabel:
94		l := ev.Label()
95
96		// Check label.
97		if l.Label == "" {
98			e.Errorf("invalid label %q", l.Label)
99		}
100
101		// Check label resource.
102		if l.Resource.Kind == trace.ResourceNone {
103			e.Errorf("label resource none")
104		}
105		switch l.Resource.Kind {
106		case trace.ResourceGoroutine:
107			id := l.Resource.Goroutine()
108			if _, ok := v.gs[id]; !ok {
109				e.Errorf("label for invalid goroutine %d", id)
110			}
111		case trace.ResourceProc:
112			id := l.Resource.Proc()
113			if _, ok := v.ps[id]; !ok {
114				e.Errorf("label for invalid proc %d", id)
115			}
116		case trace.ResourceThread:
117			id := l.Resource.Thread()
118			if _, ok := v.ms[id]; !ok {
119				e.Errorf("label for invalid thread %d", id)
120			}
121		}
122	case trace.EventStackSample:
123		// Not much to check here. It's basically a sched context and a stack.
124		// The sched context is also not guaranteed to align with other events.
125		// We already checked the stack above.
126	case trace.EventStateTransition:
127		// Validate state transitions.
128		//
129		// TODO(mknyszek): A lot of logic is duplicated between goroutines and procs.
130		// The two are intentionally handled identically; from the perspective of the
131		// API, resources all have the same general properties. Consider making this
132		// code generic over resources and implementing validation just once.
133		tr := ev.StateTransition()
134		checkStack(e, tr.Stack)
135		switch tr.Resource.Kind {
136		case trace.ResourceGoroutine:
137			// Basic state transition validation.
138			id := tr.Resource.Goroutine()
139			old, new := tr.Goroutine()
140			if new == trace.GoUndetermined {
141				e.Errorf("transition to undetermined state for goroutine %d", id)
142			}
143			if v.seenSync && old == trace.GoUndetermined {
144				e.Errorf("undetermined goroutine %d after first global sync", id)
145			}
146			if new == trace.GoNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
147				e.Errorf("goroutine %d died with active ranges", id)
148			}
149			state, ok := v.gs[id]
150			if ok {
151				if old != state.state {
152					e.Errorf("bad old state for goroutine %d: got %s, want %s", id, old, state.state)
153				}
154				state.state = new
155			} else {
156				if old != trace.GoUndetermined && old != trace.GoNotExist {
157					e.Errorf("bad old state for unregistered goroutine %d: %s", id, old)
158				}
159				state = &goState{state: new}
160				v.gs[id] = state
161			}
162			// Validate sched context.
163			if new.Executing() {
164				ctx := v.getOrCreateThread(e, ev, ev.Thread())
165				if ctx != nil {
166					if ctx.G != trace.NoGoroutine && ctx.G != id {
167						e.Errorf("tried to run goroutine %d when one was already executing (%d) on thread %d", id, ctx.G, ev.Thread())
168					}
169					ctx.G = id
170					state.binding = ctx
171				}
172			} else if old.Executing() && !new.Executing() {
173				if tr.Stack != ev.Stack() {
174					// This is a case where the transition is happening to a goroutine that is also executing, so
175					// these two stacks should always match.
176					e.Errorf("StateTransition.Stack doesn't match Event.Stack")
177				}
178				ctx := state.binding
179				if ctx != nil {
180					if ctx.G != id {
181						e.Errorf("tried to stop goroutine %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.G, ev.Thread())
182					}
183					ctx.G = trace.NoGoroutine
184					state.binding = nil
185				} else {
186					e.Errorf("stopping goroutine %d not bound to any active context", id)
187				}
188			}
189		case trace.ResourceProc:
190			// Basic state transition validation.
191			id := tr.Resource.Proc()
192			old, new := tr.Proc()
193			if new == trace.ProcUndetermined {
194				e.Errorf("transition to undetermined state for proc %d", id)
195			}
196			if v.seenSync && old == trace.ProcUndetermined {
197				e.Errorf("undetermined proc %d after first global sync", id)
198			}
199			if new == trace.ProcNotExist && v.hasAnyRange(trace.MakeResourceID(id)) {
200				e.Errorf("proc %d died with active ranges", id)
201			}
202			state, ok := v.ps[id]
203			if ok {
204				if old != state.state {
205					e.Errorf("bad old state for proc %d: got %s, want %s", id, old, state.state)
206				}
207				state.state = new
208			} else {
209				if old != trace.ProcUndetermined && old != trace.ProcNotExist {
210					e.Errorf("bad old state for unregistered proc %d: %s", id, old)
211				}
212				state = &procState{state: new}
213				v.ps[id] = state
214			}
215			// Validate sched context.
216			if new.Executing() {
217				ctx := v.getOrCreateThread(e, ev, ev.Thread())
218				if ctx != nil {
219					if ctx.P != trace.NoProc && ctx.P != id {
220						e.Errorf("tried to run proc %d when one was already executing (%d) on thread %d", id, ctx.P, ev.Thread())
221					}
222					ctx.P = id
223					state.binding = ctx
224				}
225			} else if old.Executing() && !new.Executing() {
226				ctx := state.binding
227				if ctx != nil {
228					if ctx.P != id {
229						e.Errorf("tried to stop proc %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.P, ctx.M)
230					}
231					ctx.P = trace.NoProc
232					state.binding = nil
233				} else {
234					e.Errorf("stopping proc %d not bound to any active context", id)
235				}
236			}
237		}
238	case trace.EventRangeBegin, trace.EventRangeActive, trace.EventRangeEnd:
239		// Validate ranges.
240		r := ev.Range()
241		switch ev.Kind() {
242		case trace.EventRangeBegin:
243			if v.hasRange(r.Scope, r.Name) {
244				e.Errorf("already active range %q on %v begun again", r.Name, r.Scope)
245			}
246			v.addRange(r.Scope, r.Name)
247		case trace.EventRangeActive:
248			if !v.hasRange(r.Scope, r.Name) {
249				v.addRange(r.Scope, r.Name)
250			}
251		case trace.EventRangeEnd:
252			if !v.hasRange(r.Scope, r.Name) {
253				e.Errorf("inactive range %q on %v ended", r.Name, r.Scope)
254			}
255			v.deleteRange(r.Scope, r.Name)
256		}
257	case trace.EventTaskBegin:
258		// Validate task begin.
259		t := ev.Task()
260		if t.ID == trace.NoTask || t.ID == trace.BackgroundTask {
261			// The background task should never have an event emitted for it.
262			e.Errorf("found invalid task ID for task of type %s", t.Type)
263		}
264		if t.Parent == trace.BackgroundTask {
265			// It's not possible for a task to be a subtask of the background task.
266			e.Errorf("found background task as the parent for task of type %s", t.Type)
267		}
268		// N.B. Don't check the task type. Empty string is a valid task type.
269		v.tasks[t.ID] = t.Type
270	case trace.EventTaskEnd:
271		// Validate task end.
272		// We can see a task end without a begin, so ignore a task without information.
273		// Instead, if we've seen the task begin, just make sure the task end lines up.
274		t := ev.Task()
275		if typ, ok := v.tasks[t.ID]; ok {
276			if t.Type != typ {
277				e.Errorf("task end type %q doesn't match task start type %q for task %d", t.Type, typ, t.ID)
278			}
279			delete(v.tasks, t.ID)
280		}
281	case trace.EventLog:
282		// There's really not much here to check, except that we can
283		// generate a Log. The category and message are entirely user-created,
284		// so we can't make any assumptions as to what they are. We also
285		// can't validate the task, because proving the task's existence is very
286		// much best-effort.
287		_ = ev.Log()
288	}
289	return e.Errors()
290}
291
292func (v *Validator) hasRange(r trace.ResourceID, name string) bool {
293	ranges, ok := v.ranges[r]
294	return ok && slices.Contains(ranges, name)
295}
296
297func (v *Validator) addRange(r trace.ResourceID, name string) {
298	ranges, _ := v.ranges[r]
299	ranges = append(ranges, name)
300	v.ranges[r] = ranges
301}
302
303func (v *Validator) hasAnyRange(r trace.ResourceID) bool {
304	ranges, ok := v.ranges[r]
305	return ok && len(ranges) != 0
306}
307
308func (v *Validator) deleteRange(r trace.ResourceID, name string) {
309	ranges, ok := v.ranges[r]
310	if !ok {
311		return
312	}
313	i := slices.Index(ranges, name)
314	if i < 0 {
315		return
316	}
317	v.ranges[r] = slices.Delete(ranges, i, i+1)
318}
319
320func (v *Validator) getOrCreateThread(e *errAccumulator, ev trace.Event, m trace.ThreadID) *schedContext {
321	lenient := func() bool {
322		// Be lenient about GoUndetermined -> GoSyscall transitions if they
323		// originate from an old trace. These transitions lack thread
324		// information in trace formats older than 1.22.
325		if !v.Go121 {
326			return false
327		}
328		if ev.Kind() != trace.EventStateTransition {
329			return false
330		}
331		tr := ev.StateTransition()
332		if tr.Resource.Kind != trace.ResourceGoroutine {
333			return false
334		}
335		from, to := tr.Goroutine()
336		return from == trace.GoUndetermined && to == trace.GoSyscall
337	}
338	if m == trace.NoThread && !lenient() {
339		e.Errorf("must have thread, but thread ID is none")
340		return nil
341	}
342	s, ok := v.ms[m]
343	if !ok {
344		s = &schedContext{M: m, P: trace.NoProc, G: trace.NoGoroutine}
345		v.ms[m] = s
346		return s
347	}
348	return s
349}
350
351func checkStack(e *errAccumulator, stk trace.Stack) {
352	// Check for non-empty values, but we also check for crashes due to incorrect validation.
353	i := 0
354	stk.Frames(func(f trace.StackFrame) bool {
355		if i == 0 {
356			// Allow for one fully zero stack.
357			//
358			// TODO(mknyszek): Investigate why that happens.
359			return true
360		}
361		if f.Func == "" || f.File == "" || f.PC == 0 || f.Line == 0 {
362			e.Errorf("invalid stack frame %#v: missing information", f)
363		}
364		i++
365		return true
366	})
367}
368
369type errAccumulator struct {
370	errs []error
371}
372
373func (e *errAccumulator) Errorf(f string, args ...any) {
374	e.errs = append(e.errs, fmt.Errorf(f, args...))
375}
376
377func (e *errAccumulator) Errors() error {
378	return errors.Join(e.errs...)
379}
380