1// Copyright 2023 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package testtrace 6 7import ( 8 "errors" 9 "fmt" 10 "internal/trace" 11 "slices" 12 "strings" 13) 14 15// Validator is a type used for validating a stream of trace.Events. 16type Validator struct { 17 lastTs trace.Time 18 gs map[trace.GoID]*goState 19 ps map[trace.ProcID]*procState 20 ms map[trace.ThreadID]*schedContext 21 ranges map[trace.ResourceID][]string 22 tasks map[trace.TaskID]string 23 seenSync bool 24 Go121 bool 25} 26 27type schedContext struct { 28 M trace.ThreadID 29 P trace.ProcID 30 G trace.GoID 31} 32 33type goState struct { 34 state trace.GoState 35 binding *schedContext 36} 37 38type procState struct { 39 state trace.ProcState 40 binding *schedContext 41} 42 43// NewValidator creates a new Validator. 44func NewValidator() *Validator { 45 return &Validator{ 46 gs: make(map[trace.GoID]*goState), 47 ps: make(map[trace.ProcID]*procState), 48 ms: make(map[trace.ThreadID]*schedContext), 49 ranges: make(map[trace.ResourceID][]string), 50 tasks: make(map[trace.TaskID]string), 51 } 52} 53 54// Event validates ev as the next event in a stream of trace.Events. 55// 56// Returns an error if validation fails. 57func (v *Validator) Event(ev trace.Event) error { 58 e := new(errAccumulator) 59 60 // Validate timestamp order. 61 if v.lastTs != 0 { 62 if ev.Time() <= v.lastTs { 63 e.Errorf("timestamp out-of-order for %+v", ev) 64 } else { 65 v.lastTs = ev.Time() 66 } 67 } else { 68 v.lastTs = ev.Time() 69 } 70 71 // Validate event stack. 72 checkStack(e, ev.Stack()) 73 74 switch ev.Kind() { 75 case trace.EventSync: 76 // Just record that we've seen a Sync at some point. 77 v.seenSync = true 78 case trace.EventMetric: 79 m := ev.Metric() 80 if !strings.Contains(m.Name, ":") { 81 // Should have a ":" as per runtime/metrics convention. 82 e.Errorf("invalid metric name %q", m.Name) 83 } 84 // Make sure the value is OK. 85 if m.Value.Kind() == trace.ValueBad { 86 e.Errorf("invalid value") 87 } 88 switch m.Value.Kind() { 89 case trace.ValueUint64: 90 // Just make sure it doesn't panic. 91 _ = m.Value.Uint64() 92 } 93 case trace.EventLabel: 94 l := ev.Label() 95 96 // Check label. 97 if l.Label == "" { 98 e.Errorf("invalid label %q", l.Label) 99 } 100 101 // Check label resource. 102 if l.Resource.Kind == trace.ResourceNone { 103 e.Errorf("label resource none") 104 } 105 switch l.Resource.Kind { 106 case trace.ResourceGoroutine: 107 id := l.Resource.Goroutine() 108 if _, ok := v.gs[id]; !ok { 109 e.Errorf("label for invalid goroutine %d", id) 110 } 111 case trace.ResourceProc: 112 id := l.Resource.Proc() 113 if _, ok := v.ps[id]; !ok { 114 e.Errorf("label for invalid proc %d", id) 115 } 116 case trace.ResourceThread: 117 id := l.Resource.Thread() 118 if _, ok := v.ms[id]; !ok { 119 e.Errorf("label for invalid thread %d", id) 120 } 121 } 122 case trace.EventStackSample: 123 // Not much to check here. It's basically a sched context and a stack. 124 // The sched context is also not guaranteed to align with other events. 125 // We already checked the stack above. 126 case trace.EventStateTransition: 127 // Validate state transitions. 128 // 129 // TODO(mknyszek): A lot of logic is duplicated between goroutines and procs. 130 // The two are intentionally handled identically; from the perspective of the 131 // API, resources all have the same general properties. Consider making this 132 // code generic over resources and implementing validation just once. 133 tr := ev.StateTransition() 134 checkStack(e, tr.Stack) 135 switch tr.Resource.Kind { 136 case trace.ResourceGoroutine: 137 // Basic state transition validation. 138 id := tr.Resource.Goroutine() 139 old, new := tr.Goroutine() 140 if new == trace.GoUndetermined { 141 e.Errorf("transition to undetermined state for goroutine %d", id) 142 } 143 if v.seenSync && old == trace.GoUndetermined { 144 e.Errorf("undetermined goroutine %d after first global sync", id) 145 } 146 if new == trace.GoNotExist && v.hasAnyRange(trace.MakeResourceID(id)) { 147 e.Errorf("goroutine %d died with active ranges", id) 148 } 149 state, ok := v.gs[id] 150 if ok { 151 if old != state.state { 152 e.Errorf("bad old state for goroutine %d: got %s, want %s", id, old, state.state) 153 } 154 state.state = new 155 } else { 156 if old != trace.GoUndetermined && old != trace.GoNotExist { 157 e.Errorf("bad old state for unregistered goroutine %d: %s", id, old) 158 } 159 state = &goState{state: new} 160 v.gs[id] = state 161 } 162 // Validate sched context. 163 if new.Executing() { 164 ctx := v.getOrCreateThread(e, ev, ev.Thread()) 165 if ctx != nil { 166 if ctx.G != trace.NoGoroutine && ctx.G != id { 167 e.Errorf("tried to run goroutine %d when one was already executing (%d) on thread %d", id, ctx.G, ev.Thread()) 168 } 169 ctx.G = id 170 state.binding = ctx 171 } 172 } else if old.Executing() && !new.Executing() { 173 if tr.Stack != ev.Stack() { 174 // This is a case where the transition is happening to a goroutine that is also executing, so 175 // these two stacks should always match. 176 e.Errorf("StateTransition.Stack doesn't match Event.Stack") 177 } 178 ctx := state.binding 179 if ctx != nil { 180 if ctx.G != id { 181 e.Errorf("tried to stop goroutine %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.G, ev.Thread()) 182 } 183 ctx.G = trace.NoGoroutine 184 state.binding = nil 185 } else { 186 e.Errorf("stopping goroutine %d not bound to any active context", id) 187 } 188 } 189 case trace.ResourceProc: 190 // Basic state transition validation. 191 id := tr.Resource.Proc() 192 old, new := tr.Proc() 193 if new == trace.ProcUndetermined { 194 e.Errorf("transition to undetermined state for proc %d", id) 195 } 196 if v.seenSync && old == trace.ProcUndetermined { 197 e.Errorf("undetermined proc %d after first global sync", id) 198 } 199 if new == trace.ProcNotExist && v.hasAnyRange(trace.MakeResourceID(id)) { 200 e.Errorf("proc %d died with active ranges", id) 201 } 202 state, ok := v.ps[id] 203 if ok { 204 if old != state.state { 205 e.Errorf("bad old state for proc %d: got %s, want %s", id, old, state.state) 206 } 207 state.state = new 208 } else { 209 if old != trace.ProcUndetermined && old != trace.ProcNotExist { 210 e.Errorf("bad old state for unregistered proc %d: %s", id, old) 211 } 212 state = &procState{state: new} 213 v.ps[id] = state 214 } 215 // Validate sched context. 216 if new.Executing() { 217 ctx := v.getOrCreateThread(e, ev, ev.Thread()) 218 if ctx != nil { 219 if ctx.P != trace.NoProc && ctx.P != id { 220 e.Errorf("tried to run proc %d when one was already executing (%d) on thread %d", id, ctx.P, ev.Thread()) 221 } 222 ctx.P = id 223 state.binding = ctx 224 } 225 } else if old.Executing() && !new.Executing() { 226 ctx := state.binding 227 if ctx != nil { 228 if ctx.P != id { 229 e.Errorf("tried to stop proc %d when it wasn't currently executing (currently executing %d) on thread %d", id, ctx.P, ctx.M) 230 } 231 ctx.P = trace.NoProc 232 state.binding = nil 233 } else { 234 e.Errorf("stopping proc %d not bound to any active context", id) 235 } 236 } 237 } 238 case trace.EventRangeBegin, trace.EventRangeActive, trace.EventRangeEnd: 239 // Validate ranges. 240 r := ev.Range() 241 switch ev.Kind() { 242 case trace.EventRangeBegin: 243 if v.hasRange(r.Scope, r.Name) { 244 e.Errorf("already active range %q on %v begun again", r.Name, r.Scope) 245 } 246 v.addRange(r.Scope, r.Name) 247 case trace.EventRangeActive: 248 if !v.hasRange(r.Scope, r.Name) { 249 v.addRange(r.Scope, r.Name) 250 } 251 case trace.EventRangeEnd: 252 if !v.hasRange(r.Scope, r.Name) { 253 e.Errorf("inactive range %q on %v ended", r.Name, r.Scope) 254 } 255 v.deleteRange(r.Scope, r.Name) 256 } 257 case trace.EventTaskBegin: 258 // Validate task begin. 259 t := ev.Task() 260 if t.ID == trace.NoTask || t.ID == trace.BackgroundTask { 261 // The background task should never have an event emitted for it. 262 e.Errorf("found invalid task ID for task of type %s", t.Type) 263 } 264 if t.Parent == trace.BackgroundTask { 265 // It's not possible for a task to be a subtask of the background task. 266 e.Errorf("found background task as the parent for task of type %s", t.Type) 267 } 268 // N.B. Don't check the task type. Empty string is a valid task type. 269 v.tasks[t.ID] = t.Type 270 case trace.EventTaskEnd: 271 // Validate task end. 272 // We can see a task end without a begin, so ignore a task without information. 273 // Instead, if we've seen the task begin, just make sure the task end lines up. 274 t := ev.Task() 275 if typ, ok := v.tasks[t.ID]; ok { 276 if t.Type != typ { 277 e.Errorf("task end type %q doesn't match task start type %q for task %d", t.Type, typ, t.ID) 278 } 279 delete(v.tasks, t.ID) 280 } 281 case trace.EventLog: 282 // There's really not much here to check, except that we can 283 // generate a Log. The category and message are entirely user-created, 284 // so we can't make any assumptions as to what they are. We also 285 // can't validate the task, because proving the task's existence is very 286 // much best-effort. 287 _ = ev.Log() 288 } 289 return e.Errors() 290} 291 292func (v *Validator) hasRange(r trace.ResourceID, name string) bool { 293 ranges, ok := v.ranges[r] 294 return ok && slices.Contains(ranges, name) 295} 296 297func (v *Validator) addRange(r trace.ResourceID, name string) { 298 ranges, _ := v.ranges[r] 299 ranges = append(ranges, name) 300 v.ranges[r] = ranges 301} 302 303func (v *Validator) hasAnyRange(r trace.ResourceID) bool { 304 ranges, ok := v.ranges[r] 305 return ok && len(ranges) != 0 306} 307 308func (v *Validator) deleteRange(r trace.ResourceID, name string) { 309 ranges, ok := v.ranges[r] 310 if !ok { 311 return 312 } 313 i := slices.Index(ranges, name) 314 if i < 0 { 315 return 316 } 317 v.ranges[r] = slices.Delete(ranges, i, i+1) 318} 319 320func (v *Validator) getOrCreateThread(e *errAccumulator, ev trace.Event, m trace.ThreadID) *schedContext { 321 lenient := func() bool { 322 // Be lenient about GoUndetermined -> GoSyscall transitions if they 323 // originate from an old trace. These transitions lack thread 324 // information in trace formats older than 1.22. 325 if !v.Go121 { 326 return false 327 } 328 if ev.Kind() != trace.EventStateTransition { 329 return false 330 } 331 tr := ev.StateTransition() 332 if tr.Resource.Kind != trace.ResourceGoroutine { 333 return false 334 } 335 from, to := tr.Goroutine() 336 return from == trace.GoUndetermined && to == trace.GoSyscall 337 } 338 if m == trace.NoThread && !lenient() { 339 e.Errorf("must have thread, but thread ID is none") 340 return nil 341 } 342 s, ok := v.ms[m] 343 if !ok { 344 s = &schedContext{M: m, P: trace.NoProc, G: trace.NoGoroutine} 345 v.ms[m] = s 346 return s 347 } 348 return s 349} 350 351func checkStack(e *errAccumulator, stk trace.Stack) { 352 // Check for non-empty values, but we also check for crashes due to incorrect validation. 353 i := 0 354 stk.Frames(func(f trace.StackFrame) bool { 355 if i == 0 { 356 // Allow for one fully zero stack. 357 // 358 // TODO(mknyszek): Investigate why that happens. 359 return true 360 } 361 if f.Func == "" || f.File == "" || f.PC == 0 || f.Line == 0 { 362 e.Errorf("invalid stack frame %#v: missing information", f) 363 } 364 i++ 365 return true 366 }) 367} 368 369type errAccumulator struct { 370 errs []error 371} 372 373func (e *errAccumulator) Errorf(f string, args ...any) { 374 e.errs = append(e.errs, fmt.Errorf(f, args...)) 375} 376 377func (e *errAccumulator) Errors() error { 378 return errors.Join(e.errs...) 379} 380