// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import (
	"internal/runtime/atomic"
	"unsafe"
)

// A profBuf is a lock-free buffer for profiling events,
// safe for concurrent use by one reader and one writer.
// The writer may be a signal handler running without a user g.
// The reader is assumed to be a user g.
//
// Each logged event corresponds to a fixed size header, a list of
// uintptrs (typically a stack), and exactly one unsafe.Pointer tag.
// The header and uintptrs are stored in the circular buffer data and the
// tag is stored in a circular buffer tags, running in parallel.
// In the circular buffer data, each event takes 2+hdrsize+len(stk)
// words: the value 2+hdrsize+len(stk), then the time of the event, then
// hdrsize words giving the fixed-size header, and then len(stk) words
// for the stack.
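//
// For example (an illustrative sketch, assuming hdrsize=2 and a three-frame
// stack), a single event occupies 2+2+3 = 7 words of data,
//
//	[7, now, hdr0, hdr1, pc0, pc1, pc2]
//
// plus one slot in tags holding the event's tag pointer.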
//
// The current effective offsets into the tags and data circular buffers
// for reading and writing are stored in the high 30 and low 32 bits of r and w.
// The bottom bits of the high 32 are additional flag bits in w, unused in r.
// "Effective" offsets means the total number of reads or writes, mod 2^length.
// The offset in the buffer is the effective offset mod the length of the buffer.
// To make wraparound mod 2^length match wraparound mod length of the buffer,
// the length of the buffer must be a power of two.
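//
// For example (illustrative), a w recording 5 data words written, 1 tag
// written, and no flag bits set is the value 1<<34 | 5.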
//
// If the reader catches up to the writer, a flag passed to read controls
// whether the read blocks until more data is available. A read returns a
// pointer to the buffer data itself; the caller is assumed to be done with
// that data at the next read. The read offset rNext tracks the next offset to
// be returned by read. By definition, r ≤ rNext ≤ w (before wraparound),
// and rNext is only used by the reader, so it can be accessed without atomics.
//
// If the writer gets ahead of the reader, so that the buffer fills,
// future writes are discarded and replaced in the output stream by an
// overflow entry, which has size 2+hdrsize+1, time set to the time of
// the first discarded write, a header of all zeroed words, and a "stack"
// containing one word, the number of discarded writes.
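//
// For example (illustrative, assuming hdrsize=2), if 7 writes were discarded
// starting at time t, the overflow entry eventually emitted is the 5-word
// record
//
//	[5, t, 0, 0, 7]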
//
// Between the time the buffer fills and the buffer becomes empty enough
// to hold more data, the overflow entry is stored as a pending overflow
// entry in the fields overflow and overflowTime. The pending overflow
// entry can be turned into a real record by either the writer or the
// reader. If the writer is called to write a new record and finds that
// the output buffer has room for both the pending overflow entry and the
// new record, the writer emits the pending overflow entry and the new
// record into the buffer. If the reader is called to read data and finds
// that the output buffer is empty but that there is a pending overflow
// entry, the reader will return a synthesized record for the pending
// overflow entry.
//
// Only the writer can create or add to a pending overflow entry, but
// either the reader or the writer can clear the pending overflow entry.
// A pending overflow entry is indicated by the low 32 bits of 'overflow'
// holding the number of discarded writes, and overflowTime holding the
// time of the first discarded write. The high 32 bits of 'overflow'
// increment each time the low 32 bits transition from zero to non-zero
// or vice versa. This sequence number avoids ABA problems in the use of
// compare-and-swap to coordinate between reader and writer.
// The overflowTime is only written when the low 32 bits of overflow are
// zero, that is, only when there is no pending overflow entry, in
// preparation for creating a new one. The reader can therefore fetch and
// clear the entry atomically using
//
//	for {
//		overflow = load(&b.overflow)
//		if uint32(overflow) == 0 {
//			// no pending entry
//			break
//		}
//		time = load(&b.overflowTime)
//		if cas(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
//			// pending entry cleared
//			break
//		}
//	}
//	if uint32(overflow) > 0 {
//		emit entry for uint32(overflow), time
//	}
type profBuf struct {
	// accessed atomically
	r, w         profAtomic
	overflow     atomic.Uint64
	overflowTime atomic.Uint64
	eof          atomic.Uint32

	// immutable (excluding slice content)
	hdrsize uintptr
	data    []uint64
	tags    []unsafe.Pointer

	// owned by reader
	rNext       profIndex
	overflowBuf []uint64 // for use by reader to return overflow record
	wait        note
}

// A profAtomic is the atomically-accessed word holding a profIndex.
type profAtomic uint64

// A profIndex is the packing of the tag and data counts and flag bits, described above.
type profIndex uint64

const (
	profReaderSleeping profIndex = 1 << 32 // reader is sleeping and must be woken up
	profWriteExtra     profIndex = 1 << 33 // overflow or eof waiting
)

func (x *profAtomic) load() profIndex {
	return profIndex(atomic.Load64((*uint64)(x)))
}

func (x *profAtomic) store(new profIndex) {
	atomic.Store64((*uint64)(x), uint64(new))
}

func (x *profAtomic) cas(old, new profIndex) bool {
	return atomic.Cas64((*uint64)(x), uint64(old), uint64(new))
}

func (x profIndex) dataCount() uint32 {
	return uint32(x)
}

func (x profIndex) tagCount() uint32 {
	return uint32(x >> 34)
}

// countSub subtracts two counts obtained from profIndex.dataCount or profIndex.tagCount,
// assuming that they are no more than 2^29 apart (guaranteed since they are never more than
// len(data) or len(tags) apart, respectively).
// tagCount wraps at 2^30, while dataCount wraps at 2^32.
// This function works for both.
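//
// For example (illustrative): countSub(3, 0xfffffffe) == 5, because the
// uint32 subtraction wraps around; likewise, for tag counts,
// countSub(1, 1<<30 - 1) == 2 after the 30-bit sign extension below.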
func countSub(x, y uint32) int {
	// x-y is 32-bit signed or 30-bit signed; sign-extend to 32 bits and convert to int.
	return int(int32(x-y) << 2 >> 2)
}

// addCountsAndClearFlags returns the packed form of "x + (data, tag) - all flags".
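//
// For example (illustrative), starting from an index with dataCount=5,
// tagCount=1, and profWriteExtra set, addCountsAndClearFlags(7, 1) yields
// dataCount=12, tagCount=2, and no flag bits.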
func (x profIndex) addCountsAndClearFlags(data, tag int) profIndex {
	return profIndex((uint64(x)>>34+uint64(uint32(tag)<<2>>2))<<34 | uint64(uint32(x)+uint32(data)))
}

// hasOverflow reports whether b has any overflow records pending.
func (b *profBuf) hasOverflow() bool {
	return uint32(b.overflow.Load()) > 0
}

// takeOverflow consumes the pending overflow records, returning the overflow count
// and the time of the first overflow.
// When called by the reader, it is racing against incrementOverflow.
func (b *profBuf) takeOverflow() (count uint32, time uint64) {
	overflow := b.overflow.Load()
	time = b.overflowTime.Load()
	for {
		count = uint32(overflow)
		if count == 0 {
			time = 0
			break
		}
		// Increment generation, clear overflow count in low bits.
		if b.overflow.CompareAndSwap(overflow, ((overflow>>32)+1)<<32) {
			break
		}
		overflow = b.overflow.Load()
		time = b.overflowTime.Load()
	}
	return uint32(overflow), time
}

// incrementOverflow records a single overflow at time now.
// It is racing against a possible takeOverflow in the reader.
func (b *profBuf) incrementOverflow(now int64) {
	for {
		overflow := b.overflow.Load()

		// Once we see b.overflow reach 0, it's stable: no one else is changing it underfoot.
		// We need to set overflowTime if we're incrementing b.overflow from 0.
		if uint32(overflow) == 0 {
			// Store overflowTime first so it's always available when overflow != 0.
			b.overflowTime.Store(uint64(now))
			b.overflow.Store((((overflow >> 32) + 1) << 32) + 1)
			break
		}
		// Otherwise we're racing to increment against reader
		// who wants to set b.overflow to 0.
		// Out of paranoia, leave 2³²-1 a sticky overflow value,
		// to avoid wrapping around. Extremely unlikely.
		if int32(overflow) == -1 {
			break
		}
		if b.overflow.CompareAndSwap(overflow, overflow+1) {
			break
		}
	}
}

// newProfBuf returns a new profiling buffer with room for
// a header of hdrsize words and a buffer of at least bufwords words.
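//
// For example (sizes illustrative, not taken from any real call site),
//
//	b := newProfBuf(1, 1<<17, 1<<14)
//
// allocates room for a 1-word header per record, a data ring of 1<<17 words,
// and a tag ring of 1<<14 entries; ring sizes are rounded up to powers of two
// if they are not already.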
func newProfBuf(hdrsize, bufwords, tags int) *profBuf {
	if min := 2 + hdrsize + 1; bufwords < min {
		bufwords = min
	}

	// Buffer sizes must be power of two, so that we don't have to
	// worry about uint32 wraparound changing the effective position
	// within the buffers. We store 30 bits of count; limiting to 28
	// gives us some room for intermediate calculations.
	if bufwords >= 1<<28 || tags >= 1<<28 {
		throw("newProfBuf: buffer too large")
	}
	var i int
	for i = 1; i < bufwords; i <<= 1 {
	}
	bufwords = i
	for i = 1; i < tags; i <<= 1 {
	}
	tags = i

	b := new(profBuf)
	b.hdrsize = uintptr(hdrsize)
	b.data = make([]uint64, bufwords)
	b.tags = make([]unsafe.Pointer, tags)
	b.overflowBuf = make([]uint64, 2+b.hdrsize+1)
	return b
}

// canWriteRecord reports whether the buffer has room
// for a single contiguous record with a stack of length nstk.
func (b *profBuf) canWriteRecord(nstk int) bool {
	br := b.r.load()
	bw := b.w.load()

	// room for tag?
	if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 1 {
		return false
	}

	// room for data?
	nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
	want := 2 + int(b.hdrsize) + nstk
	i := int(bw.dataCount() % uint32(len(b.data)))
	if i+want > len(b.data) {
		// Can't fit in trailing fragment of slice.
		// Skip over that and start over at beginning of slice.
		nd -= len(b.data) - i
	}
	return nd >= want
}

// canWriteTwoRecords reports whether the buffer has room
// for two records with stack lengths nstk1, nstk2, in that order.
// Each record must be contiguous on its own, but the two
// records need not be contiguous (one can be at the end of the buffer
// and the other can wrap around and start at the beginning of the buffer).
func (b *profBuf) canWriteTwoRecords(nstk1, nstk2 int) bool {
	br := b.r.load()
	bw := b.w.load()

	// room for tag?
	if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 2 {
		return false
	}

	// room for data?
	nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)

	// first record
	want := 2 + int(b.hdrsize) + nstk1
	i := int(bw.dataCount() % uint32(len(b.data)))
	if i+want > len(b.data) {
		// Can't fit in trailing fragment of slice.
		// Skip over that and start over at beginning of slice.
		nd -= len(b.data) - i
		i = 0
	}
	i += want
	nd -= want

	// second record
	want = 2 + int(b.hdrsize) + nstk2
	if i+want > len(b.data) {
		// Can't fit in trailing fragment of slice.
		// Skip over that and start over at beginning of slice.
		nd -= len(b.data) - i
		i = 0
	}
	return nd >= want
}

// write writes an entry to the profiling buffer b.
// The entry begins with a fixed hdr, which must have
// length b.hdrsize, followed by a variable-sized stack
// and a single tag pointer *tagPtr (or nil if tagPtr is nil).
// No write barriers allowed because this might be called from a signal handler.
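//
// A sketch of a call (illustrative; period, labels, and stk are placeholders
// for values a caller would supply, with labels an unsafe.Pointer):
//
//	hdr := [1]uint64{uint64(period)}
//	b.write(&labels, nanotime(), hdr[:], stk)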
func (b *profBuf) write(tagPtr *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
	if b == nil {
		return
	}
	if len(hdr) > int(b.hdrsize) {
		throw("misuse of profBuf.write")
	}

	if hasOverflow := b.hasOverflow(); hasOverflow && b.canWriteTwoRecords(1, len(stk)) {
		// Room for both an overflow record and the one being written.
		// Write the overflow record if the reader hasn't gotten to it yet.
		// Only racing against reader, not other writers.
		count, time := b.takeOverflow()
		if count > 0 {
			var stk [1]uintptr
			stk[0] = uintptr(count)
			b.write(nil, int64(time), nil, stk[:])
		}
	} else if hasOverflow || !b.canWriteRecord(len(stk)) {
		// Pending overflow without room to write overflow and new records
		// or no overflow but also no room for new record.
		b.incrementOverflow(now)
		b.wakeupExtra()
		return
	}

	// There's room: write the record.
	br := b.r.load()
	bw := b.w.load()

	// Profiling tag
	//
	// The tag is a pointer, but we can't run a write barrier here.
	// We have interrupted the OS-level execution of gp, but the
	// runtime still sees gp as executing. In effect, we are running
	// in place of the real gp. Since gp is the only goroutine that
	// can overwrite gp.labels, the value of gp.labels is stable during
	// this signal handler: it will still be reachable from gp when
	// we finish executing. If a GC is in progress right now, it must
	// keep gp.labels alive, because gp.labels is reachable from gp.
	// If gp were to overwrite gp.labels, the deletion barrier would
	// still shade that pointer, which would preserve it for the
	// in-progress GC, so all is well. Any future GC will see the
	// value we copied when scanning b.tags (heap-allocated).
	// We arrange that the store here is always overwriting a nil,
	// so there is no need for a deletion barrier on b.tags[wt].
	wt := int(bw.tagCount() % uint32(len(b.tags)))
	if tagPtr != nil {
		*(*uintptr)(unsafe.Pointer(&b.tags[wt])) = uintptr(*tagPtr)
	}

	// Main record.
	// It has to fit in a contiguous section of the slice, so if it doesn't fit at the end,
	// leave a rewind marker (0) and start over at the beginning of the slice.
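	// For example (illustrative), with len(b.data)=16, wd=14, and a 5-word
	// record, b.data[14] is set to 0 and the record is written at b.data[0:5];
	// the reader treats the zero length word as "skip to the start of the ring".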
	wd := int(bw.dataCount() % uint32(len(b.data)))
	nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
	skip := 0
	if wd+2+int(b.hdrsize)+len(stk) > len(b.data) {
		b.data[wd] = 0
		skip = len(b.data) - wd
		nd -= skip
		wd = 0
	}
	data := b.data[wd:]
	data[0] = uint64(2 + b.hdrsize + uintptr(len(stk))) // length
	data[1] = uint64(now)                               // time stamp
	// header, zero-padded
	i := copy(data[2:2+b.hdrsize], hdr)
	clear(data[2+i : 2+b.hdrsize])
	for i, pc := range stk {
		data[2+b.hdrsize+uintptr(i)] = uint64(pc)
	}

	for {
		// Commit write.
		// Racing with reader setting flag bits in b.w, to avoid lost wakeups.
		old := b.w.load()
		new := old.addCountsAndClearFlags(skip+2+len(stk)+int(b.hdrsize), 1)
		if !b.w.cas(old, new) {
			continue
		}
		// If there was a reader, wake it up.
		if old&profReaderSleeping != 0 {
			notewakeup(&b.wait)
		}
		break
	}
}

// close signals that there will be no more writes on the buffer.
// Once all the data has been read from the buffer, reads will return eof=true.
func (b *profBuf) close() {
	if b.eof.Load() > 0 {
		throw("runtime: profBuf already closed")
	}
	b.eof.Store(1)
	b.wakeupExtra()
}

// wakeupExtra must be called after setting one of the "extra"
// atomic fields b.overflow or b.eof.
// It records the change in b.w and wakes up the reader if needed.
func (b *profBuf) wakeupExtra() {
	for {
		old := b.w.load()
		new := old | profWriteExtra
		if !b.w.cas(old, new) {
			continue
		}
		if old&profReaderSleeping != 0 {
			notewakeup(&b.wait)
		}
		break
	}
}

// profBufReadMode specifies whether to block when no data is available to read.
type profBufReadMode int

const (
	profBufBlocking profBufReadMode = iota
	profBufNonBlocking
)

var overflowTag [1]unsafe.Pointer // always nil

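// read returns the data and tags written to the buffer since the previous
// call to read, advancing past the slices returned last time; the returned
// slices are only valid until the next call. In blocking mode it sleeps until
// data is available or the buffer is closed; eof is reported once the buffer
// is closed and fully drained.
//
// A sketch of a reader loop (illustrative only):
//
//	for {
//		data, tags, eof := b.read(profBufBlocking)
//		if eof {
//			break
//		}
//		// each record starts with its length in words,
//		// so records can be walked in place
//	}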
func (b *profBuf) read(mode profBufReadMode) (data []uint64, tags []unsafe.Pointer, eof bool) {
	if b == nil {
		return nil, nil, true
	}

	br := b.rNext

	// Commit previous read, returning that part of the ring to the writer.
	// First clear tags that have now been read, both to avoid holding
	// up the memory they point at for longer than necessary
	// and so that b.write can assume it is always overwriting
	// nil tag entries (see comment in b.write).
	rPrev := b.r.load()
	if rPrev != br {
		ntag := countSub(br.tagCount(), rPrev.tagCount())
		ti := int(rPrev.tagCount() % uint32(len(b.tags)))
		for i := 0; i < ntag; i++ {
			b.tags[ti] = nil
			if ti++; ti == len(b.tags) {
				ti = 0
			}
		}
		b.r.store(br)
	}

Read:
	bw := b.w.load()
	numData := countSub(bw.dataCount(), br.dataCount())
	if numData == 0 {
		if b.hasOverflow() {
			// No data to read, but there is overflow to report.
			// Racing with writer flushing b.overflow into a real record.
			count, time := b.takeOverflow()
			if count == 0 {
				// Lost the race, go around again.
				goto Read
			}
			// Won the race, report overflow.
			dst := b.overflowBuf
			dst[0] = uint64(2 + b.hdrsize + 1)
			dst[1] = time
			clear(dst[2 : 2+b.hdrsize])
			dst[2+b.hdrsize] = uint64(count)
			return dst[:2+b.hdrsize+1], overflowTag[:1], false
		}
		if b.eof.Load() > 0 {
			// No data, no overflow, EOF set: done.
			return nil, nil, true
		}
		if bw&profWriteExtra != 0 {
			// Writer claims to have published extra information (overflow or eof).
			// Attempt to clear notification and then check again.
			// If we fail to clear the notification it means b.w changed,
			// so we still need to check again.
			b.w.cas(bw, bw&^profWriteExtra)
			goto Read
		}

		// Nothing to read right now.
		// Return or sleep according to mode.
		if mode == profBufNonBlocking {
			// Necessary on Darwin, notetsleepg below does not work in signal handler, root cause of #61768.
			return nil, nil, false
		}
		if !b.w.cas(bw, bw|profReaderSleeping) {
			goto Read
		}
		// Committed to sleeping.
		notetsleepg(&b.wait, -1)
		noteclear(&b.wait)
		goto Read
	}
	data = b.data[br.dataCount()%uint32(len(b.data)):]
	if len(data) > numData {
		data = data[:numData]
	} else {
		numData -= len(data) // available in case of wraparound
	}
	skip := 0
	if data[0] == 0 {
		// Wraparound record. Go back to the beginning of the ring.
		skip = len(data)
		data = b.data
		if len(data) > numData {
			data = data[:numData]
		}
	}

	ntag := countSub(bw.tagCount(), br.tagCount())
	if ntag == 0 {
		throw("runtime: malformed profBuf buffer - tag and data out of sync")
	}
	tags = b.tags[br.tagCount()%uint32(len(b.tags)):]
	if len(tags) > ntag {
		tags = tags[:ntag]
	}

	// Count out whole data records until either data or tags is done.
	// They are always in sync in the buffer, but due to an end-of-slice
	// wraparound we might need to stop early and return the rest
	// in the next call.
	di := 0
	ti := 0
	for di < len(data) && data[di] != 0 && ti < len(tags) {
		if uintptr(di)+uintptr(data[di]) > uintptr(len(data)) {
			throw("runtime: malformed profBuf buffer - invalid size")
		}
		di += int(data[di])
		ti++
	}

	// Remember how much we returned, to commit read on next call.
	b.rNext = br.addCountsAndClearFlags(skip+di, ti)

	if raceenabled {
		// Match racereleasemerge in runtime_setProfLabel,
		// so that the setting of the labels in runtime_setProfLabel
		// is treated as happening before any use of the labels
		// by our caller. The synchronization on labelSync itself is a fiction
		// for the race detector. The actual synchronization is handled
		// by the fact that the signal handler only reads from the current
		// goroutine and uses atomics to write the updated queue indices,
		// and then the read-out from the signal handler buffer uses
		// atomics to read those queue indices.
		raceacquire(unsafe.Pointer(&labelSync))
	}

	return data[:di], tags[:ti], false
}