xref: /aosp_15_r20/external/libcap/cap/cap.go (revision 2810ac1b38eead2603277920c78344c84ddf3aff)
1*2810ac1bSKiyoung Kim// Package cap provides all the Linux Capabilities userspace library API
2*2810ac1bSKiyoung Kim// bindings in native Go.
3*2810ac1bSKiyoung Kim//
4*2810ac1bSKiyoung Kim// Capabilities are a feature of the Linux kernel that allow fine
5*2810ac1bSKiyoung Kim// grain permissions to perform privileged operations. Privileged
6*2810ac1bSKiyoung Kim// operations are required to do irregular system level operations
7*2810ac1bSKiyoung Kim// from code. You can read more about how Capabilities are intended to
8*2810ac1bSKiyoung Kim// work here:
9*2810ac1bSKiyoung Kim//
10*2810ac1bSKiyoung Kim//   https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
11*2810ac1bSKiyoung Kim//
12*2810ac1bSKiyoung Kim// This package supports native Go bindings for all the features
13*2810ac1bSKiyoung Kim// described in that paper as well as supporting subsequent changes to
14*2810ac1bSKiyoung Kim// the kernel for other styles of inheritable Capability.
15*2810ac1bSKiyoung Kim//
16*2810ac1bSKiyoung Kim// Some simple things you can do with this package are:
17*2810ac1bSKiyoung Kim//
18*2810ac1bSKiyoung Kim//   // Read and display the capabilities of the running process
19*2810ac1bSKiyoung Kim//   c := cap.GetProc()
//   log.Printf("this process has these caps: %q", c)
21*2810ac1bSKiyoung Kim//
22*2810ac1bSKiyoung Kim//   // Drop any privilege a process might have (including for root,
23*2810ac1bSKiyoung Kim//   // but note root 'owns' a lot of system files so a cap-limited
24*2810ac1bSKiyoung Kim//   // root can still do considerable damage to a running system).
25*2810ac1bSKiyoung Kim//   old := cap.GetProc()
26*2810ac1bSKiyoung Kim//   empty := cap.NewSet()
27*2810ac1bSKiyoung Kim//   if err := empty.SetProc(); err != nil {
28*2810ac1bSKiyoung Kim//       log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
29*2810ac1bSKiyoung Kim//   }
30*2810ac1bSKiyoung Kim//   now := cap.GetProc()
31*2810ac1bSKiyoung Kim//   if cf, _ := now.Cf(empty); cf != 0 {
32*2810ac1bSKiyoung Kim//       log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
33*2810ac1bSKiyoung Kim//   }
34*2810ac1bSKiyoung Kim//
35*2810ac1bSKiyoung Kim// The "cap" package operates with POSIX semantics for security
36*2810ac1bSKiyoung Kim// state. That is all OS threads are kept in sync at all times. The
37*2810ac1bSKiyoung Kim// package "kernel.org/pub/linux/libs/security/libcap/psx" is used to
38*2810ac1bSKiyoung Kim// implement POSIX semantics system calls that manipulate thread state
39*2810ac1bSKiyoung Kim// uniformly over the whole Go (and any CGo linked) process runtime.
40*2810ac1bSKiyoung Kim//
41*2810ac1bSKiyoung Kim// Note, if the Go runtime syscall interface contains the Linux
42*2810ac1bSKiyoung Kim// variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see
43*2810ac1bSKiyoung Kim// https://github.com/golang/go/issues/1435 for its history) then the
44*2810ac1bSKiyoung Kim// "libcap/psx" package will use that to invoke Capability setting
45*2810ac1bSKiyoung Kim// system calls in pure Go binaries. With such an enhanced Go runtime,
46*2810ac1bSKiyoung Kim// to force this behavior, use the CGO_ENABLED=0 environment variable.
47*2810ac1bSKiyoung Kim//
48*2810ac1bSKiyoung Kim// POSIX semantics are more secure than trying to manage privilege at
49*2810ac1bSKiyoung Kim// a thread level when those threads share a common memory image as
50*2810ac1bSKiyoung Kim// they do under Linux: it is trivial to exploit a vulnerability in
// one thread of a process to cause execution on any other
52*2810ac1bSKiyoung Kim// thread. So, any imbalance in security state, in such cases will
53*2810ac1bSKiyoung Kim// readily create an opportunity for a privilege escalation
54*2810ac1bSKiyoung Kim// vulnerability.
55*2810ac1bSKiyoung Kim//
56*2810ac1bSKiyoung Kim// POSIX semantics also work well with Go, which deliberately tries to
57*2810ac1bSKiyoung Kim// insulate the user from worrying about the number of OS threads that
58*2810ac1bSKiyoung Kim// are actually running in their program. Indeed, Go can efficiently
59*2810ac1bSKiyoung Kim// launch and manage tens of thousands of concurrent goroutines
60*2810ac1bSKiyoung Kim// without bogging the program or wider system down. It does this by
61*2810ac1bSKiyoung Kim// aggressively migrating idle threads to make progress on unblocked
62*2810ac1bSKiyoung Kim// goroutines. So, inconsistent security state across OS threads can
63*2810ac1bSKiyoung Kim// also lead to program misbehavior.
64*2810ac1bSKiyoung Kim//
65*2810ac1bSKiyoung Kim// The only exception to this process-wide common security state is
66*2810ac1bSKiyoung Kim// the cap.Launcher related functionality. This briefly locks an OS
67*2810ac1bSKiyoung Kim// thread to a goroutine in order to launch another executable - the
68*2810ac1bSKiyoung Kim// robust implementation of this kind of support is quite subtle, so
69*2810ac1bSKiyoung Kim// please read its documentation carefully, if you find that you need
70*2810ac1bSKiyoung Kim// it.
71*2810ac1bSKiyoung Kim//
72*2810ac1bSKiyoung Kim// See https://sites.google.com/site/fullycapable/ for recent updates,
73*2810ac1bSKiyoung Kim// some more complete walk-through examples of ways of using
74*2810ac1bSKiyoung Kim// 'cap.Set's etc and information on how to file bugs.
75*2810ac1bSKiyoung Kim//
76*2810ac1bSKiyoung Kim// Copyright (c) 2019-21 Andrew G. Morgan <[email protected]>
77*2810ac1bSKiyoung Kim//
78*2810ac1bSKiyoung Kim// The cap and psx packages are licensed with a (you choose) BSD
79*2810ac1bSKiyoung Kim// 3-clause or GPL2. See LICENSE file for details.
80*2810ac1bSKiyoung Kimpackage cap // import "kernel.org/pub/linux/libs/security/libcap/cap"
81*2810ac1bSKiyoung Kim
82*2810ac1bSKiyoung Kimimport (
83*2810ac1bSKiyoung Kim	"errors"
84*2810ac1bSKiyoung Kim	"sort"
85*2810ac1bSKiyoung Kim	"sync"
86*2810ac1bSKiyoung Kim	"syscall"
87*2810ac1bSKiyoung Kim	"unsafe"
88*2810ac1bSKiyoung Kim)
89*2810ac1bSKiyoung Kim
// Value is the type of a single capability (or permission) bit.
// Within this file a Value is handed to the kernel as the capability
// number argument of the prctl based queries, see GetBound() and
// GetAmbient().
type Value uint

// Flag is the type of one of the three Value dimensions held in a
// Set.  It is also used in the (*IAB).Fill() method for changing the
// Bounding and Ambient Vectors.
type Flag uint

// Effective, Permitted, Inheritable are the three Flags of Values
// held in a Set. They are numbered consecutively from zero, which
// lets them index the per-Flag words of the data type and serve as
// the shift counts for the Diff bit masks.
const (
	Effective Flag = iota
	Permitted
	Inheritable
)
105*2810ac1bSKiyoung Kim
// Diff summarizes the result of the (*Set).Cf() function. It is a
// bit mask with one bit per Flag, positioned at that Flag's numeric
// value.
type Diff uint

// Per-Flag bit masks for a Diff. NOTE(review): presumably a bit is
// raised by (*Set).Cf() when the two Sets disagree in that Flag - Cf
// is not visible here, confirm against its implementation.
const (
	effectiveDiff   Diff = 1 << Effective
	permittedDiff   Diff = 1 << Permitted
	inheritableDiff Diff = 1 << Inheritable
)
114*2810ac1bSKiyoung Kim
115*2810ac1bSKiyoung Kim// String identifies a Flag value by its conventional "e", "p" or "i"
116*2810ac1bSKiyoung Kim// string abbreviation.
117*2810ac1bSKiyoung Kimfunc (f Flag) String() string {
118*2810ac1bSKiyoung Kim	switch f {
119*2810ac1bSKiyoung Kim	case Effective:
120*2810ac1bSKiyoung Kim		return "e"
121*2810ac1bSKiyoung Kim	case Permitted:
122*2810ac1bSKiyoung Kim		return "p"
123*2810ac1bSKiyoung Kim	case Inheritable:
124*2810ac1bSKiyoung Kim		return "i"
125*2810ac1bSKiyoung Kim	default:
126*2810ac1bSKiyoung Kim		return "<Error>"
127*2810ac1bSKiyoung Kim	}
128*2810ac1bSKiyoung Kim}
129*2810ac1bSKiyoung Kim
// data holds a 32-bit slice of the compressed bitmaps of capability
// sets as understood by the kernel. It is an array of one uint32 per
// Flag, indexed by Effective, Permitted and Inheritable. A Set built
// by NewSet() holds "words" of these.
type data [Inheritable + 1]uint32
133*2810ac1bSKiyoung Kim
// Set is an opaque capabilities container for a set of system
// capabilities. It holds individually addressable capability Value's
// for the three capability Flag's. See GetFlag() and SetFlag() for
// how to adjust them individually, and Clear() and ClearFlag() for
// how to do bulk operations.
//
// For admin tasks associated with managing namespace specific file
// capabilities, Set can also support a namespace-root-UID value which
// defaults to zero. See GetNSOwner() and SetNSOwner().
//
// A Set contains a lock, so it should be handled via *Set and not
// copied by value; use Dup() to obtain an independent copy.
type Set struct {
	// mu protects all other members of a Set.
	mu sync.RWMutex

	// flat holds Flag Value bitmaps for all capabilities
	// associated with this Set. An empty flat marks the Set as
	// invalid (see good()).
	flat []data

	// nsRoot is Linux specific: the namespace-root-UID value
	// described above. The zero value is the default.
	nsRoot int
}
154*2810ac1bSKiyoung Kim
// Various known kernel magic values. These are the ABI version
// numbers placed in header.magic; cInit() uses them to decide how
// many 32-bit words are needed per Flag.
const (
	kv1 = 0x19980330 // First iteration of process capabilities (32 bits).
	kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated.
	kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits).
)
161*2810ac1bSKiyoung Kim
var (
	// startUp protects setting of the following values: magic,
	// words, maxValues. They are all assigned by cInit(), which
	// is run through startUp.Do() by MaxBits() and NewSet().
	startUp sync.Once

	// magic holds the preferred magic number for the kernel ABI.
	magic uint32

	// words holds the number of uint32's associated with each
	// capability Flag for this session.
	words int

	// maxValues holds the number of bit values that are named by
	// the running kernel. This is generally expected to match
	// NamedCount, the build time value that cInit() falls back to
	// when no bit can be found at runtime.
	maxValues uint
)
179*2810ac1bSKiyoung Kim
// header is the structure passed, by pointer, as the first argument
// of the system calls issued by caprcall() and capwcall().
// NOTE(review): the layout presumably mirrors the kernel's capability
// header (ABI version followed by pid) - confirm against
// uapi/linux/capability.h.
type header struct {
	// magic is the capability ABI version, one of the kv*
	// values. cInit() reads this field back after its probe call.
	magic uint32
	// pid identifies the target process; GetPID() documents 0 as
	// an alias for the current process.
	pid int32
}
184*2810ac1bSKiyoung Kim
// syscaller is a type for abstracting syscalls. The r* variants are
// for reading state, and can be parallelized, the w* variants need to
// be serialized so all OS threads can share state.
//
// Each member takes the system call number (trap) followed by three
// or six arguments, and returns two result values plus an errno that
// is zero on success.
type syscaller struct {
	r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)             // read, 3 arguments
	w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)             // write, 3 arguments
	r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) // read, 6 arguments
	w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno) // write, 6 arguments
}
194*2810ac1bSKiyoung Kim
195*2810ac1bSKiyoung Kim// caprcall provides a pointer etc wrapper for the system calls
196*2810ac1bSKiyoung Kim// associated with getcap.
197*2810ac1bSKiyoung Kim//go:uintptrescapes
198*2810ac1bSKiyoung Kimfunc (sc *syscaller) caprcall(call uintptr, h *header, d []data) error {
199*2810ac1bSKiyoung Kim	x := uintptr(0)
200*2810ac1bSKiyoung Kim	if d != nil {
201*2810ac1bSKiyoung Kim		x = uintptr(unsafe.Pointer(&d[0]))
202*2810ac1bSKiyoung Kim	}
203*2810ac1bSKiyoung Kim	_, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0)
204*2810ac1bSKiyoung Kim	if err != 0 {
205*2810ac1bSKiyoung Kim		return err
206*2810ac1bSKiyoung Kim	}
207*2810ac1bSKiyoung Kim	return nil
208*2810ac1bSKiyoung Kim}
209*2810ac1bSKiyoung Kim
210*2810ac1bSKiyoung Kim// capwcall provides a pointer etc wrapper for the system calls
211*2810ac1bSKiyoung Kim// associated with setcap.
212*2810ac1bSKiyoung Kim//go:uintptrescapes
213*2810ac1bSKiyoung Kimfunc (sc *syscaller) capwcall(call uintptr, h *header, d []data) error {
214*2810ac1bSKiyoung Kim	x := uintptr(0)
215*2810ac1bSKiyoung Kim	if d != nil {
216*2810ac1bSKiyoung Kim		x = uintptr(unsafe.Pointer(&d[0]))
217*2810ac1bSKiyoung Kim	}
218*2810ac1bSKiyoung Kim	_, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0)
219*2810ac1bSKiyoung Kim	if err != 0 {
220*2810ac1bSKiyoung Kim		return err
221*2810ac1bSKiyoung Kim	}
222*2810ac1bSKiyoung Kim	return nil
223*2810ac1bSKiyoung Kim}
224*2810ac1bSKiyoung Kim
225*2810ac1bSKiyoung Kim// prctlrcall provides a wrapper for the prctl systemcalls that only
226*2810ac1bSKiyoung Kim// read kernel state. There is a limited number of arguments needed
227*2810ac1bSKiyoung Kim// and the caller should use 0 for those not needed.
228*2810ac1bSKiyoung Kimfunc (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) {
229*2810ac1bSKiyoung Kim	r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2)
230*2810ac1bSKiyoung Kim	if err != 0 {
231*2810ac1bSKiyoung Kim		return int(r), err
232*2810ac1bSKiyoung Kim	}
233*2810ac1bSKiyoung Kim	return int(r), nil
234*2810ac1bSKiyoung Kim}
235*2810ac1bSKiyoung Kim
236*2810ac1bSKiyoung Kim// prctlrcall6 provides a wrapper for the prctl systemcalls that only
237*2810ac1bSKiyoung Kim// read kernel state and require 6 arguments - ambient cap API, I'm
238*2810ac1bSKiyoung Kim// looking at you. There is a limited number of arguments needed and
239*2810ac1bSKiyoung Kim// the caller should use 0 for those not needed.
240*2810ac1bSKiyoung Kimfunc (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
241*2810ac1bSKiyoung Kim	r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
242*2810ac1bSKiyoung Kim	if err != 0 {
243*2810ac1bSKiyoung Kim		return int(r), err
244*2810ac1bSKiyoung Kim	}
245*2810ac1bSKiyoung Kim	return int(r), nil
246*2810ac1bSKiyoung Kim}
247*2810ac1bSKiyoung Kim
248*2810ac1bSKiyoung Kim// prctlwcall provides a wrapper for the prctl systemcalls that
249*2810ac1bSKiyoung Kim// write/modify kernel state. Where available, these will use the
250*2810ac1bSKiyoung Kim// POSIX semantics fixup system calls. There is a limited number of
251*2810ac1bSKiyoung Kim// arguments needed and the caller should use 0 for those not needed.
252*2810ac1bSKiyoung Kimfunc (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) {
253*2810ac1bSKiyoung Kim	r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2)
254*2810ac1bSKiyoung Kim	if err != 0 {
255*2810ac1bSKiyoung Kim		return int(r), err
256*2810ac1bSKiyoung Kim	}
257*2810ac1bSKiyoung Kim	return int(r), nil
258*2810ac1bSKiyoung Kim}
259*2810ac1bSKiyoung Kim
260*2810ac1bSKiyoung Kim// prctlwcall6 provides a wrapper for the prctl systemcalls that
261*2810ac1bSKiyoung Kim// write/modify kernel state and require 6 arguments - ambient cap
262*2810ac1bSKiyoung Kim// API, I'm looking at you. (Where available, these will use the POSIX
263*2810ac1bSKiyoung Kim// semantics fixup system calls). There is a limited number of
264*2810ac1bSKiyoung Kim// arguments needed and the caller should use 0 for those not needed.
265*2810ac1bSKiyoung Kimfunc (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
266*2810ac1bSKiyoung Kim	r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
267*2810ac1bSKiyoung Kim	if err != 0 {
268*2810ac1bSKiyoung Kim		return int(r), err
269*2810ac1bSKiyoung Kim	}
270*2810ac1bSKiyoung Kim	return int(r), nil
271*2810ac1bSKiyoung Kim}
272*2810ac1bSKiyoung Kim
273*2810ac1bSKiyoung Kim// cInit performs the lazy identification of the capability vintage of
274*2810ac1bSKiyoung Kim// the running system.
275*2810ac1bSKiyoung Kimfunc (sc *syscaller) cInit() {
276*2810ac1bSKiyoung Kim	h := &header{
277*2810ac1bSKiyoung Kim		magic: kv3,
278*2810ac1bSKiyoung Kim	}
279*2810ac1bSKiyoung Kim	sc.caprcall(syscall.SYS_CAPGET, h, nil)
280*2810ac1bSKiyoung Kim	magic = h.magic
281*2810ac1bSKiyoung Kim	switch magic {
282*2810ac1bSKiyoung Kim	case kv1:
283*2810ac1bSKiyoung Kim		words = 1
284*2810ac1bSKiyoung Kim	case kv2, kv3:
285*2810ac1bSKiyoung Kim		words = 2
286*2810ac1bSKiyoung Kim	default:
287*2810ac1bSKiyoung Kim		// Fall back to a known good version.
288*2810ac1bSKiyoung Kim		magic = kv3
289*2810ac1bSKiyoung Kim		words = 2
290*2810ac1bSKiyoung Kim	}
291*2810ac1bSKiyoung Kim	// Use the bounding set to evaluate which capabilities exist.
292*2810ac1bSKiyoung Kim	maxValues = uint(sort.Search(32*words, func(n int) bool {
293*2810ac1bSKiyoung Kim		_, err := GetBound(Value(n))
294*2810ac1bSKiyoung Kim		return err != nil
295*2810ac1bSKiyoung Kim	}))
296*2810ac1bSKiyoung Kim	if maxValues == 0 {
297*2810ac1bSKiyoung Kim		// Fall back to using the largest value defined at build time.
298*2810ac1bSKiyoung Kim		maxValues = NamedCount
299*2810ac1bSKiyoung Kim	}
300*2810ac1bSKiyoung Kim}
301*2810ac1bSKiyoung Kim
302*2810ac1bSKiyoung Kim// MaxBits returns the number of kernel-named capabilities discovered
303*2810ac1bSKiyoung Kim// at runtime in the current system.
304*2810ac1bSKiyoung Kimfunc MaxBits() Value {
305*2810ac1bSKiyoung Kim	startUp.Do(multisc.cInit)
306*2810ac1bSKiyoung Kim	return Value(maxValues)
307*2810ac1bSKiyoung Kim}
308*2810ac1bSKiyoung Kim
309*2810ac1bSKiyoung Kim// NewSet returns an empty capability set.
310*2810ac1bSKiyoung Kimfunc NewSet() *Set {
311*2810ac1bSKiyoung Kim	startUp.Do(multisc.cInit)
312*2810ac1bSKiyoung Kim	return &Set{
313*2810ac1bSKiyoung Kim		flat: make([]data, words),
314*2810ac1bSKiyoung Kim	}
315*2810ac1bSKiyoung Kim}
316*2810ac1bSKiyoung Kim
317*2810ac1bSKiyoung Kim// ErrBadSet indicates a nil pointer was used for a *Set, or the
318*2810ac1bSKiyoung Kim// request of the Set is invalid in some way.
319*2810ac1bSKiyoung Kimvar ErrBadSet = errors.New("bad capability set")
320*2810ac1bSKiyoung Kim
321*2810ac1bSKiyoung Kim// good confirms that c looks valid.
322*2810ac1bSKiyoung Kimfunc (c *Set) good() error {
323*2810ac1bSKiyoung Kim	if c == nil || len(c.flat) == 0 {
324*2810ac1bSKiyoung Kim		return ErrBadSet
325*2810ac1bSKiyoung Kim	}
326*2810ac1bSKiyoung Kim	return nil
327*2810ac1bSKiyoung Kim}
328*2810ac1bSKiyoung Kim
329*2810ac1bSKiyoung Kim// Dup returns a copy of the specified capability set.
330*2810ac1bSKiyoung Kimfunc (c *Set) Dup() (*Set, error) {
331*2810ac1bSKiyoung Kim	if err := c.good(); err != nil {
332*2810ac1bSKiyoung Kim		return nil, err
333*2810ac1bSKiyoung Kim	}
334*2810ac1bSKiyoung Kim	n := NewSet()
335*2810ac1bSKiyoung Kim	c.mu.RLock()
336*2810ac1bSKiyoung Kim	defer c.mu.RUnlock()
337*2810ac1bSKiyoung Kim	copy(n.flat, c.flat)
338*2810ac1bSKiyoung Kim	n.nsRoot = c.nsRoot
339*2810ac1bSKiyoung Kim	return n, nil
340*2810ac1bSKiyoung Kim}
341*2810ac1bSKiyoung Kim
342*2810ac1bSKiyoung Kim// GetPID returns the capability set associated with the target process
343*2810ac1bSKiyoung Kim// id; pid=0 is an alias for current.
344*2810ac1bSKiyoung Kimfunc GetPID(pid int) (*Set, error) {
345*2810ac1bSKiyoung Kim	v := NewSet()
346*2810ac1bSKiyoung Kim	if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil {
347*2810ac1bSKiyoung Kim		return nil, err
348*2810ac1bSKiyoung Kim	}
349*2810ac1bSKiyoung Kim	return v, nil
350*2810ac1bSKiyoung Kim}
351*2810ac1bSKiyoung Kim
352*2810ac1bSKiyoung Kim// GetProc returns the capability Set of the current process. If the
353*2810ac1bSKiyoung Kim// kernel is unable to determine the Set associated with the current
354*2810ac1bSKiyoung Kim// process, the function panic()s.
355*2810ac1bSKiyoung Kimfunc GetProc() *Set {
356*2810ac1bSKiyoung Kim	c, err := GetPID(0)
357*2810ac1bSKiyoung Kim	if err != nil {
358*2810ac1bSKiyoung Kim		panic(err)
359*2810ac1bSKiyoung Kim	}
360*2810ac1bSKiyoung Kim	return c
361*2810ac1bSKiyoung Kim}
362*2810ac1bSKiyoung Kim
363*2810ac1bSKiyoung Kim// setProc uses syscaller to set process capabilities.  Note, c is
364*2810ac1bSKiyoung Kim// either private to or (read) locked by the caller.
365*2810ac1bSKiyoung Kimfunc (sc *syscaller) setProc(c *Set) error {
366*2810ac1bSKiyoung Kim	return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat)
367*2810ac1bSKiyoung Kim}
368*2810ac1bSKiyoung Kim
// SetProc attempts to set the capability Set of the current
// process. The kernel will perform permission checks and an error
// will be returned if the attempt fails. Should the attempt fail
// no process capabilities will have been modified.
//
// Note, the general behavior of this call is to set the
// process-shared capabilities. However, when called from a callback
// function as part of a (*Launcher).Launch(), the call only sets the
// capabilities of the thread being used to perform the launch.
//
// ErrBadSet is returned, without any system call being made, if c is
// nil or empty.
func (c *Set) SetProc() error {
	if err := c.good(); err != nil {
		return err
	}
	// scwStateSC() (defined elsewhere in this package) supplies the
	// syscaller to write with, and a state value that is handed back
	// to scwSetState() when this function returns.
	// NOTE(review): presumably this is what selects between the
	// process-wide and launch-thread-only behavior - confirm against
	// those helpers.
	state, sc := scwStateSC()
	defer scwSetState(launchBlocked, state, -1)
	// Read lock c so its members are stable while sc reads them. The
	// deferred calls run in reverse order, so the lock is dropped
	// before scwSetState() is called.
	c.mu.RLock()
	defer c.mu.RUnlock()
	return sc.setProc(c)
}
388*2810ac1bSKiyoung Kim
// defines from uapi/linux/prctl.h. These are the prctl request codes
// used by GetBound() and DropBound() respectively.
const (
	prCapBSetRead = 23 // query whether a Value is in the bounding set.
	prCapBSetDrop = 24 // remove a Value from the bounding set.
)
394*2810ac1bSKiyoung Kim
395*2810ac1bSKiyoung Kim// GetBound determines if a specific capability is currently part of
396*2810ac1bSKiyoung Kim// the local bounding set. On systems where the bounding set Value is
397*2810ac1bSKiyoung Kim// not present, this function returns an error.
398*2810ac1bSKiyoung Kimfunc GetBound(val Value) (bool, error) {
399*2810ac1bSKiyoung Kim	v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0)
400*2810ac1bSKiyoung Kim	if err != nil {
401*2810ac1bSKiyoung Kim		return false, err
402*2810ac1bSKiyoung Kim	}
403*2810ac1bSKiyoung Kim	return v > 0, nil
404*2810ac1bSKiyoung Kim}
405*2810ac1bSKiyoung Kim
406*2810ac1bSKiyoung Kim//go:uintptrescapes
407*2810ac1bSKiyoung Kimfunc (sc *syscaller) dropBound(val ...Value) error {
408*2810ac1bSKiyoung Kim	for _, v := range val {
409*2810ac1bSKiyoung Kim		if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil {
410*2810ac1bSKiyoung Kim			return err
411*2810ac1bSKiyoung Kim		}
412*2810ac1bSKiyoung Kim	}
413*2810ac1bSKiyoung Kim	return nil
414*2810ac1bSKiyoung Kim}
415*2810ac1bSKiyoung Kim
// DropBound attempts to suppress bounding set Values. The kernel will
// never allow a bounding set Value bit to be raised once successfully
// dropped. However, dropping requires the current process is
// sufficiently capable (usually via cap.SETPCAP being raised in the
// Effective flag of the process' Set). Note, the drops are performed
// in order and if one bounding value cannot be dropped, the function
// returns immediately with an error which may leave the system in an
// ill-defined state. The caller can determine where things went wrong
// using GetBound().
func DropBound(val ...Value) error {
	// scwStateSC() (defined elsewhere in this package) supplies the
	// syscaller to write with, and a state value that is handed back
	// to scwSetState() when this function returns.
	state, sc := scwStateSC()
	defer scwSetState(launchBlocked, state, -1)
	return sc.dropBound(val...)
}
430*2810ac1bSKiyoung Kim
// defines from uapi/linux/prctl.h. prCapAmbient is the prctl request
// code for the ambient capability API; the remaining values are the
// sub-operations passed as its first argument.
const (
	prCapAmbient = 47

	prCapAmbientIsSet    = 1 // query one Value, see GetAmbient().
	prCapAmbientRaise    = 2 // raise one Value, see SetAmbient().
	prCapAmbientLower    = 3 // lower one Value, see SetAmbient().
	prCapAmbientClearAll = 4 // lower every Value, see ResetAmbient().
)
440*2810ac1bSKiyoung Kim
441*2810ac1bSKiyoung Kim// GetAmbient determines if a specific capability is currently part of
442*2810ac1bSKiyoung Kim// the local ambient set. On systems where the ambient set Value is
443*2810ac1bSKiyoung Kim// not present, this function returns an error.
444*2810ac1bSKiyoung Kimfunc GetAmbient(val Value) (bool, error) {
445*2810ac1bSKiyoung Kim	r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0)
446*2810ac1bSKiyoung Kim	return r > 0, err
447*2810ac1bSKiyoung Kim}
448*2810ac1bSKiyoung Kim
449*2810ac1bSKiyoung Kim//go:uintptrescapes
450*2810ac1bSKiyoung Kimfunc (sc *syscaller) setAmbient(enable bool, val ...Value) error {
451*2810ac1bSKiyoung Kim	dir := uintptr(prCapAmbientLower)
452*2810ac1bSKiyoung Kim	if enable {
453*2810ac1bSKiyoung Kim		dir = prCapAmbientRaise
454*2810ac1bSKiyoung Kim	}
455*2810ac1bSKiyoung Kim	for _, v := range val {
456*2810ac1bSKiyoung Kim		_, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0)
457*2810ac1bSKiyoung Kim		if err != nil {
458*2810ac1bSKiyoung Kim			return err
459*2810ac1bSKiyoung Kim		}
460*2810ac1bSKiyoung Kim	}
461*2810ac1bSKiyoung Kim	return nil
462*2810ac1bSKiyoung Kim}
463*2810ac1bSKiyoung Kim
// SetAmbient attempts to set each of the listed Value bits to the
// state, enable: the bits are raised when enable is true and lowered
// when it is false. This function will return an error if
// insufficient permission is available to perform this task. The
// settings are performed in order and the function returns
// immediately an error is detected. Use GetAmbient() to unravel where
// things went wrong. Note, the cap package manages an abstraction IAB
// that captures all three inheritable vectors in a single type.
// Consider using that.
func SetAmbient(enable bool, val ...Value) error {
	// scwStateSC() (defined elsewhere in this package) supplies the
	// syscaller to write with, and a state value that is handed back
	// to scwSetState() when this function returns.
	state, sc := scwStateSC()
	defer scwSetState(launchBlocked, state, -1)
	return sc.setAmbient(enable, val...)
}
477*2810ac1bSKiyoung Kim
478*2810ac1bSKiyoung Kimfunc (sc *syscaller) resetAmbient() error {
479*2810ac1bSKiyoung Kim	var v bool
480*2810ac1bSKiyoung Kim	var err error
481*2810ac1bSKiyoung Kim
482*2810ac1bSKiyoung Kim	for c := Value(0); !v; c++ {
483*2810ac1bSKiyoung Kim		if v, err = GetAmbient(c); err != nil {
484*2810ac1bSKiyoung Kim			// no non-zero values found.
485*2810ac1bSKiyoung Kim			return nil
486*2810ac1bSKiyoung Kim		}
487*2810ac1bSKiyoung Kim	}
488*2810ac1bSKiyoung Kim	_, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0)
489*2810ac1bSKiyoung Kim	return err
490*2810ac1bSKiyoung Kim}
491*2810ac1bSKiyoung Kim
// ResetAmbient attempts to ensure the Ambient set is fully
// cleared. It works by first reading the set and if it finds any bits
// raised it will attempt a reset. The test before attempting a reset
// behavior is a workaround for situations where the Ambient API is
// locked, but a reset is not actually needed. No Ambient bit not
// already raised in both the Permitted and Inheritable Set is allowed
// to be raised by the kernel.
//
// A nil return therefore means either that no raised Ambient bit was
// found or that the reset succeeded.
func ResetAmbient() error {
	// scwStateSC() (defined elsewhere in this package) supplies the
	// syscaller to write with, and a state value that is handed back
	// to scwSetState() when this function returns.
	state, sc := scwStateSC()
	defer scwSetState(launchBlocked, state, -1)
	return sc.resetAmbient()
}
504