1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "go_asm.h"
6#include "go_tls.h"
7#include "funcdata.h"
8#include "textflag.h"
9#include "cgo/abi_amd64.h"
10
11// _rt0_amd64 is common startup code for most amd64 systems when using
12// internal linking. This is the entry point for the program from the
13// kernel for an ordinary -buildmode=exe program. The stack holds the
14// number of arguments and the C-style argv.
15TEXT _rt0_amd64(SB),NOSPLIT,$-8
16	MOVQ	0(SP), DI	// argc
17	LEAQ	8(SP), SI	// argv
18	JMP	runtime·rt0_go(SB)
19
20// main is common startup code for most amd64 systems when using
21// external linking. The C startup code will call the symbol "main"
22// passing argc and argv in the usual C ABI registers DI and SI.
23TEXT main(SB),NOSPLIT,$-8
24	JMP	runtime·rt0_go(SB)
25
26// _rt0_amd64_lib is common startup code for most amd64 systems when
27// using -buildmode=c-archive or -buildmode=c-shared. The linker will
28// arrange to invoke this function as a global constructor (for
29// c-archive) or when the shared library is loaded (for c-shared).
30// We expect argc and argv to be passed in the usual C ABI registers
31// DI and SI.
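//
// For example (illustrative, not part of this file), building with
//	go build -buildmode=c-shared -o libgo.so .
// produces a shared library; _rt0_amd64_lib is invoked when a C program
// loads libgo.so.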
32TEXT _rt0_amd64_lib(SB),NOSPLIT|NOFRAME,$0
33	// Transition from C ABI to Go ABI.
34	PUSH_REGS_HOST_TO_ABI0()
35
36	MOVQ	DI, _rt0_amd64_lib_argc<>(SB)
37	MOVQ	SI, _rt0_amd64_lib_argv<>(SB)
38
39	// Synchronous initialization.
40	CALL	runtime·libpreinit(SB)
41
42	// Create a new thread to finish Go runtime initialization.
43	MOVQ	_cgo_sys_thread_create(SB), AX
44	TESTQ	AX, AX
45	JZ	nocgo
46
47	// We're calling back to C.
48	// Align stack per ELF ABI requirements.
49	MOVQ	SP, BX  // Callee-save in C ABI
50	ANDQ	$~15, SP
51	MOVQ	$_rt0_amd64_lib_go(SB), DI
52	MOVQ	$0, SI
53	CALL	AX
54	MOVQ	BX, SP
55	JMP	restore
56
57nocgo:
58	ADJSP	$16
59	MOVQ	$0x800000, 0(SP)		// stacksize
60	MOVQ	$_rt0_amd64_lib_go(SB), AX
61	MOVQ	AX, 8(SP)			// fn
62	CALL	runtime·newosproc0(SB)
63	ADJSP	$-16
64
65restore:
66	POP_REGS_HOST_TO_ABI0()
67	RET
68
69// _rt0_amd64_lib_go initializes the Go runtime.
70// This is started in a separate thread by _rt0_amd64_lib.
71TEXT _rt0_amd64_lib_go(SB),NOSPLIT,$0
72	MOVQ	_rt0_amd64_lib_argc<>(SB), DI
73	MOVQ	_rt0_amd64_lib_argv<>(SB), SI
74	JMP	runtime·rt0_go(SB)
75
76DATA _rt0_amd64_lib_argc<>(SB)/8, $0
77GLOBL _rt0_amd64_lib_argc<>(SB),NOPTR, $8
78DATA _rt0_amd64_lib_argv<>(SB)/8, $0
79GLOBL _rt0_amd64_lib_argv<>(SB),NOPTR, $8
80
81#ifdef GOAMD64_v2
82DATA bad_cpu_msg<>+0x00(SB)/84, $"This program can only be run on AMD64 processors with v2 microarchitecture support.\n"
83#endif
84
85#ifdef GOAMD64_v3
86DATA bad_cpu_msg<>+0x00(SB)/84, $"This program can only be run on AMD64 processors with v3 microarchitecture support.\n"
87#endif
88
89#ifdef GOAMD64_v4
90DATA bad_cpu_msg<>+0x00(SB)/84, $"This program can only be run on AMD64 processors with v4 microarchitecture support.\n"
91#endif
92
93GLOBL bad_cpu_msg<>(SB), RODATA, $84
94
95// Define a list of AMD64 microarchitecture level features
96// https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels
97
                     // SSE3     SSSE3    CMPXCHG16B SSE4.1    SSE4.2    POPCNT
99#define V2_FEATURES_CX (1 << 0 | 1 << 9 | 1 << 13  | 1 << 19 | 1 << 20 | 1 << 23)
100                         // LAHF/SAHF
101#define V2_EXT_FEATURES_CX (1 << 0)
102                                      // FMA       MOVBE     OSXSAVE   AVX       F16C
103#define V3_FEATURES_CX (V2_FEATURES_CX | 1 << 12 | 1 << 22 | 1 << 27 | 1 << 28 | 1 << 29)
                                              // ABM (FOR LZCNT)
105#define V3_EXT_FEATURES_CX (V2_EXT_FEATURES_CX | 1 << 5)
106                         // BMI1     AVX2     BMI2
107#define V3_EXT_FEATURES_BX (1 << 3 | 1 << 5 | 1 << 8)
108                       // XMM      YMM
109#define V3_OS_SUPPORT_AX (1 << 1 | 1 << 2)
110
111#define V4_FEATURES_CX V3_FEATURES_CX
112
113#define V4_EXT_FEATURES_CX V3_EXT_FEATURES_CX
114                                              // AVX512F   AVX512DQ  AVX512CD  AVX512BW  AVX512VL
115#define V4_EXT_FEATURES_BX (V3_EXT_FEATURES_BX | 1 << 16 | 1 << 17 | 1 << 28 | 1 << 30 | 1 << 31)
116                                          // OPMASK   ZMM
117#define V4_OS_SUPPORT_AX (V3_OS_SUPPORT_AX | 1 << 5 | (1 << 6 | 1 << 7))
118
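// For example (illustrative), building with
//	GOAMD64=v3 go build ./...
// defines GOAMD64_v3 for this file, selecting the v3 NEED_* masks below and
// enabling the corresponding CPU feature check in rt0_go.
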
119#ifdef GOAMD64_v2
120#define NEED_MAX_CPUID 0x80000001
121#define NEED_FEATURES_CX V2_FEATURES_CX
122#define NEED_EXT_FEATURES_CX V2_EXT_FEATURES_CX
123#endif
124
125#ifdef GOAMD64_v3
126#define NEED_MAX_CPUID 0x80000001
127#define NEED_FEATURES_CX V3_FEATURES_CX
128#define NEED_EXT_FEATURES_CX V3_EXT_FEATURES_CX
129#define NEED_EXT_FEATURES_BX V3_EXT_FEATURES_BX
130#define NEED_OS_SUPPORT_AX V3_OS_SUPPORT_AX
131#endif
132
133#ifdef GOAMD64_v4
134#define NEED_MAX_CPUID 0x80000001
135#define NEED_FEATURES_CX V4_FEATURES_CX
136#define NEED_EXT_FEATURES_CX V4_EXT_FEATURES_CX
137#define NEED_EXT_FEATURES_BX V4_EXT_FEATURES_BX
138
139// Darwin requires a different approach to check AVX512 support, see CL 285572.
140#ifdef GOOS_darwin
141#define NEED_OS_SUPPORT_AX V3_OS_SUPPORT_AX
142// These values are from:
143// https://github.com/apple/darwin-xnu/blob/xnu-4570.1.46/osfmk/i386/cpu_capabilities.h
144#define commpage64_base_address         0x00007fffffe00000
145#define commpage64_cpu_capabilities64   (commpage64_base_address+0x010)
146#define commpage64_version              (commpage64_base_address+0x01E)
147#define AVX512F                         0x0000004000000000
148#define AVX512CD                        0x0000008000000000
149#define AVX512DQ                        0x0000010000000000
150#define AVX512BW                        0x0000020000000000
151#define AVX512VL                        0x0000100000000000
152#define NEED_DARWIN_SUPPORT             (AVX512F | AVX512DQ | AVX512CD | AVX512BW | AVX512VL)
153#else
154#define NEED_OS_SUPPORT_AX V4_OS_SUPPORT_AX
155#endif
156
157#endif
158
159TEXT runtime·rt0_go(SB),NOSPLIT|NOFRAME|TOPFRAME,$0
160	// copy arguments forward on an even stack
161	MOVQ	DI, AX		// argc
162	MOVQ	SI, BX		// argv
163	SUBQ	$(5*8), SP		// 3args 2auto
164	ANDQ	$~15, SP
165	MOVQ	AX, 24(SP)
166	MOVQ	BX, 32(SP)
167
168	// create istack out of the given (operating system) stack.
169	// _cgo_init may update stackguard.
170	MOVQ	$runtime·g0(SB), DI
171	LEAQ	(-64*1024)(SP), BX
172	MOVQ	BX, g_stackguard0(DI)
173	MOVQ	BX, g_stackguard1(DI)
174	MOVQ	BX, (g_stack+stack_lo)(DI)
175	MOVQ	SP, (g_stack+stack_hi)(DI)
176
177	// find out information about the processor we're on
178	MOVL	$0, AX
179	CPUID
180	CMPL	AX, $0
181	JE	nocpuinfo
182
183	CMPL	BX, $0x756E6547  // "Genu"
184	JNE	notintel
185	CMPL	DX, $0x49656E69  // "ineI"
186	JNE	notintel
187	CMPL	CX, $0x6C65746E  // "ntel"
188	JNE	notintel
189	MOVB	$1, runtime·isIntel(SB)
190
191notintel:
192	// Load EAX=1 cpuid flags
193	MOVL	$1, AX
194	CPUID
195	MOVL	AX, runtime·processorVersionInfo(SB)
196
197nocpuinfo:
198	// if there is an _cgo_init, call it.
199	MOVQ	_cgo_init(SB), AX
200	TESTQ	AX, AX
201	JZ	needtls
202	// arg 1: g0, already in DI
203	MOVQ	$setg_gcc<>(SB), SI // arg 2: setg_gcc
204	MOVQ	$0, DX	// arg 3, 4: not used when using platform's TLS
205	MOVQ	$0, CX
206#ifdef GOOS_android
207	MOVQ	$runtime·tls_g(SB), DX 	// arg 3: &tls_g
208	// arg 4: TLS base, stored in slot 0 (Android's TLS_SLOT_SELF).
209	// Compensate for tls_g (+16).
210	MOVQ	-16(TLS), CX
211#endif
212#ifdef GOOS_windows
213	MOVQ	$runtime·tls_g(SB), DX 	// arg 3: &tls_g
214	// Adjust for the Win64 calling convention.
215	MOVQ	CX, R9 // arg 4
216	MOVQ	DX, R8 // arg 3
217	MOVQ	SI, DX // arg 2
218	MOVQ	DI, CX // arg 1
219#endif
220	CALL	AX
221
222	// update stackguard after _cgo_init
223	MOVQ	$runtime·g0(SB), CX
224	MOVQ	(g_stack+stack_lo)(CX), AX
225	ADDQ	$const_stackGuard, AX
226	MOVQ	AX, g_stackguard0(CX)
227	MOVQ	AX, g_stackguard1(CX)
228
229#ifndef GOOS_windows
230	JMP ok
231#endif
232needtls:
233#ifdef GOOS_plan9
234	// skip TLS setup on Plan 9
235	JMP ok
236#endif
237#ifdef GOOS_solaris
238	// skip TLS setup on Solaris
239	JMP ok
240#endif
241#ifdef GOOS_illumos
242	// skip TLS setup on illumos
243	JMP ok
244#endif
245#ifdef GOOS_darwin
246	// skip TLS setup on Darwin
247	JMP ok
248#endif
249#ifdef GOOS_openbsd
250	// skip TLS setup on OpenBSD
251	JMP ok
252#endif
253
254#ifdef GOOS_windows
255	CALL	runtime·wintls(SB)
256#endif
257
258	LEAQ	runtime·m0+m_tls(SB), DI
259	CALL	runtime·settls(SB)
260
261	// store through it, to make sure it works
262	get_tls(BX)
263	MOVQ	$0x123, g(BX)
264	MOVQ	runtime·m0+m_tls(SB), AX
265	CMPQ	AX, $0x123
266	JEQ 2(PC)
267	CALL	runtime·abort(SB)
268ok:
269	// set the per-goroutine and per-mach "registers"
270	get_tls(BX)
271	LEAQ	runtime·g0(SB), CX
272	MOVQ	CX, g(BX)
273	LEAQ	runtime·m0(SB), AX
274
275	// save m->g0 = g0
276	MOVQ	CX, m_g0(AX)
277	// save m0 to g0->m
278	MOVQ	AX, g_m(CX)
279
280	CLD				// convention is D is always left cleared
281
282	// Check GOAMD64 requirements
283	// We need to do this after setting up TLS, so that
284	// we can report an error if there is a failure. See issue 49586.
285#ifdef NEED_FEATURES_CX
286	MOVL	$0, AX
287	CPUID
288	CMPL	AX, $0
289	JE	bad_cpu
290	MOVL	$1, AX
291	CPUID
292	ANDL	$NEED_FEATURES_CX, CX
293	CMPL	CX, $NEED_FEATURES_CX
294	JNE	bad_cpu
295#endif
296
297#ifdef NEED_MAX_CPUID
298	MOVL	$0x80000000, AX
299	CPUID
300	CMPL	AX, $NEED_MAX_CPUID
301	JL	bad_cpu
302#endif
303
304#ifdef NEED_EXT_FEATURES_BX
305	MOVL	$7, AX
306	MOVL	$0, CX
307	CPUID
308	ANDL	$NEED_EXT_FEATURES_BX, BX
309	CMPL	BX, $NEED_EXT_FEATURES_BX
310	JNE	bad_cpu
311#endif
312
313#ifdef NEED_EXT_FEATURES_CX
314	MOVL	$0x80000001, AX
315	CPUID
316	ANDL	$NEED_EXT_FEATURES_CX, CX
317	CMPL	CX, $NEED_EXT_FEATURES_CX
318	JNE	bad_cpu
319#endif
320
321#ifdef NEED_OS_SUPPORT_AX
322	XORL    CX, CX
323	XGETBV
324	ANDL	$NEED_OS_SUPPORT_AX, AX
325	CMPL	AX, $NEED_OS_SUPPORT_AX
326	JNE	bad_cpu
327#endif
328
329#ifdef NEED_DARWIN_SUPPORT
330	MOVQ	$commpage64_version, BX
331	CMPW	(BX), $13  // cpu_capabilities64 undefined in versions < 13
332	JL	bad_cpu
333	MOVQ	$commpage64_cpu_capabilities64, BX
334	MOVQ	(BX), BX
335	MOVQ	$NEED_DARWIN_SUPPORT, CX
336	ANDQ	CX, BX
337	CMPQ	BX, CX
338	JNE	bad_cpu
339#endif
340
341	CALL	runtime·check(SB)
342
343	MOVL	24(SP), AX		// copy argc
344	MOVL	AX, 0(SP)
345	MOVQ	32(SP), AX		// copy argv
346	MOVQ	AX, 8(SP)
347	CALL	runtime·args(SB)
348	CALL	runtime·osinit(SB)
349	CALL	runtime·schedinit(SB)
350
351	// create a new goroutine to start program
352	MOVQ	$runtime·mainPC(SB), AX		// entry
353	PUSHQ	AX
354	CALL	runtime·newproc(SB)
355	POPQ	AX
356
357	// start this M
358	CALL	runtime·mstart(SB)
359
360	CALL	runtime·abort(SB)	// mstart should never return
361	RET
362
363bad_cpu: // show that the program requires a certain microarchitecture level.
364	MOVQ	$2, 0(SP)
365	MOVQ	$bad_cpu_msg<>(SB), AX
366	MOVQ	AX, 8(SP)
367	MOVQ	$84, 16(SP)
368	CALL	runtime·write(SB)
369	MOVQ	$1, 0(SP)
370	CALL	runtime·exit(SB)
371	CALL	runtime·abort(SB)
372	RET
373
374	// Prevent dead-code elimination of debugCallV2 and debugPinnerV1, which are
375	// intended to be called by debuggers.
376	MOVQ	$runtime·debugPinnerV1<ABIInternal>(SB), AX
377	MOVQ	$runtime·debugCallV2<ABIInternal>(SB), AX
378	RET
379
380// mainPC is a function value for runtime.main, to be passed to newproc.
381// The reference to runtime.main is made via ABIInternal, since the
382// actual function (not the ABI0 wrapper) is needed by newproc.
383DATA	runtime·mainPC+0(SB)/8,$runtime·main<ABIInternal>(SB)
384GLOBL	runtime·mainPC(SB),RODATA,$8
385
386TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
387	BYTE	$0xcc
388	RET
389
390TEXT runtime·asminit(SB),NOSPLIT,$0-0
391	// No per-thread init.
392	RET
393
394TEXT runtime·mstart(SB),NOSPLIT|TOPFRAME|NOFRAME,$0
395	CALL	runtime·mstart0(SB)
396	RET // not reached
397
398/*
399 *  go-routine
400 */
401
402// func gogo(buf *gobuf)
403// restore state from Gobuf; longjmp
404TEXT runtime·gogo(SB), NOSPLIT, $0-8
405	MOVQ	buf+0(FP), BX		// gobuf
406	MOVQ	gobuf_g(BX), DX
407	MOVQ	0(DX), CX		// make sure g != nil
408	JMP	gogo<>(SB)
409
410TEXT gogo<>(SB), NOSPLIT, $0
411	get_tls(CX)
412	MOVQ	DX, g(CX)
413	MOVQ	DX, R14		// set the g register
414	MOVQ	gobuf_sp(BX), SP	// restore SP
415	MOVQ	gobuf_ret(BX), AX
416	MOVQ	gobuf_ctxt(BX), DX
417	MOVQ	gobuf_bp(BX), BP
418	MOVQ	$0, gobuf_sp(BX)	// clear to help garbage collector
419	MOVQ	$0, gobuf_ret(BX)
420	MOVQ	$0, gobuf_ctxt(BX)
421	MOVQ	$0, gobuf_bp(BX)
422	MOVQ	gobuf_pc(BX), BX
423	JMP	BX
424
425// func mcall(fn func(*g))
426// Switch to m->g0's stack, call fn(g).
427// Fn must never return. It should gogo(&g->sched)
428// to keep running g.
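// Illustrative use from Go code in the runtime (a sketch):
//	mcall(func(gp *g) {
//		// now on g0's stack; park or reschedule gp here, then
//		// call gogo(&gp.sched) or the scheduler; never return
//	})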
429TEXT runtime·mcall<ABIInternal>(SB), NOSPLIT, $0-8
430	MOVQ	AX, DX	// DX = fn
431
432	// Save state in g->sched. The caller's SP and PC are restored by gogo to
433	// resume execution in the caller's frame (implicit return). The caller's BP
434	// is also restored to support frame pointer unwinding.
435	MOVQ	SP, BX	// hide (SP) reads from vet
436	MOVQ	8(BX), BX	// caller's PC
437	MOVQ	BX, (g_sched+gobuf_pc)(R14)
438	LEAQ	fn+0(FP), BX	// caller's SP
439	MOVQ	BX, (g_sched+gobuf_sp)(R14)
440	// Get the caller's frame pointer by dereferencing BP. Storing BP as it is
441	// can cause a frame pointer cycle, see CL 476235.
442	MOVQ	(BP), BX // caller's BP
443	MOVQ	BX, (g_sched+gobuf_bp)(R14)
444
445	// switch to m->g0 & its stack, call fn
446	MOVQ	g_m(R14), BX
447	MOVQ	m_g0(BX), SI	// SI = g.m.g0
448	CMPQ	SI, R14	// if g == m->g0 call badmcall
449	JNE	goodm
450	JMP	runtime·badmcall(SB)
451goodm:
452	MOVQ	R14, AX		// AX (and arg 0) = g
453	MOVQ	SI, R14		// g = g.m.g0
454	get_tls(CX)		// Set G in TLS
455	MOVQ	R14, g(CX)
456	MOVQ	(g_sched+gobuf_sp)(R14), SP	// sp = g0.sched.sp
457	PUSHQ	AX	// open up space for fn's arg spill slot
458	MOVQ	0(DX), R12
459	CALL	R12		// fn(g)
460	// The Windows native stack unwinder incorrectly classifies the next instruction
461	// as part of the function epilogue, producing a wrong call stack.
462	// Add a NOP to work around this issue. See go.dev/issue/67007.
463	BYTE	$0x90
464	POPQ	AX
465	JMP	runtime·badmcall2(SB)
466	RET
467
468// systemstack_switch is a dummy routine that systemstack leaves at the bottom
469// of the G stack. We need to distinguish the routine that
470// lives at the bottom of the G stack from the one that lives
471// at the top of the system stack because the one at the top of
472// the system stack terminates the stack walk (see topofstack()).
473// The frame layout needs to match systemstack
474// so that it can pretend to be systemstack_switch.
475TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
476	UNDEF
477	// Make sure this function is not leaf,
478	// so the frame is saved.
479	CALL	runtime·abort(SB)
480	RET
481
482// func systemstack(fn func())
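// Illustrative use from Go code in the runtime (a sketch):
//	systemstack(func() {
//		// runs on the m->g0 (system) stack, then switches back
//	})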
483TEXT runtime·systemstack(SB), NOSPLIT, $0-8
484	MOVQ	fn+0(FP), DI	// DI = fn
485	get_tls(CX)
486	MOVQ	g(CX), AX	// AX = g
487	MOVQ	g_m(AX), BX	// BX = m
488
489	CMPQ	AX, m_gsignal(BX)
490	JEQ	noswitch
491
492	MOVQ	m_g0(BX), DX	// DX = g0
493	CMPQ	AX, DX
494	JEQ	noswitch
495
496	CMPQ	AX, m_curg(BX)
497	JNE	bad
498
499	// Switch stacks.
500	// The original frame pointer is stored in BP,
501	// which is useful for stack unwinding.
502	// Save our state in g->sched. Pretend to
503	// be systemstack_switch if the G stack is scanned.
504	CALL	gosave_systemstack_switch<>(SB)
505
506	// switch to g0
507	MOVQ	DX, g(CX)
508	MOVQ	DX, R14 // set the g register
509	MOVQ	(g_sched+gobuf_sp)(DX), SP
510
511	// call target function
512	MOVQ	DI, DX
513	MOVQ	0(DI), DI
514	CALL	DI
515
516	// switch back to g
517	get_tls(CX)
518	MOVQ	g(CX), AX
519	MOVQ	g_m(AX), BX
520	MOVQ	m_curg(BX), AX
521	MOVQ	AX, g(CX)
522	MOVQ	(g_sched+gobuf_sp)(AX), SP
523	MOVQ	(g_sched+gobuf_bp)(AX), BP
524	MOVQ	$0, (g_sched+gobuf_sp)(AX)
525	MOVQ	$0, (g_sched+gobuf_bp)(AX)
526	RET
527
528noswitch:
529	// already on m stack; tail call the function
530	// Using a tail call here cleans up tracebacks since we won't stop
531	// at an intermediate systemstack.
532	MOVQ	DI, DX
533	MOVQ	0(DI), DI
534	// The function epilogue is not called on a tail call.
535	// Pop BP from the stack to simulate it.
536	POPQ	BP
537	JMP	DI
538
539bad:
540	// Bad: g is not gsignal, not g0, not curg. What is it?
541	MOVQ	$runtime·badsystemstack(SB), AX
542	CALL	AX
543	INT	$3
544
545// func switchToCrashStack0(fn func())
546TEXT runtime·switchToCrashStack0<ABIInternal>(SB), NOSPLIT, $0-8
547	MOVQ	g_m(R14), BX // curm
548
549	// set g to gcrash
550	LEAQ	runtime·gcrash(SB), R14 // g = &gcrash
551	MOVQ	BX, g_m(R14)            // g.m = curm
552	MOVQ	R14, m_g0(BX)           // curm.g0 = g
553	get_tls(CX)
554	MOVQ	R14, g(CX)
555
556	// switch to crashstack
557	MOVQ	(g_stack+stack_hi)(R14), BX
558	SUBQ	$(4*8), BX
559	MOVQ	BX, SP
560
561	// call target function
562	MOVQ	AX, DX
563	MOVQ	0(AX), AX
564	CALL	AX
565
566	// should never return
567	CALL	runtime·abort(SB)
568	UNDEF
569
570/*
571 * support for morestack
572 */
573
574// Called during function prolog when more stack is needed.
575//
576// The traceback routines see morestack on a g0 as being
577// the top of a stack (for example, morestack calling newstack
578// calling the scheduler calling newm calling gc), so we must
579// record an argument size. For that purpose, it has no arguments.
580TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
581	// Cannot grow scheduler stack (m->g0).
582	get_tls(CX)
583	MOVQ	g(CX), DI     // DI = g
584	MOVQ	g_m(DI), BX   // BX = m
585
586	// Set g->sched to context in f.
587	MOVQ	0(SP), AX // f's PC
588	MOVQ	AX, (g_sched+gobuf_pc)(DI)
589	LEAQ	8(SP), AX // f's SP
590	MOVQ	AX, (g_sched+gobuf_sp)(DI)
591	MOVQ	BP, (g_sched+gobuf_bp)(DI)
592	MOVQ	DX, (g_sched+gobuf_ctxt)(DI)
593
594	MOVQ	m_g0(BX), SI  // SI = m.g0
595	CMPQ	DI, SI
596	JNE	3(PC)
597	CALL	runtime·badmorestackg0(SB)
598	CALL	runtime·abort(SB)
599
600	// Cannot grow signal stack (m->gsignal).
601	MOVQ	m_gsignal(BX), SI
602	CMPQ	DI, SI
603	JNE	3(PC)
604	CALL	runtime·badmorestackgsignal(SB)
605	CALL	runtime·abort(SB)
606
607	// Called from f.
608	// Set m->morebuf to f's caller.
609	NOP	SP	// tell vet SP changed - stop checking offsets
610	MOVQ	8(SP), AX	// f's caller's PC
611	MOVQ	AX, (m_morebuf+gobuf_pc)(BX)
612	LEAQ	16(SP), AX	// f's caller's SP
613	MOVQ	AX, (m_morebuf+gobuf_sp)(BX)
614	MOVQ	DI, (m_morebuf+gobuf_g)(BX)
615
616	// Call newstack on m->g0's stack.
617	MOVQ	m_g0(BX), BX
618	MOVQ	BX, g(CX)
619	MOVQ	(g_sched+gobuf_sp)(BX), SP
620	MOVQ	(g_sched+gobuf_bp)(BX), BP
621	CALL	runtime·newstack(SB)
622	CALL	runtime·abort(SB)	// crash if newstack returns
623	RET
624
625// morestack but not preserving ctxt.
626TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
627	MOVL	$0, DX
628	JMP	runtime·morestack(SB)
629
630// spillArgs stores return values from registers to a *internal/abi.RegArgs in R12.
631TEXT ·spillArgs(SB),NOSPLIT,$0-0
632	MOVQ AX, 0(R12)
633	MOVQ BX, 8(R12)
634	MOVQ CX, 16(R12)
635	MOVQ DI, 24(R12)
636	MOVQ SI, 32(R12)
637	MOVQ R8, 40(R12)
638	MOVQ R9, 48(R12)
639	MOVQ R10, 56(R12)
640	MOVQ R11, 64(R12)
641	MOVQ X0, 72(R12)
642	MOVQ X1, 80(R12)
643	MOVQ X2, 88(R12)
644	MOVQ X3, 96(R12)
645	MOVQ X4, 104(R12)
646	MOVQ X5, 112(R12)
647	MOVQ X6, 120(R12)
648	MOVQ X7, 128(R12)
649	MOVQ X8, 136(R12)
650	MOVQ X9, 144(R12)
651	MOVQ X10, 152(R12)
652	MOVQ X11, 160(R12)
653	MOVQ X12, 168(R12)
654	MOVQ X13, 176(R12)
655	MOVQ X14, 184(R12)
656	RET
657
658// unspillArgs loads args into registers from a *internal/abi.RegArgs in R12.
659TEXT ·unspillArgs(SB),NOSPLIT,$0-0
660	MOVQ 0(R12), AX
661	MOVQ 8(R12), BX
662	MOVQ 16(R12), CX
663	MOVQ 24(R12), DI
664	MOVQ 32(R12), SI
665	MOVQ 40(R12), R8
666	MOVQ 48(R12), R9
667	MOVQ 56(R12), R10
668	MOVQ 64(R12), R11
669	MOVQ 72(R12), X0
670	MOVQ 80(R12), X1
671	MOVQ 88(R12), X2
672	MOVQ 96(R12), X3
673	MOVQ 104(R12), X4
674	MOVQ 112(R12), X5
675	MOVQ 120(R12), X6
676	MOVQ 128(R12), X7
677	MOVQ 136(R12), X8
678	MOVQ 144(R12), X9
679	MOVQ 152(R12), X10
680	MOVQ 160(R12), X11
681	MOVQ 168(R12), X12
682	MOVQ 176(R12), X13
683	MOVQ 184(R12), X14
684	RET
685
686// reflectcall: call a function with the given argument list
687// func call(stackArgsType *_type, f *FuncVal, stackArgs *byte, stackArgsSize, stackRetOffset, frameSize uint32, regArgs *abi.RegArgs).
688// we don't have variable-sized frames, so we use a small number
689// of constant-sized-frame functions to encode a few bits of size in the pc.
690// Caution: ugly multiline assembly macros in your future!
691
692#define DISPATCH(NAME,MAXSIZE)		\
693	CMPQ	CX, $MAXSIZE;		\
694	JA	3(PC);			\
695	MOVQ	$NAME(SB), AX;		\
696	JMP	AX
697// Note: can't just "JMP NAME(SB)" - bad inlining results.
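// For example, DISPATCH(runtime·call64, 64) expands roughly to:
//	CMPQ	CX, $64
//	JA	<next DISPATCH, or badreflectcall>
//	MOVQ	$runtime·call64(SB), AX
//	JMP	AX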
698
699TEXT ·reflectcall(SB), NOSPLIT, $0-48
700	MOVLQZX frameSize+32(FP), CX
701	DISPATCH(runtime·call16, 16)
702	DISPATCH(runtime·call32, 32)
703	DISPATCH(runtime·call64, 64)
704	DISPATCH(runtime·call128, 128)
705	DISPATCH(runtime·call256, 256)
706	DISPATCH(runtime·call512, 512)
707	DISPATCH(runtime·call1024, 1024)
708	DISPATCH(runtime·call2048, 2048)
709	DISPATCH(runtime·call4096, 4096)
710	DISPATCH(runtime·call8192, 8192)
711	DISPATCH(runtime·call16384, 16384)
712	DISPATCH(runtime·call32768, 32768)
713	DISPATCH(runtime·call65536, 65536)
714	DISPATCH(runtime·call131072, 131072)
715	DISPATCH(runtime·call262144, 262144)
716	DISPATCH(runtime·call524288, 524288)
717	DISPATCH(runtime·call1048576, 1048576)
718	DISPATCH(runtime·call2097152, 2097152)
719	DISPATCH(runtime·call4194304, 4194304)
720	DISPATCH(runtime·call8388608, 8388608)
721	DISPATCH(runtime·call16777216, 16777216)
722	DISPATCH(runtime·call33554432, 33554432)
723	DISPATCH(runtime·call67108864, 67108864)
724	DISPATCH(runtime·call134217728, 134217728)
725	DISPATCH(runtime·call268435456, 268435456)
726	DISPATCH(runtime·call536870912, 536870912)
727	DISPATCH(runtime·call1073741824, 1073741824)
728	MOVQ	$runtime·badreflectcall(SB), AX
729	JMP	AX
730
731#define CALLFN(NAME,MAXSIZE)			\
732TEXT NAME(SB), WRAPPER, $MAXSIZE-48;		\
733	NO_LOCAL_POINTERS;			\
734	/* copy arguments to stack */		\
735	MOVQ	stackArgs+16(FP), SI;		\
736	MOVLQZX stackArgsSize+24(FP), CX;		\
737	MOVQ	SP, DI;				\
738	REP;MOVSB;				\
739	/* set up argument registers */		\
740	MOVQ    regArgs+40(FP), R12;		\
741	CALL    ·unspillArgs(SB);		\
742	/* call function */			\
743	MOVQ	f+8(FP), DX;			\
744	PCDATA  $PCDATA_StackMapIndex, $0;	\
745	MOVQ	(DX), R12;			\
746	CALL	R12;				\
747	/* copy register return values back */		\
748	MOVQ    regArgs+40(FP), R12;		\
749	CALL    ·spillArgs(SB);		\
750	MOVLQZX	stackArgsSize+24(FP), CX;		\
751	MOVLQZX	stackRetOffset+28(FP), BX;		\
752	MOVQ	stackArgs+16(FP), DI;		\
753	MOVQ	stackArgsType+0(FP), DX;		\
754	MOVQ	SP, SI;				\
755	ADDQ	BX, DI;				\
756	ADDQ	BX, SI;				\
757	SUBQ	BX, CX;				\
758	CALL	callRet<>(SB);			\
759	RET
760
761// callRet copies return values back at the end of call*. This is a
762// separate function so it can allocate stack space for the arguments
763// to reflectcallmove. It does not follow the Go ABI; it expects its
764// arguments in registers.
765TEXT callRet<>(SB), NOSPLIT, $40-0
766	NO_LOCAL_POINTERS
767	MOVQ	DX, 0(SP)
768	MOVQ	DI, 8(SP)
769	MOVQ	SI, 16(SP)
770	MOVQ	CX, 24(SP)
771	MOVQ	R12, 32(SP)
772	CALL	runtime·reflectcallmove(SB)
773	RET
774
CALLFN(·call16, 16)
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
CALLFN(·call256, 256)
CALLFN(·call512, 512)
CALLFN(·call1024, 1024)
CALLFN(·call2048, 2048)
CALLFN(·call4096, 4096)
CALLFN(·call8192, 8192)
CALLFN(·call16384, 16384)
CALLFN(·call32768, 32768)
CALLFN(·call65536, 65536)
CALLFN(·call131072, 131072)
CALLFN(·call262144, 262144)
CALLFN(·call524288, 524288)
CALLFN(·call1048576, 1048576)
CALLFN(·call2097152, 2097152)
CALLFN(·call4194304, 4194304)
CALLFN(·call8388608, 8388608)
CALLFN(·call16777216, 16777216)
CALLFN(·call33554432, 33554432)
CALLFN(·call67108864, 67108864)
CALLFN(·call134217728, 134217728)
CALLFN(·call268435456, 268435456)
CALLFN(·call536870912, 536870912)
CALLFN(·call1073741824, 1073741824)
802
803TEXT runtime·procyield(SB),NOSPLIT,$0-0
804	MOVL	cycles+0(FP), AX
805again:
806	PAUSE
807	SUBL	$1, AX
808	JNZ	again
809	RET
810
811
812TEXT ·publicationBarrier<ABIInternal>(SB),NOSPLIT,$0-0
813	// Stores are already ordered on x86, so this is just a
814	// compile barrier.
815	RET
816
817// Save state of caller into g->sched,
818// but using fake PC from systemstack_switch.
819// Must only be called from functions with frame pointer
820// and without locals ($0) or else unwinding from
821// systemstack_switch is incorrect.
822// Smashes R9.
823TEXT gosave_systemstack_switch<>(SB),NOSPLIT|NOFRAME,$0
824	// Take systemstack_switch PC and add 8 bytes to skip
825	// the prologue. The final location does not matter
826	// as long as we are between the prologue and the epilogue.
827	MOVQ	$runtime·systemstack_switch+8(SB), R9
828	MOVQ	R9, (g_sched+gobuf_pc)(R14)
829	LEAQ	8(SP), R9
830	MOVQ	R9, (g_sched+gobuf_sp)(R14)
831	MOVQ	$0, (g_sched+gobuf_ret)(R14)
832	MOVQ	BP, (g_sched+gobuf_bp)(R14)
833	// Assert ctxt is zero. See func save.
834	MOVQ	(g_sched+gobuf_ctxt)(R14), R9
835	TESTQ	R9, R9
836	JZ	2(PC)
837	CALL	runtime·abort(SB)
838	RET
839
840// func asmcgocall_no_g(fn, arg unsafe.Pointer)
841// Call fn(arg) aligned appropriately for the gcc ABI.
842// Called on a system stack, and there may be no g yet (during needm).
843TEXT ·asmcgocall_no_g(SB),NOSPLIT,$32-16
844	MOVQ	fn+0(FP), AX
845	MOVQ	arg+8(FP), BX
846	MOVQ	SP, DX
847	ANDQ	$~15, SP	// alignment
848	MOVQ	DX, 8(SP)
849	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
850	MOVQ	BX, CX		// CX = first argument in Win64
851	CALL	AX
852	MOVQ	8(SP), DX
853	MOVQ	DX, SP
854	RET
855
856// asmcgocall_landingpad calls AX with BX as argument.
857// Must be called on the system stack.
858TEXT ·asmcgocall_landingpad(SB),NOSPLIT,$0-0
859#ifdef GOOS_windows
860	// Make sure we have enough room for 4 stack-backed fast-call
861	// registers as per Windows amd64 calling convention.
862	ADJSP	$32
863	// On Windows, asmcgocall_landingpad acts as landing pad for exceptions
864	// thrown in the cgo call. Exceptions that reach this function will be
865	// handled by runtime.sehtramp thanks to the SEH metadata added
866	// by the compiler.
867	// Note that runtime.sehtramp can't be attached directly to asmcgocall
868	// because its initial stack pointer can be outside the system stack bounds,
869	// and Windows stops the stack unwinding without calling the exception handler
870	// when it reaches that point.
871	MOVQ	BX, CX		// CX = first argument in Win64
872	CALL	AX
873	// The exception handler is not called if the next instruction is part of
874	// the epilogue, which includes the RET instruction, so we need to add a NOP here.
875	BYTE	$0x90
876	ADJSP	$-32
877	RET
878#endif
879	// Tail call AX on non-Windows, as the extra stack frame is not needed.
880	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
881	JMP	AX
882
883// func asmcgocall(fn, arg unsafe.Pointer) int32
884// Call fn(arg) on the scheduler stack,
885// aligned appropriately for the gcc ABI.
886// See cgocall.go for more details.
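// Illustrative Go-side sequence (a sketch; see cgocall.go for the real code):
//	entersyscall()
//	errno := asmcgocall(fn, arg)	// runs the C function on the g0 stack
//	exitsyscall()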
887TEXT ·asmcgocall(SB),NOSPLIT,$0-20
888	MOVQ	fn+0(FP), AX
889	MOVQ	arg+8(FP), BX
890
891	MOVQ	SP, DX
892
893	// Figure out if we need to switch to m->g0 stack.
894	// We get called to create new OS threads too, and those
895	// come in on the m->g0 stack already. Or we might already
896	// be on the m->gsignal stack.
897	get_tls(CX)
898	MOVQ	g(CX), DI
899	CMPQ	DI, $0
900	JEQ	nosave
901	MOVQ	g_m(DI), R8
902	MOVQ	m_gsignal(R8), SI
903	CMPQ	DI, SI
904	JEQ	nosave
905	MOVQ	m_g0(R8), SI
906	CMPQ	DI, SI
907	JEQ	nosave
908
909	// Switch to system stack.
910	// The original frame pointer is stored in BP,
911	// which is useful for stack unwinding.
912	CALL	gosave_systemstack_switch<>(SB)
913	MOVQ	SI, g(CX)
914	MOVQ	(g_sched+gobuf_sp)(SI), SP
915
916	// Now on a scheduling stack (a pthread-created stack).
917	SUBQ	$16, SP
918	ANDQ	$~15, SP	// alignment for gcc ABI
919	MOVQ	DI, 8(SP)	// save g
920	MOVQ	(g_stack+stack_hi)(DI), DI
921	SUBQ	DX, DI
922	MOVQ	DI, 0(SP)	// save depth in stack (can't just save SP, as stack might be copied during a callback)
923	CALL	runtime·asmcgocall_landingpad(SB)
924
925	// Restore registers, g, stack pointer.
926	get_tls(CX)
927	MOVQ	8(SP), DI
928	MOVQ	(g_stack+stack_hi)(DI), SI
929	SUBQ	0(SP), SI
930	MOVQ	DI, g(CX)
931	MOVQ	SI, SP
932
933	MOVL	AX, ret+16(FP)
934	RET
935
936nosave:
937	// Running on a system stack, perhaps even without a g.
938	// Having no g can happen during thread creation or thread teardown
939	// (see needm/dropm on Solaris, for example).
940	// This code is like the above sequence but without saving/restoring g
941	// and without worrying about the stack moving out from under us
942	// (because we're on a system stack, not a goroutine stack).
943	// The above code could be used directly if already on a system stack,
944	// but then the only path through this code would be a rare case on Solaris.
945	// Using this code for all "already on system stack" calls exercises it more,
946	// which should help keep it correct.
947	SUBQ	$16, SP
948	ANDQ	$~15, SP
949	MOVQ	$0, 8(SP)		// where above code stores g, in case someone looks during debugging
950	MOVQ	DX, 0(SP)	// save original stack pointer
951	CALL	runtime·asmcgocall_landingpad(SB)
952	MOVQ	0(SP), SI	// restore original stack pointer
953	MOVQ	SI, SP
954	MOVL	AX, ret+16(FP)
955	RET
956
957#ifdef GOOS_windows
958// Dummy TLS that's used on Windows so that we don't crash trying
959// to restore the G register in needm. needm and its callees are
960// very careful never to actually use the G, the TLS just can't be
961// unset since we're in Go code.
962GLOBL zeroTLS<>(SB),RODATA,$const_tlsSize
963#endif
964
965// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
966// See cgocall.go for more details.
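//
// Illustrative path for an exported Go function called from C (a sketch):
// the generated _cgoexp wrapper is passed to crosscall2, which lands here in
// cgocallback before running the Go code.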
967TEXT ·cgocallback(SB),NOSPLIT,$24-24
968	NO_LOCAL_POINTERS
969
970	// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
	// It is used to drop the m while the thread is exiting.
972	MOVQ	fn+0(FP), AX
973	CMPQ	AX, $0
974	JNE	loadg
975	// Restore the g from frame.
976	get_tls(CX)
977	MOVQ	frame+8(FP), BX
978	MOVQ	BX, g(CX)
979	JMP	dropm
980
981loadg:
	// g is nil if Go did not create the current thread,
	// or, on pthread platforms, if this thread has never called into Go before.
984	// Call needm to obtain one m for temporary use.
985	// In this case, we're running on the thread stack, so there's
986	// lots of space, but the linker doesn't know. Hide the call from
987	// the linker analysis by using an indirect call through AX.
988	get_tls(CX)
989#ifdef GOOS_windows
990	MOVL	$0, BX
991	CMPQ	CX, $0
992	JEQ	2(PC)
993#endif
994	MOVQ	g(CX), BX
995	CMPQ	BX, $0
996	JEQ	needm
997	MOVQ	g_m(BX), BX
998	MOVQ	BX, savedm-8(SP)	// saved copy of oldm
999	JMP	havem
1000needm:
1001#ifdef GOOS_windows
1002	// Set up a dummy TLS value. needm is careful not to use it,
1003	// but it needs to be there to prevent autogenerated code from
1004	// crashing when it loads from it.
1005	// We don't need to clear it or anything later because needm
1006	// will set up TLS properly.
1007	MOVQ	$zeroTLS<>(SB), DI
1008	CALL	runtime·settls(SB)
1009#endif
1010	// On some platforms (Windows) we cannot call needm through
1011	// an ABI wrapper because there's no TLS set up, and the ABI
1012	// wrapper will try to restore the G register (R14) from TLS.
1013	// Clear X15 because Go expects it and we're not calling
1014	// through a wrapper, but otherwise avoid setting the G
1015	// register in the wrapper and call needm directly. It
1016	// takes no arguments and doesn't return any values so
1017	// there's no need to handle that. Clear R14 so that there's
1018	// a bad value in there, in case needm tries to use it.
1019	XORPS	X15, X15
1020	XORQ    R14, R14
1021	MOVQ	$runtime·needAndBindM<ABIInternal>(SB), AX
1022	CALL	AX
1023	MOVQ	$0, savedm-8(SP)
1024	get_tls(CX)
1025	MOVQ	g(CX), BX
1026	MOVQ	g_m(BX), BX
1027
1028	// Set m->sched.sp = SP, so that if a panic happens
1029	// during the function we are about to execute, it will
1030	// have a valid SP to run on the g0 stack.
1031	// The next few lines (after the havem label)
1032	// will save this SP onto the stack and then write
1033	// the same SP back to m->sched.sp. That seems redundant,
1034	// but if an unrecovered panic happens, unwindm will
1035	// restore the g->sched.sp from the stack location
1036	// and then systemstack will try to use it. If we don't set it here,
1037	// that restored SP will be uninitialized (typically 0) and
1038	// will not be usable.
1039	MOVQ	m_g0(BX), SI
1040	MOVQ	SP, (g_sched+gobuf_sp)(SI)
1041
1042havem:
1043	// Now there's a valid m, and we're running on its m->g0.
1044	// Save current m->g0->sched.sp on stack and then set it to SP.
1045	// Save current sp in m->g0->sched.sp in preparation for
1046	// switch back to m->curg stack.
1047	// NOTE: unwindm knows that the saved g->sched.sp is at 0(SP).
1048	MOVQ	m_g0(BX), SI
1049	MOVQ	(g_sched+gobuf_sp)(SI), AX
1050	MOVQ	AX, 0(SP)
1051	MOVQ	SP, (g_sched+gobuf_sp)(SI)
1052
1053	// Switch to m->curg stack and call runtime.cgocallbackg.
1054	// Because we are taking over the execution of m->curg
1055	// but *not* resuming what had been running, we need to
1056	// save that information (m->curg->sched) so we can restore it.
1057	// We can restore m->curg->sched.sp easily, because calling
1058	// runtime.cgocallbackg leaves SP unchanged upon return.
1059	// To save m->curg->sched.pc, we push it onto the curg stack and
1060	// open a frame the same size as cgocallback's g0 frame.
1061	// Once we switch to the curg stack, the pushed PC will appear
1062	// to be the return PC of cgocallback, so that the traceback
1063	// will seamlessly trace back into the earlier calls.
1064	MOVQ	m_curg(BX), SI
1065	MOVQ	SI, g(CX)
1066	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
1067	MOVQ	(g_sched+gobuf_pc)(SI), BX
1068	MOVQ	BX, -8(DI)  // "push" return PC on the g stack
1069	// Gather our arguments into registers.
1070	MOVQ	fn+0(FP), BX
1071	MOVQ	frame+8(FP), CX
1072	MOVQ	ctxt+16(FP), DX
1073	// Compute the size of the frame, including return PC and, if
1074	// GOEXPERIMENT=framepointer, the saved base pointer
1075	LEAQ	fn+0(FP), AX
1076	SUBQ	SP, AX   // AX is our actual frame size
1077	SUBQ	AX, DI   // Allocate the same frame size on the g stack
1078	MOVQ	DI, SP
1079
1080	MOVQ	BX, 0(SP)
1081	MOVQ	CX, 8(SP)
1082	MOVQ	DX, 16(SP)
1083	MOVQ	$runtime·cgocallbackg(SB), AX
1084	CALL	AX	// indirect call to bypass nosplit check. We're on a different stack now.
1085
1086	// Compute the size of the frame again. FP and SP have
1087	// completely different values here than they did above,
1088	// but only their difference matters.
1089	LEAQ	fn+0(FP), AX
1090	SUBQ	SP, AX
1091
1092	// Restore g->sched (== m->curg->sched) from saved values.
1093	get_tls(CX)
1094	MOVQ	g(CX), SI
1095	MOVQ	SP, DI
1096	ADDQ	AX, DI
1097	MOVQ	-8(DI), BX
1098	MOVQ	BX, (g_sched+gobuf_pc)(SI)
1099	MOVQ	DI, (g_sched+gobuf_sp)(SI)
1100
1101	// Switch back to m->g0's stack and restore m->g0->sched.sp.
1102	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
1103	// so we do not have to restore it.)
1104	MOVQ	g(CX), BX
1105	MOVQ	g_m(BX), BX
1106	MOVQ	m_g0(BX), SI
1107	MOVQ	SI, g(CX)
1108	MOVQ	(g_sched+gobuf_sp)(SI), SP
1109	MOVQ	0(SP), AX
1110	MOVQ	AX, (g_sched+gobuf_sp)(SI)
1111
1112	// If the m on entry was nil, we called needm above to borrow an m,
1113	// 1. for the duration of the call on non-pthread platforms,
1114	// 2. or the duration of the C thread alive on pthread platforms.
1115	// If the m on entry wasn't nil,
1116	// 1. the thread might be a Go thread,
1117	// 2. or it wasn't the first call from a C thread on pthread platforms,
1118	//    since then we skip dropm to reuse the m in the first call.
1119	MOVQ	savedm-8(SP), BX
1120	CMPQ	BX, $0
1121	JNE	done
1122
1123	// Skip dropm to reuse it in the next call, when a pthread key has been created.
1124	MOVQ	_cgo_pthread_key_created(SB), AX
	// A nil _cgo_pthread_key_created means cgo is disabled, so dropm is needed.
1126	CMPQ	AX, $0
1127	JEQ	dropm
1128	CMPQ	(AX), $0
1129	JNE	done
1130
1131dropm:
1132	MOVQ	$runtime·dropm(SB), AX
1133	CALL	AX
1134#ifdef GOOS_windows
1135	// We need to clear the TLS pointer in case the next
1136	// thread that comes into Go tries to reuse that space
1137	// but uses the same M.
1138	XORQ	DI, DI
1139	CALL	runtime·settls(SB)
1140#endif
1141done:
1142
1143	// Done!
1144	RET
1145
1146// func setg(gg *g)
1147// set g. for use by needm.
1148TEXT runtime·setg(SB), NOSPLIT, $0-8
1149	MOVQ	gg+0(FP), BX
1150	get_tls(CX)
1151	MOVQ	BX, g(CX)
1152	RET
1153
1154// void setg_gcc(G*); set g called from gcc.
1155TEXT setg_gcc<>(SB),NOSPLIT,$0
1156	get_tls(AX)
1157	MOVQ	DI, g(AX)
1158	MOVQ	DI, R14 // set the g register
1159	RET
1160
1161TEXT runtime·abort(SB),NOSPLIT,$0-0
1162	INT	$3
1163loop:
1164	JMP	loop
1165
1166// check that SP is in range [g->stack.lo, g->stack.hi)
1167TEXT runtime·stackcheck(SB), NOSPLIT|NOFRAME, $0-0
1168	get_tls(CX)
1169	MOVQ	g(CX), AX
1170	CMPQ	(g_stack+stack_hi)(AX), SP
1171	JHI	2(PC)
1172	CALL	runtime·abort(SB)
1173	CMPQ	SP, (g_stack+stack_lo)(AX)
1174	JHI	2(PC)
1175	CALL	runtime·abort(SB)
1176	RET
1177
1178// func cputicks() int64
1179TEXT runtime·cputicks(SB),NOSPLIT,$0-0
	CMPB	internal∕cpu·X86+const_offsetX86HasRDTSCP(SB), $1
1181	JNE	fences
1182	// Instruction stream serializing RDTSCP is supported.
1183	// RDTSCP is supported by Intel Nehalem (2008) and
1184	// AMD K8 Rev. F (2006) and newer.
1185	RDTSCP
1186done:
1187	SHLQ	$32, DX
1188	ADDQ	DX, AX
1189	MOVQ	AX, ret+0(FP)
1190	RET
1191fences:
1192	// MFENCE is instruction stream serializing and flushes the
1193	// store buffers on AMD. The serialization semantics of LFENCE on AMD
1194	// are dependent on MSR C001_1029 and CPU generation.
1195	// LFENCE on Intel does wait for all previous instructions to have executed.
	// Intel recommends MFENCE;LFENCE in its manuals before RDTSC to ensure that all
	// previous instructions have executed and all previous loads and stores are globally visible.
1198	// Using MFENCE;LFENCE here aligns the serializing properties without
1199	// runtime detection of CPU manufacturer.
1200	MFENCE
1201	LFENCE
1202	RDTSC
1203	JMP done
1204
1205// func memhash(p unsafe.Pointer, h, s uintptr) uintptr
1206// hash function using AES hardware instructions
1207TEXT runtime·memhash<ABIInternal>(SB),NOSPLIT,$0-32
1208	// AX = ptr to data
1209	// BX = seed
1210	// CX = size
1211	CMPB	runtime·useAeshash(SB), $0
1212	JEQ	noaes
1213	JMP	aeshashbody<>(SB)
1214noaes:
1215	JMP	runtime·memhashFallback<ABIInternal>(SB)
1216
1217// func strhash(p unsafe.Pointer, h uintptr) uintptr
1218TEXT runtime·strhash<ABIInternal>(SB),NOSPLIT,$0-24
1219	// AX = ptr to string struct
1220	// BX = seed
1221	CMPB	runtime·useAeshash(SB), $0
1222	JEQ	noaes
1223	MOVQ	8(AX), CX	// length of string
1224	MOVQ	(AX), AX	// string data
1225	JMP	aeshashbody<>(SB)
1226noaes:
1227	JMP	runtime·strhashFallback<ABIInternal>(SB)
1228
1229// AX: data
1230// BX: hash seed
1231// CX: length
1232// At return: AX = return value
1233TEXT aeshashbody<>(SB),NOSPLIT,$0-0
1234	// Fill an SSE register with our seeds.
1235	MOVQ	BX, X0				// 64 bits of per-table hash seed
1236	PINSRW	$4, CX, X0			// 16 bits of length
1237	PSHUFHW $0, X0, X0			// repeat length 4 times total
1238	MOVO	X0, X1				// save unscrambled seed
1239	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
1240	AESENC	X0, X0				// scramble seed
1241
1242	CMPQ	CX, $16
1243	JB	aes0to15
1244	JE	aes16
1245	CMPQ	CX, $32
1246	JBE	aes17to32
1247	CMPQ	CX, $64
1248	JBE	aes33to64
1249	CMPQ	CX, $128
1250	JBE	aes65to128
1251	JMP	aes129plus
1252
1253aes0to15:
1254	TESTQ	CX, CX
1255	JE	aes0
1256
1257	ADDQ	$16, AX
1258	TESTW	$0xff0, AX
1259	JE	endofpage
1260
1261	// 16 bytes loaded at this address won't cross
1262	// a page boundary, so we can load it directly.
1263	MOVOU	-16(AX), X1
1264	ADDQ	CX, CX
1265	MOVQ	$masks<>(SB), AX
1266	PAND	(AX)(CX*8), X1
1267final1:
1268	PXOR	X0, X1	// xor data with seed
1269	AESENC	X1, X1	// scramble combo 3 times
1270	AESENC	X1, X1
1271	AESENC	X1, X1
1272	MOVQ	X1, AX	// return X1
1273	RET
1274
1275endofpage:
1276	// address ends in 1111xxxx. Might be up against
1277	// a page boundary, so load ending at last byte.
1278	// Then shift bytes down using pshufb.
1279	MOVOU	-32(AX)(CX*1), X1
1280	ADDQ	CX, CX
1281	MOVQ	$shifts<>(SB), AX
1282	PSHUFB	(AX)(CX*8), X1
1283	JMP	final1
1284
1285aes0:
1286	// Return scrambled input seed
1287	AESENC	X0, X0
1288	MOVQ	X0, AX	// return X0
1289	RET
1290
1291aes16:
1292	MOVOU	(AX), X1
1293	JMP	final1
1294
1295aes17to32:
1296	// make second starting seed
1297	PXOR	runtime·aeskeysched+16(SB), X1
1298	AESENC	X1, X1
1299
1300	// load data to be hashed
1301	MOVOU	(AX), X2
1302	MOVOU	-16(AX)(CX*1), X3
1303
1304	// xor with seed
1305	PXOR	X0, X2
1306	PXOR	X1, X3
1307
1308	// scramble 3 times
1309	AESENC	X2, X2
1310	AESENC	X3, X3
1311	AESENC	X2, X2
1312	AESENC	X3, X3
1313	AESENC	X2, X2
1314	AESENC	X3, X3
1315
1316	// combine results
1317	PXOR	X3, X2
1318	MOVQ	X2, AX	// return X2
1319	RET
1320
1321aes33to64:
1322	// make 3 more starting seeds
1323	MOVO	X1, X2
1324	MOVO	X1, X3
1325	PXOR	runtime·aeskeysched+16(SB), X1
1326	PXOR	runtime·aeskeysched+32(SB), X2
1327	PXOR	runtime·aeskeysched+48(SB), X3
1328	AESENC	X1, X1
1329	AESENC	X2, X2
1330	AESENC	X3, X3
1331
1332	MOVOU	(AX), X4
1333	MOVOU	16(AX), X5
1334	MOVOU	-32(AX)(CX*1), X6
1335	MOVOU	-16(AX)(CX*1), X7
1336
1337	PXOR	X0, X4
1338	PXOR	X1, X5
1339	PXOR	X2, X6
1340	PXOR	X3, X7
1341
1342	AESENC	X4, X4
1343	AESENC	X5, X5
1344	AESENC	X6, X6
1345	AESENC	X7, X7
1346
1347	AESENC	X4, X4
1348	AESENC	X5, X5
1349	AESENC	X6, X6
1350	AESENC	X7, X7
1351
1352	AESENC	X4, X4
1353	AESENC	X5, X5
1354	AESENC	X6, X6
1355	AESENC	X7, X7
1356
1357	PXOR	X6, X4
1358	PXOR	X7, X5
1359	PXOR	X5, X4
1360	MOVQ	X4, AX	// return X4
1361	RET
1362
1363aes65to128:
1364	// make 7 more starting seeds
1365	MOVO	X1, X2
1366	MOVO	X1, X3
1367	MOVO	X1, X4
1368	MOVO	X1, X5
1369	MOVO	X1, X6
1370	MOVO	X1, X7
1371	PXOR	runtime·aeskeysched+16(SB), X1
1372	PXOR	runtime·aeskeysched+32(SB), X2
1373	PXOR	runtime·aeskeysched+48(SB), X3
1374	PXOR	runtime·aeskeysched+64(SB), X4
1375	PXOR	runtime·aeskeysched+80(SB), X5
1376	PXOR	runtime·aeskeysched+96(SB), X6
1377	PXOR	runtime·aeskeysched+112(SB), X7
1378	AESENC	X1, X1
1379	AESENC	X2, X2
1380	AESENC	X3, X3
1381	AESENC	X4, X4
1382	AESENC	X5, X5
1383	AESENC	X6, X6
1384	AESENC	X7, X7
1385
1386	// load data
1387	MOVOU	(AX), X8
1388	MOVOU	16(AX), X9
1389	MOVOU	32(AX), X10
1390	MOVOU	48(AX), X11
1391	MOVOU	-64(AX)(CX*1), X12
1392	MOVOU	-48(AX)(CX*1), X13
1393	MOVOU	-32(AX)(CX*1), X14
1394	MOVOU	-16(AX)(CX*1), X15
1395
1396	// xor with seed
1397	PXOR	X0, X8
1398	PXOR	X1, X9
1399	PXOR	X2, X10
1400	PXOR	X3, X11
1401	PXOR	X4, X12
1402	PXOR	X5, X13
1403	PXOR	X6, X14
1404	PXOR	X7, X15
1405
1406	// scramble 3 times
1407	AESENC	X8, X8
1408	AESENC	X9, X9
1409	AESENC	X10, X10
1410	AESENC	X11, X11
1411	AESENC	X12, X12
1412	AESENC	X13, X13
1413	AESENC	X14, X14
1414	AESENC	X15, X15
1415
1416	AESENC	X8, X8
1417	AESENC	X9, X9
1418	AESENC	X10, X10
1419	AESENC	X11, X11
1420	AESENC	X12, X12
1421	AESENC	X13, X13
1422	AESENC	X14, X14
1423	AESENC	X15, X15
1424
1425	AESENC	X8, X8
1426	AESENC	X9, X9
1427	AESENC	X10, X10
1428	AESENC	X11, X11
1429	AESENC	X12, X12
1430	AESENC	X13, X13
1431	AESENC	X14, X14
1432	AESENC	X15, X15
1433
1434	// combine results
1435	PXOR	X12, X8
1436	PXOR	X13, X9
1437	PXOR	X14, X10
1438	PXOR	X15, X11
1439	PXOR	X10, X8
1440	PXOR	X11, X9
1441	PXOR	X9, X8
1442	// X15 must be zero on return
1443	PXOR	X15, X15
1444	MOVQ	X8, AX	// return X8
1445	RET
1446
1447aes129plus:
1448	// make 7 more starting seeds
1449	MOVO	X1, X2
1450	MOVO	X1, X3
1451	MOVO	X1, X4
1452	MOVO	X1, X5
1453	MOVO	X1, X6
1454	MOVO	X1, X7
1455	PXOR	runtime·aeskeysched+16(SB), X1
1456	PXOR	runtime·aeskeysched+32(SB), X2
1457	PXOR	runtime·aeskeysched+48(SB), X3
1458	PXOR	runtime·aeskeysched+64(SB), X4
1459	PXOR	runtime·aeskeysched+80(SB), X5
1460	PXOR	runtime·aeskeysched+96(SB), X6
1461	PXOR	runtime·aeskeysched+112(SB), X7
1462	AESENC	X1, X1
1463	AESENC	X2, X2
1464	AESENC	X3, X3
1465	AESENC	X4, X4
1466	AESENC	X5, X5
1467	AESENC	X6, X6
1468	AESENC	X7, X7
1469
1470	// start with last (possibly overlapping) block
1471	MOVOU	-128(AX)(CX*1), X8
1472	MOVOU	-112(AX)(CX*1), X9
1473	MOVOU	-96(AX)(CX*1), X10
1474	MOVOU	-80(AX)(CX*1), X11
1475	MOVOU	-64(AX)(CX*1), X12
1476	MOVOU	-48(AX)(CX*1), X13
1477	MOVOU	-32(AX)(CX*1), X14
1478	MOVOU	-16(AX)(CX*1), X15
1479
1480	// xor in seed
1481	PXOR	X0, X8
1482	PXOR	X1, X9
1483	PXOR	X2, X10
1484	PXOR	X3, X11
1485	PXOR	X4, X12
1486	PXOR	X5, X13
1487	PXOR	X6, X14
1488	PXOR	X7, X15
1489
1490	// compute number of remaining 128-byte blocks
1491	DECQ	CX
1492	SHRQ	$7, CX
1493
1494	PCALIGN $16
1495aesloop:
1496	// scramble state
1497	AESENC	X8, X8
1498	AESENC	X9, X9
1499	AESENC	X10, X10
1500	AESENC	X11, X11
1501	AESENC	X12, X12
1502	AESENC	X13, X13
1503	AESENC	X14, X14
1504	AESENC	X15, X15
1505
1506	// scramble state, xor in a block
1507	MOVOU	(AX), X0
1508	MOVOU	16(AX), X1
1509	MOVOU	32(AX), X2
1510	MOVOU	48(AX), X3
1511	AESENC	X0, X8
1512	AESENC	X1, X9
1513	AESENC	X2, X10
1514	AESENC	X3, X11
1515	MOVOU	64(AX), X4
1516	MOVOU	80(AX), X5
1517	MOVOU	96(AX), X6
1518	MOVOU	112(AX), X7
1519	AESENC	X4, X12
1520	AESENC	X5, X13
1521	AESENC	X6, X14
1522	AESENC	X7, X15
1523
1524	ADDQ	$128, AX
1525	DECQ	CX
1526	JNE	aesloop
1527
1528	// 3 more scrambles to finish
1529	AESENC	X8, X8
1530	AESENC	X9, X9
1531	AESENC	X10, X10
1532	AESENC	X11, X11
1533	AESENC	X12, X12
1534	AESENC	X13, X13
1535	AESENC	X14, X14
1536	AESENC	X15, X15
1537	AESENC	X8, X8
1538	AESENC	X9, X9
1539	AESENC	X10, X10
1540	AESENC	X11, X11
1541	AESENC	X12, X12
1542	AESENC	X13, X13
1543	AESENC	X14, X14
1544	AESENC	X15, X15
1545	AESENC	X8, X8
1546	AESENC	X9, X9
1547	AESENC	X10, X10
1548	AESENC	X11, X11
1549	AESENC	X12, X12
1550	AESENC	X13, X13
1551	AESENC	X14, X14
1552	AESENC	X15, X15
1553
1554	PXOR	X12, X8
1555	PXOR	X13, X9
1556	PXOR	X14, X10
1557	PXOR	X15, X11
1558	PXOR	X10, X8
1559	PXOR	X11, X9
1560	PXOR	X9, X8
1561	// X15 must be zero on return
1562	PXOR	X15, X15
1563	MOVQ	X8, AX	// return X8
1564	RET
1565
1566// func memhash32(p unsafe.Pointer, h uintptr) uintptr
1567// ABIInternal for performance.
1568TEXT runtime·memhash32<ABIInternal>(SB),NOSPLIT,$0-24
1569	// AX = ptr to data
1570	// BX = seed
1571	CMPB	runtime·useAeshash(SB), $0
1572	JEQ	noaes
1573	MOVQ	BX, X0	// X0 = seed
1574	PINSRD	$2, (AX), X0	// data
1575	AESENC	runtime·aeskeysched+0(SB), X0
1576	AESENC	runtime·aeskeysched+16(SB), X0
1577	AESENC	runtime·aeskeysched+32(SB), X0
1578	MOVQ	X0, AX	// return X0
1579	RET
1580noaes:
1581	JMP	runtime·memhash32Fallback<ABIInternal>(SB)
1582
1583// func memhash64(p unsafe.Pointer, h uintptr) uintptr
1584// ABIInternal for performance.
1585TEXT runtime·memhash64<ABIInternal>(SB),NOSPLIT,$0-24
1586	// AX = ptr to data
1587	// BX = seed
1588	CMPB	runtime·useAeshash(SB), $0
1589	JEQ	noaes
1590	MOVQ	BX, X0	// X0 = seed
1591	PINSRQ	$1, (AX), X0	// data
1592	AESENC	runtime·aeskeysched+0(SB), X0
1593	AESENC	runtime·aeskeysched+16(SB), X0
1594	AESENC	runtime·aeskeysched+32(SB), X0
1595	MOVQ	X0, AX	// return X0
1596	RET
1597noaes:
1598	JMP	runtime·memhash64Fallback<ABIInternal>(SB)
1599
1600// simple mask to get rid of data in the high part of the register.
1601DATA masks<>+0x00(SB)/8, $0x0000000000000000
1602DATA masks<>+0x08(SB)/8, $0x0000000000000000
1603DATA masks<>+0x10(SB)/8, $0x00000000000000ff
1604DATA masks<>+0x18(SB)/8, $0x0000000000000000
1605DATA masks<>+0x20(SB)/8, $0x000000000000ffff
1606DATA masks<>+0x28(SB)/8, $0x0000000000000000
1607DATA masks<>+0x30(SB)/8, $0x0000000000ffffff
1608DATA masks<>+0x38(SB)/8, $0x0000000000000000
1609DATA masks<>+0x40(SB)/8, $0x00000000ffffffff
1610DATA masks<>+0x48(SB)/8, $0x0000000000000000
1611DATA masks<>+0x50(SB)/8, $0x000000ffffffffff
1612DATA masks<>+0x58(SB)/8, $0x0000000000000000
1613DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff
1614DATA masks<>+0x68(SB)/8, $0x0000000000000000
1615DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff
1616DATA masks<>+0x78(SB)/8, $0x0000000000000000
1617DATA masks<>+0x80(SB)/8, $0xffffffffffffffff
1618DATA masks<>+0x88(SB)/8, $0x0000000000000000
1619DATA masks<>+0x90(SB)/8, $0xffffffffffffffff
1620DATA masks<>+0x98(SB)/8, $0x00000000000000ff
1621DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff
1622DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
1623DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff
1624DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
1625DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff
1626DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
1627DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff
1628DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
1629DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff
1630DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
1631DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff
1632DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
1633GLOBL masks<>(SB),RODATA,$256
1634
1635// func checkASM() bool
1636TEXT ·checkASM(SB),NOSPLIT,$0-1
1637	// check that masks<>(SB) and shifts<>(SB) are aligned to 16-byte
1638	MOVQ	$masks<>(SB), AX
1639	MOVQ	$shifts<>(SB), BX
1640	ORQ	BX, AX
1641	TESTQ	$15, AX
1642	SETEQ	ret+0(FP)
1643	RET
1644
1645// these are arguments to pshufb. They move data down from
1646// the high bytes of the register to the low bytes of the register.
1647// index is how many bytes to move.
1648DATA shifts<>+0x00(SB)/8, $0x0000000000000000
1649DATA shifts<>+0x08(SB)/8, $0x0000000000000000
1650DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
1651DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
1652DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
1653DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
1654DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
1655DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
1656DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
1657DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
1658DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
1659DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
1660DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
1661DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
1662DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
1663DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
1664DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
1665DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
1666DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
1667DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
1668DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
1669DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
1670DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
1671DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
1672DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
1673DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
1674DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
1675DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
1676DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
1677DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
1678DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
1679DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
1680GLOBL shifts<>(SB),RODATA,$256
1681
1682TEXT runtime·return0(SB), NOSPLIT, $0
1683	MOVL	$0, AX
1684	RET
1685
1686
1687// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
1688// Must obey the gcc calling convention.
1689TEXT _cgo_topofstack(SB),NOSPLIT,$0
1690	get_tls(CX)
1691	MOVQ	g(CX), AX
1692	MOVQ	g_m(AX), AX
1693	MOVQ	m_curg(AX), AX
1694	MOVQ	(g_stack+stack_hi)(AX), AX
1695	RET
1696
1697// The top-most function running on a goroutine
1698// returns to goexit+PCQuantum.
1699TEXT runtime·goexit(SB),NOSPLIT|TOPFRAME|NOFRAME,$0-0
1700	BYTE	$0x90	// NOP
1701	CALL	runtime·goexit1(SB)	// does not return
1702	// traceback from goexit1 must hit code range of goexit
1703	BYTE	$0x90	// NOP
1704
1705// This is called from .init_array and follows the platform, not Go, ABI.
1706TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
1707	PUSHQ	R15 // The access to global variables below implicitly uses R15, which is callee-save
1708	MOVQ	runtime·lastmoduledatap(SB), AX
1709	MOVQ	DI, moduledata_next(AX)
1710	MOVQ	DI, runtime·lastmoduledatap(SB)
1711	POPQ	R15
1712	RET
1713
1714// Initialize special registers then jump to sigpanic.
1715// This function is injected from the signal handler for panicking
1716// signals. It is quite painful to set X15 in the signal context,
1717// so we do it here.
1718TEXT ·sigpanic0(SB),NOSPLIT,$0-0
1719	get_tls(R14)
1720	MOVQ	g(R14), R14
1721#ifndef GOOS_plan9
1722	XORPS	X15, X15
1723#endif
1724	JMP	·sigpanic<ABIInternal>(SB)
1725
1726// gcWriteBarrier informs the GC about heap pointer writes.
1727//
1728// gcWriteBarrier returns space in a write barrier buffer which
1729// should be filled in by the caller.
1730// gcWriteBarrier does NOT follow the Go ABI. It accepts the
1731// number of bytes of buffer needed in R11, and returns a pointer
1732// to the buffer space in R11.
1733// It clobbers FLAGS. It does not clobber any general-purpose registers,
1734// but may clobber others (e.g., SSE registers).
1735// Typical use would be, when doing *(CX+88) = AX
1736//     CMPL    $0, runtime.writeBarrier(SB)
1737//     JEQ     dowrite
1738//     CALL    runtime.gcBatchBarrier2(SB)
1739//     MOVQ    AX, (R11)
1740//     MOVQ    88(CX), DX
1741//     MOVQ    DX, 8(R11)
1742// dowrite:
1743//     MOVQ    AX, 88(CX)
1744TEXT gcWriteBarrier<>(SB),NOSPLIT,$112
1745	// Save the registers clobbered by the fast path. This is slightly
1746	// faster than having the caller spill these.
1747	MOVQ	R12, 96(SP)
1748	MOVQ	R13, 104(SP)
1749retry:
1750	// TODO: Consider passing g.m.p in as an argument so they can be shared
1751	// across a sequence of write barriers.
1752	MOVQ	g_m(R14), R13
1753	MOVQ	m_p(R13), R13
1754	// Get current buffer write position.
1755	MOVQ	(p_wbBuf+wbBuf_next)(R13), R12	// original next position
1756	ADDQ	R11, R12			// new next position
1757	// Is the buffer full?
1758	CMPQ	R12, (p_wbBuf+wbBuf_end)(R13)
1759	JA	flush
1760	// Commit to the larger buffer.
1761	MOVQ	R12, (p_wbBuf+wbBuf_next)(R13)
1762	// Make return value (the original next position)
1763	SUBQ	R11, R12
1764	MOVQ	R12, R11
1765	// Restore registers.
1766	MOVQ	96(SP), R12
1767	MOVQ	104(SP), R13
1768	RET
1769
1770flush:
1771	// Save all general purpose registers since these could be
1772	// clobbered by wbBufFlush and were not saved by the caller.
1773	// It is possible for wbBufFlush to clobber other registers
1774	// (e.g., SSE registers), but the compiler takes care of saving
1775	// those in the caller if necessary. This strikes a balance
1776	// with registers that are likely to be used.
1777	//
1778	// We don't have type information for these, but all code under
1779	// here is NOSPLIT, so nothing will observe these.
1780	//
1781	// TODO: We could strike a different balance; e.g., saving X0
1782	// and not saving GP registers that are less likely to be used.
1783	MOVQ	DI, 0(SP)
1784	MOVQ	AX, 8(SP)
1785	MOVQ	BX, 16(SP)
1786	MOVQ	CX, 24(SP)
1787	MOVQ	DX, 32(SP)
1788	// DI already saved
1789	MOVQ	SI, 40(SP)
1790	MOVQ	BP, 48(SP)
1791	MOVQ	R8, 56(SP)
1792	MOVQ	R9, 64(SP)
1793	MOVQ	R10, 72(SP)
1794	MOVQ	R11, 80(SP)
1795	// R12 already saved
1796	// R13 already saved
1797	// R14 is g
1798	MOVQ	R15, 88(SP)
1799
1800	CALL	runtime·wbBufFlush(SB)
1801
1802	MOVQ	0(SP), DI
1803	MOVQ	8(SP), AX
1804	MOVQ	16(SP), BX
1805	MOVQ	24(SP), CX
1806	MOVQ	32(SP), DX
1807	MOVQ	40(SP), SI
1808	MOVQ	48(SP), BP
1809	MOVQ	56(SP), R8
1810	MOVQ	64(SP), R9
1811	MOVQ	72(SP), R10
1812	MOVQ	80(SP), R11
1813	MOVQ	88(SP), R15
1814	JMP	retry
1815
1816TEXT runtime·gcWriteBarrier1<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1817	MOVL   $8, R11
1818	JMP     gcWriteBarrier<>(SB)
1819TEXT runtime·gcWriteBarrier2<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1820	MOVL   $16, R11
1821	JMP     gcWriteBarrier<>(SB)
1822TEXT runtime·gcWriteBarrier3<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1823	MOVL   $24, R11
1824	JMP     gcWriteBarrier<>(SB)
1825TEXT runtime·gcWriteBarrier4<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1826	MOVL   $32, R11
1827	JMP     gcWriteBarrier<>(SB)
1828TEXT runtime·gcWriteBarrier5<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1829	MOVL   $40, R11
1830	JMP     gcWriteBarrier<>(SB)
1831TEXT runtime·gcWriteBarrier6<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1832	MOVL   $48, R11
1833	JMP     gcWriteBarrier<>(SB)
1834TEXT runtime·gcWriteBarrier7<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1835	MOVL   $56, R11
1836	JMP     gcWriteBarrier<>(SB)
1837TEXT runtime·gcWriteBarrier8<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
1838	MOVL   $64, R11
1839	JMP     gcWriteBarrier<>(SB)
1840
1841DATA	debugCallFrameTooLarge<>+0x00(SB)/20, $"call frame too large"
1842GLOBL	debugCallFrameTooLarge<>(SB), RODATA, $20	// Size duplicated below
1843
1844// debugCallV2 is the entry point for debugger-injected function
1845// calls on running goroutines. It informs the runtime that a
1846// debug call has been injected and creates a call frame for the
1847// debugger to fill in.
1848//
1849// To inject a function call, a debugger should:
1850// 1. Check that the goroutine is in state _Grunning and that
1851//    there are at least 256 bytes free on the stack.
1852// 2. Push the current PC on the stack (updating SP).
1853// 3. Write the desired argument frame size at SP-16 (using the SP
1854//    after step 2).
1855// 4. Save all machine registers (including flags and XMM registers)
1856//    so they can be restored later by the debugger.
1857// 5. Set the PC to debugCallV2 and resume execution.
1858//
1859// If the goroutine is in state _Grunnable, then it's not generally
1860// safe to inject a call because it may return out via other runtime
1861// operations. Instead, the debugger should unwind the stack to find
1862// the return to non-runtime code, add a temporary breakpoint there,
1863// and inject the call once that breakpoint is hit.
1864//
1865// If the goroutine is in any other state, it's not safe to inject a call.
1866//
1867// This function communicates back to the debugger by setting R12 and
1868// invoking INT3 to raise a breakpoint signal. See the comments in the
1869// implementation for the protocol the debugger is expected to
1870// follow. InjectDebugCall in the runtime tests demonstrates this protocol.
1871//
1872// The debugger must ensure that any pointers passed to the function
1873// obey escape analysis requirements. Specifically, it must not pass
1874// a stack pointer to an escaping argument. debugCallV2 cannot check
1875// this invariant.
1876//
1877// This is ABIInternal because Go code injects its PC directly into new
1878// goroutine stacks.
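// As a rough schematic of the debugger side of this protocol (a sketch only:
// readReg, writeReg, writeMem, saveRegs, restoreRegs, resumeUntilINT3,
// addrOf and argFrameSize stand in for whatever primitives the debugger
// actually has; they are not real APIs):
//
//	sp := readReg("SP") - 8
//	writeMem(sp, readReg("PC"))           // step 2: push the current PC
//	writeReg("SP", sp)
//	writeMem(sp-16, uint64(argFrameSize)) // step 3: argument frame size
//	saved := saveRegs()                   // step 4
//	writeReg("PC", addrOf("runtime.debugCallV2")) // step 5
//	for {
//		resumeUntilINT3()
//		switch readReg("R12") {
//		case 0:  // frame ready: write args at SP, set arg registers,
//		         // push the trapping PC, set PC to the target function
//		case 1:  // call returned: results are at SP and in registers
//		case 2:  // call panicked: the panic interface{} is at SP
//		case 8:  // rejected: error string pointer and length are at SP
//		case 16: // done: restore everything but PC and SP, then resume
//			restoreRegs(saved)
//			return
//		}
//	}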
1879TEXT runtime·debugCallV2<ABIInternal>(SB),NOSPLIT,$152-0
1880	// Save all registers that may contain pointers so they can be
1881	// conservatively scanned.
1882	//
1883	// We can't do anything that might clobber any of these
1884	// registers before this.
1885	MOVQ	R15, r15-(14*8+8)(SP)
1886	MOVQ	R14, r14-(13*8+8)(SP)
1887	MOVQ	R13, r13-(12*8+8)(SP)
1888	MOVQ	R12, r12-(11*8+8)(SP)
1889	MOVQ	R11, r11-(10*8+8)(SP)
1890	MOVQ	R10, r10-(9*8+8)(SP)
1891	MOVQ	R9, r9-(8*8+8)(SP)
1892	MOVQ	R8, r8-(7*8+8)(SP)
1893	MOVQ	DI, di-(6*8+8)(SP)
1894	MOVQ	SI, si-(5*8+8)(SP)
1895	MOVQ	BP, bp-(4*8+8)(SP)
1896	MOVQ	BX, bx-(3*8+8)(SP)
1897	MOVQ	DX, dx-(2*8+8)(SP)
1898	// Save the frame size before we clobber it. Either of the last
1899	// saves could clobber this depending on whether there's a saved BP.
1900	MOVQ	frameSize-24(FP), DX	// aka -16(RSP) before prologue
1901	MOVQ	CX, cx-(1*8+8)(SP)
1902	MOVQ	AX, ax-(0*8+8)(SP)
1903
1904	// Save the argument frame size.
1905	MOVQ	DX, frameSize-128(SP)
1906
1907	// Perform a safe-point check.
1908	MOVQ	retpc-8(FP), AX	// Caller's PC
1909	MOVQ	AX, 0(SP)
1910	CALL	runtime·debugCallCheck(SB)
1911	MOVQ	8(SP), AX
1912	TESTQ	AX, AX
1913	JZ	good
1914	// The safety check failed. Put the reason string at the top
1915	// of the stack.
1916	MOVQ	AX, 0(SP)
1917	MOVQ	16(SP), AX
1918	MOVQ	AX, 8(SP)
1919	// Set R12 to 8 and invoke INT3. The debugger should get the
1920	// reason a call can't be injected from the top of the stack
1921	// and resume execution.
1922	MOVQ	$8, R12
1923	BYTE	$0xcc
1924	JMP	restore
1925
1926good:
1927	// Registers are saved and it's safe to make a call.
1928	// Open up a call frame, moving the stack if necessary.
1929	//
1930	// Once the frame is allocated, this will set R12 to 0 and
1931	// invoke INT3. The debugger should write the argument
1932	// frame for the call at SP, set up argument registers, push
1933	// the trapping PC on the stack, set the PC to the function to
1934	// call, set RDX to point to the closure (if a closure call),
1935	// and resume execution.
1936	//
1937	// If the function returns, this will set R12 to 1 and invoke
1938	// INT3. The debugger can then inspect any return value saved
1939	// on the stack at SP and in registers and resume execution again.
1940	//
1941	// If the function panics, this will set R12 to 2 and invoke INT3.
1942	// The interface{} value of the panic will be at SP. The debugger
1943	// can inspect the panic value and resume execution again.
1944#define DEBUG_CALL_DISPATCH(NAME,MAXSIZE)	\
1945	CMPQ	AX, $MAXSIZE;			\
1946	JA	5(PC);				\
1947	MOVQ	$NAME(SB), AX;			\
1948	MOVQ	AX, 0(SP);			\
1949	CALL	runtime·debugCallWrap(SB);	\
1950	JMP	restore
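// Note: in each DEBUG_CALL_DISPATCH expansion, JA 5(PC) branches five
// instructions forward from the JA itself, skipping the remaining four
// instructions of that expansion and landing on the next size's CMPQ (or,
// after the largest size, on the frame-too-large error path below).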
1951
1952	MOVQ	frameSize-128(SP), AX
1953	DEBUG_CALL_DISPATCH(debugCall32<>, 32)
1954	DEBUG_CALL_DISPATCH(debugCall64<>, 64)
1955	DEBUG_CALL_DISPATCH(debugCall128<>, 128)
1956	DEBUG_CALL_DISPATCH(debugCall256<>, 256)
1957	DEBUG_CALL_DISPATCH(debugCall512<>, 512)
1958	DEBUG_CALL_DISPATCH(debugCall1024<>, 1024)
1959	DEBUG_CALL_DISPATCH(debugCall2048<>, 2048)
1960	DEBUG_CALL_DISPATCH(debugCall4096<>, 4096)
1961	DEBUG_CALL_DISPATCH(debugCall8192<>, 8192)
1962	DEBUG_CALL_DISPATCH(debugCall16384<>, 16384)
1963	DEBUG_CALL_DISPATCH(debugCall32768<>, 32768)
1964	DEBUG_CALL_DISPATCH(debugCall65536<>, 65536)
1965	// The frame size is too large. Report the error.
1966	MOVQ	$debugCallFrameTooLarge<>(SB), AX
1967	MOVQ	AX, 0(SP)
1968	MOVQ	$20, 8(SP) // length of debugCallFrameTooLarge string
1969	MOVQ	$8, R12
1970	BYTE	$0xcc
1971	JMP	restore
1972
1973restore:
1974	// Calls and failures resume here.
1975	//
1976	// Set R12 to 16 and invoke INT3. The debugger should restore
1977	// all registers except RIP and RSP and resume execution.
1978	MOVQ	$16, R12
1979	BYTE	$0xcc
1980	// We must not modify flags after this point.
1981
1982	// Restore pointer-containing registers, which may have been
1983	// modified from the debugger's copy by stack copying.
1984	MOVQ	ax-(0*8+8)(SP), AX
1985	MOVQ	cx-(1*8+8)(SP), CX
1986	MOVQ	dx-(2*8+8)(SP), DX
1987	MOVQ	bx-(3*8+8)(SP), BX
1988	MOVQ	bp-(4*8+8)(SP), BP
1989	MOVQ	si-(5*8+8)(SP), SI
1990	MOVQ	di-(6*8+8)(SP), DI
1991	MOVQ	r8-(7*8+8)(SP), R8
1992	MOVQ	r9-(8*8+8)(SP), R9
1993	MOVQ	r10-(9*8+8)(SP), R10
1994	MOVQ	r11-(10*8+8)(SP), R11
1995	MOVQ	r12-(11*8+8)(SP), R12
1996	MOVQ	r13-(12*8+8)(SP), R13
1997	MOVQ	r14-(13*8+8)(SP), R14
1998	MOVQ	r15-(14*8+8)(SP), R15
1999
2000	RET
2001
2002// runtime.debugCallCheck assumes that functions defined with the
2003// DEBUG_CALL_FN macro are safe points to inject calls.
2004#define DEBUG_CALL_FN(NAME,MAXSIZE)		\
2005TEXT NAME(SB),WRAPPER,$MAXSIZE-0;		\
2006	NO_LOCAL_POINTERS;			\
2007	MOVQ	$0, R12;				\
2008	BYTE	$0xcc;				\
2009	MOVQ	$1, R12;				\
2010	BYTE	$0xcc;				\
2011	RET
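// For reference, DEBUG_CALL_FN(debugCall32<>, 32) expands to the following
// (explanatory comments added; the R12 values follow the protocol described
// above debugCallV2):
//
//	TEXT debugCall32<>(SB),WRAPPER,$32-0
//		NO_LOCAL_POINTERS
//		MOVQ	$0, R12		// frame ready: debugger writes args, sets PC to the target
//		BYTE	$0xcc		// INT3
//		MOVQ	$1, R12		// call returned: debugger reads results at SP / in registers
//		BYTE	$0xcc		// INT3
//		RET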
2012DEBUG_CALL_FN(debugCall32<>, 32)
2013DEBUG_CALL_FN(debugCall64<>, 64)
2014DEBUG_CALL_FN(debugCall128<>, 128)
2015DEBUG_CALL_FN(debugCall256<>, 256)
2016DEBUG_CALL_FN(debugCall512<>, 512)
2017DEBUG_CALL_FN(debugCall1024<>, 1024)
2018DEBUG_CALL_FN(debugCall2048<>, 2048)
2019DEBUG_CALL_FN(debugCall4096<>, 4096)
2020DEBUG_CALL_FN(debugCall8192<>, 8192)
2021DEBUG_CALL_FN(debugCall16384<>, 16384)
2022DEBUG_CALL_FN(debugCall32768<>, 32768)
2023DEBUG_CALL_FN(debugCall65536<>, 65536)
2024
// debugCallPanicked reports a panic from an injected call to the debugger:
// it copies the panic value's type and data words to the top of the stack
// and raises INT3 with R12 set to 2 (see the protocol above).
//
// func debugCallPanicked(val interface{})
2026TEXT runtime·debugCallPanicked(SB),NOSPLIT,$16-16
2027	// Copy the panic value to the top of stack.
2028	MOVQ	val_type+0(FP), AX
2029	MOVQ	AX, 0(SP)
2030	MOVQ	val_data+8(FP), AX
2031	MOVQ	AX, 8(SP)
2032	MOVQ	$2, R12
2033	BYTE	$0xcc
2034	RET
2035
2036// Note: these functions use a special calling convention to save generated code space.
// Arguments are passed in registers, but the space for those arguments is allocated
2038// in the caller's stack frame. These stubs write the args into that stack space and
2039// then tail call to the corresponding runtime handler.
2040// The tail call makes these stubs disappear in backtraces.
2041// Defined as ABIInternal since they do not use the stack-based Go ABI.
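// For example, the bounds check compiled for
//
//	func elem(s []int, i int) int { return s[i] }
//
// tests i against len(s) and, when the check fails, calls runtime·panicIndex
// with the failing index and the length already in registers (index in AX and
// length in CX here, as implied by the MOVQ CX, BX shuffle below); the stub
// then tail calls runtime·goPanicIndex, which raises the panic. This is a
// sketch of how the stubs are reached, not the exact code the compiler emits.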
2042TEXT runtime·panicIndex<ABIInternal>(SB),NOSPLIT,$0-16
2043	MOVQ	CX, BX
2044	JMP	runtime·goPanicIndex<ABIInternal>(SB)
2045TEXT runtime·panicIndexU<ABIInternal>(SB),NOSPLIT,$0-16
2046	MOVQ	CX, BX
2047	JMP	runtime·goPanicIndexU<ABIInternal>(SB)
2048TEXT runtime·panicSliceAlen<ABIInternal>(SB),NOSPLIT,$0-16
2049	MOVQ	CX, AX
2050	MOVQ	DX, BX
2051	JMP	runtime·goPanicSliceAlen<ABIInternal>(SB)
2052TEXT runtime·panicSliceAlenU<ABIInternal>(SB),NOSPLIT,$0-16
2053	MOVQ	CX, AX
2054	MOVQ	DX, BX
2055	JMP	runtime·goPanicSliceAlenU<ABIInternal>(SB)
2056TEXT runtime·panicSliceAcap<ABIInternal>(SB),NOSPLIT,$0-16
2057	MOVQ	CX, AX
2058	MOVQ	DX, BX
2059	JMP	runtime·goPanicSliceAcap<ABIInternal>(SB)
2060TEXT runtime·panicSliceAcapU<ABIInternal>(SB),NOSPLIT,$0-16
2061	MOVQ	CX, AX
2062	MOVQ	DX, BX
2063	JMP	runtime·goPanicSliceAcapU<ABIInternal>(SB)
2064TEXT runtime·panicSliceB<ABIInternal>(SB),NOSPLIT,$0-16
2065	MOVQ	CX, BX
2066	JMP	runtime·goPanicSliceB<ABIInternal>(SB)
2067TEXT runtime·panicSliceBU<ABIInternal>(SB),NOSPLIT,$0-16
2068	MOVQ	CX, BX
2069	JMP	runtime·goPanicSliceBU<ABIInternal>(SB)
2070TEXT runtime·panicSlice3Alen<ABIInternal>(SB),NOSPLIT,$0-16
2071	MOVQ	DX, AX
2072	JMP	runtime·goPanicSlice3Alen<ABIInternal>(SB)
2073TEXT runtime·panicSlice3AlenU<ABIInternal>(SB),NOSPLIT,$0-16
2074	MOVQ	DX, AX
2075	JMP	runtime·goPanicSlice3AlenU<ABIInternal>(SB)
2076TEXT runtime·panicSlice3Acap<ABIInternal>(SB),NOSPLIT,$0-16
2077	MOVQ	DX, AX
2078	JMP	runtime·goPanicSlice3Acap<ABIInternal>(SB)
2079TEXT runtime·panicSlice3AcapU<ABIInternal>(SB),NOSPLIT,$0-16
2080	MOVQ	DX, AX
2081	JMP	runtime·goPanicSlice3AcapU<ABIInternal>(SB)
2082TEXT runtime·panicSlice3B<ABIInternal>(SB),NOSPLIT,$0-16
2083	MOVQ	CX, AX
2084	MOVQ	DX, BX
2085	JMP	runtime·goPanicSlice3B<ABIInternal>(SB)
2086TEXT runtime·panicSlice3BU<ABIInternal>(SB),NOSPLIT,$0-16
2087	MOVQ	CX, AX
2088	MOVQ	DX, BX
2089	JMP	runtime·goPanicSlice3BU<ABIInternal>(SB)
2090TEXT runtime·panicSlice3C<ABIInternal>(SB),NOSPLIT,$0-16
2091	MOVQ	CX, BX
2092	JMP	runtime·goPanicSlice3C<ABIInternal>(SB)
2093TEXT runtime·panicSlice3CU<ABIInternal>(SB),NOSPLIT,$0-16
2094	MOVQ	CX, BX
2095	JMP	runtime·goPanicSlice3CU<ABIInternal>(SB)
2096TEXT runtime·panicSliceConvert<ABIInternal>(SB),NOSPLIT,$0-16
2097	MOVQ	DX, AX
2098	JMP	runtime·goPanicSliceConvert<ABIInternal>(SB)
2099
2100#ifdef GOOS_android
// Use the free TLS_SLOT_APP slot #2 on Android Q.
// Earlier Android versions are set up in gcc_android.c.
// The initial value is the slot's byte offset: slot #2 * 8 bytes = 16.
DATA runtime·tls_g+0(SB)/8, $16
2104GLOBL runtime·tls_g+0(SB), NOPTR, $8
2105#endif
2106#ifdef GOOS_windows
2107GLOBL runtime·tls_g+0(SB), NOPTR, $8
2108#endif
2109
2110// The compiler and assembler's -spectre=ret mode rewrites
2111// all indirect CALL AX / JMP AX instructions to be
2112// CALL retpolineAX / JMP retpolineAX.
2113// See https://support.google.com/faqs/answer/7625886.
2114#define RETPOLINE(reg) \
2115	/*   CALL setup */     BYTE $0xE8; BYTE $(2+2); BYTE $0; BYTE $0; BYTE $0;	\
2116	/* nospec: */									\
2117	/*   PAUSE */           BYTE $0xF3; BYTE $0x90;					\
2118	/*   JMP nospec */      BYTE $0xEB; BYTE $-(2+2);				\
2119	/* setup: */									\
2120	/*   MOVQ AX, 0(SP) */  BYTE $0x48|((reg&8)>>1); BYTE $0x89;			\
2121	                        BYTE $0x04|((reg&7)<<3); BYTE $0x24;			\
2122	/*   RET */             BYTE $0xC3
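// The emitted bytes are logically equivalent to:
//
//	CALL setup          // pushes the address of nospec as the return address
// nospec:
//	PAUSE               // speculation of the RET below is parked here
//	JMP nospec
// setup:
//	MOVQ reg, 0(SP)     // overwrite the saved return address with the target
//	RET                 // "return", i.e. jump, to the target held in reg
//
// so an indirect branch through reg becomes a RET whose mispredicted
// speculation spins harmlessly in the PAUSE loop instead of following a
// potentially attacker-trained indirect branch predictor.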
2123
2124TEXT runtime·retpolineAX(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(0)
2125TEXT runtime·retpolineCX(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(1)
2126TEXT runtime·retpolineDX(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(2)
2127TEXT runtime·retpolineBX(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(3)
/* Register 4 is SP: an indirect CALL/JMP through SP can't happen, and encoding it would need special (SIB-byte) handling anyway. */
2129TEXT runtime·retpolineBP(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(5)
2130TEXT runtime·retpolineSI(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(6)
2131TEXT runtime·retpolineDI(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(7)
2132TEXT runtime·retpolineR8(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(8)
2133TEXT runtime·retpolineR9(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(9)
2134TEXT runtime·retpolineR10(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(10)
2135TEXT runtime·retpolineR11(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(11)
2136TEXT runtime·retpolineR12(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(12)
2137TEXT runtime·retpolineR13(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(13)
2138TEXT runtime·retpolineR14(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(14)
2139TEXT runtime·retpolineR15(SB),NOSPLIT|NOFRAME,$0; RETPOLINE(15)
2140
// getfp returns the frame pointer register of its caller. This function has
// no frame of its own (NOFRAME), so BP still holds the caller's value.
TEXT ·getfp<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
2142	MOVQ BP, AX
2143	RET
2144