// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package ppc64

import (
	"cmd/compile/internal/base"
	"cmd/compile/internal/ir"
	"cmd/compile/internal/logopt"
	"cmd/compile/internal/objw"
	"cmd/compile/internal/ssa"
	"cmd/compile/internal/ssagen"
	"cmd/compile/internal/types"
	"cmd/internal/obj"
	"cmd/internal/obj/ppc64"
	"internal/buildcfg"
	"math"
	"strings"
)

// ssaMarkMoves marks any MOVXconst ops that need to avoid clobbering flags.
func ssaMarkMoves(s *ssagen.State, b *ssa.Block) {
	//	flive := b.FlagsLiveAtEnd
	//	if b.Control != nil && b.Control.Type.IsFlags() {
	//		flive = true
	//	}
	//	for i := len(b.Values) - 1; i >= 0; i-- {
	//		v := b.Values[i]
	//		if flive && (v.Op == ssa.OpPPC64MOVDconst) {
	//			// The "mark" is any non-nil Aux value.
	//			v.Aux = v
	//		}
	//		if v.Type.IsFlags() {
	//			flive = false
	//		}
	//		for _, a := range v.Args {
	//			if a.Type.IsFlags() {
	//				flive = true
	//			}
	//		}
	//	}
}

// loadByType returns the load instruction of the given type.
func loadByType(t *types.Type) obj.As {
	if t.IsFloat() {
		switch t.Size() {
		case 4:
			return ppc64.AFMOVS
		case 8:
			return ppc64.AFMOVD
		}
	} else {
		switch t.Size() {
		case 1:
			if t.IsSigned() {
				return ppc64.AMOVB
			} else {
				return ppc64.AMOVBZ
			}
		case 2:
			if t.IsSigned() {
				return ppc64.AMOVH
			} else {
				return ppc64.AMOVHZ
			}
		case 4:
			if t.IsSigned() {
				return ppc64.AMOVW
			} else {
				return ppc64.AMOVWZ
			}
		case 8:
			return ppc64.AMOVD
		}
	}
	panic("bad load type")
}
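
// For example, loadByType on a 2-byte signed type returns MOVH
// (sign-extending) while an unsigned one returns MOVHZ (zero-extending);
// 8-byte integer and float loads each have a single variant.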

// storeByType returns the store instruction of the given type.
func storeByType(t *types.Type) obj.As {
	if t.IsFloat() {
		switch t.Size() {
		case 4:
			return ppc64.AFMOVS
		case 8:
			return ppc64.AFMOVD
		}
	} else {
		switch t.Size() {
		case 1:
			return ppc64.AMOVB
		case 2:
			return ppc64.AMOVH
		case 4:
			return ppc64.AMOVW
		case 8:
			return ppc64.AMOVD
		}
	}
	panic("bad store type")
}
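
// Unlike loads, stores need no signed/unsigned variants: a store writes
// only the low-order bytes of the source register, so MOVB/MOVH/MOVW
// cover both signednesses.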

func ssaGenValue(s *ssagen.State, v *ssa.Value) {
	switch v.Op {
	case ssa.OpCopy:
		t := v.Type
		if t.IsMemory() {
			return
		}
		x := v.Args[0].Reg()
		y := v.Reg()
		if x != y {
			rt := obj.TYPE_REG
			op := ppc64.AMOVD

			if t.IsFloat() {
				op = ppc64.AFMOVD
			}
			p := s.Prog(op)
			p.From.Type = rt
			p.From.Reg = x
			p.To.Type = rt
			p.To.Reg = y
		}

	case ssa.OpPPC64LoweredAtomicAnd8,
		ssa.OpPPC64LoweredAtomicAnd32,
		ssa.OpPPC64LoweredAtomicOr8,
		ssa.OpPPC64LoweredAtomicOr32:
		// LWSYNC
		// LBAR/LWAR	(Rarg0), Rtmp
		// AND/OR	Rarg1, Rtmp
		// STBCCC/STWCCC Rtmp, (Rarg0)
		// BNE		-3(PC)
		ld := ppc64.ALBAR
		st := ppc64.ASTBCCC
		if v.Op == ssa.OpPPC64LoweredAtomicAnd32 || v.Op == ssa.OpPPC64LoweredAtomicOr32 {
			ld = ppc64.ALWAR
			st = ppc64.ASTWCCC
		}
		r0 := v.Args[0].Reg()
		r1 := v.Args[1].Reg()
		// LWSYNC - Assuming shared data not write-through-required nor
		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
		plwsync := s.Prog(ppc64.ALWSYNC)
		plwsync.To.Type = obj.TYPE_NONE
		// LBAR or LWAR
		p := s.Prog(ld)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = r0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = ppc64.REGTMP
		// AND/OR reg1,out
		p1 := s.Prog(v.Op.Asm())
		p1.From.Type = obj.TYPE_REG
		p1.From.Reg = r1
		p1.To.Type = obj.TYPE_REG
		p1.To.Reg = ppc64.REGTMP
		// STBCCC or STWCCC
		p2 := s.Prog(st)
		p2.From.Type = obj.TYPE_REG
		p2.From.Reg = ppc64.REGTMP
		p2.To.Type = obj.TYPE_MEM
		p2.To.Reg = r0
		p2.RegTo2 = ppc64.REGTMP
		// BNE retry
		p3 := s.Prog(ppc64.ABNE)
		p3.To.Type = obj.TYPE_BRANCH
		p3.To.SetTarget(p)

	case ssa.OpPPC64LoweredAtomicAdd32,
		ssa.OpPPC64LoweredAtomicAdd64:
		// LWSYNC
		// LDAR/LWAR    (Rarg0), Rout
		// ADD		Rarg1, Rout
		// STDCCC/STWCCC Rout, (Rarg0)
		// BNE         -3(PC)
		// MOVW		Rout,Rout (if Add32)
		ld := ppc64.ALDAR
		st := ppc64.ASTDCCC
		if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
			ld = ppc64.ALWAR
			st = ppc64.ASTWCCC
		}
		r0 := v.Args[0].Reg()
		r1 := v.Args[1].Reg()
		out := v.Reg0()
		// LWSYNC - Assuming shared data not write-through-required nor
		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
		plwsync := s.Prog(ppc64.ALWSYNC)
		plwsync.To.Type = obj.TYPE_NONE
		// LDAR or LWAR
		p := s.Prog(ld)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = r0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = out
		// ADD reg1,out
		p1 := s.Prog(ppc64.AADD)
		p1.From.Type = obj.TYPE_REG
		p1.From.Reg = r1
		p1.To.Reg = out
		p1.To.Type = obj.TYPE_REG
		// STDCCC or STWCCC
		p3 := s.Prog(st)
		p3.From.Type = obj.TYPE_REG
		p3.From.Reg = out
		p3.To.Type = obj.TYPE_MEM
		p3.To.Reg = r0
		// BNE retry
		p4 := s.Prog(ppc64.ABNE)
		p4.To.Type = obj.TYPE_BRANCH
		p4.To.SetTarget(p)

		// Ensure a 32 bit result
		if v.Op == ssa.OpPPC64LoweredAtomicAdd32 {
			p5 := s.Prog(ppc64.AMOVWZ)
			p5.To.Type = obj.TYPE_REG
			p5.To.Reg = out
			p5.From.Type = obj.TYPE_REG
			p5.From.Reg = out
		}

	case ssa.OpPPC64LoweredAtomicExchange32,
		ssa.OpPPC64LoweredAtomicExchange64:
		// LWSYNC
		// LDAR/LWAR    (Rarg0), Rout
		// STDCCC/STWCCC Rout, (Rarg0)
		// BNE         -2(PC)
		// ISYNC
		ld := ppc64.ALDAR
		st := ppc64.ASTDCCC
		if v.Op == ssa.OpPPC64LoweredAtomicExchange32 {
			ld = ppc64.ALWAR
			st = ppc64.ASTWCCC
		}
		r0 := v.Args[0].Reg()
		r1 := v.Args[1].Reg()
		out := v.Reg0()
		// LWSYNC - Assuming shared data not write-through-required nor
		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
		plwsync := s.Prog(ppc64.ALWSYNC)
		plwsync.To.Type = obj.TYPE_NONE
		// LDAR or LWAR
		p := s.Prog(ld)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = r0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = out
		// STDCCC or STWCCC
		p1 := s.Prog(st)
		p1.From.Type = obj.TYPE_REG
		p1.From.Reg = r1
		p1.To.Type = obj.TYPE_MEM
		p1.To.Reg = r0
		// BNE retry
		p2 := s.Prog(ppc64.ABNE)
		p2.To.Type = obj.TYPE_BRANCH
		p2.To.SetTarget(p)
		// ISYNC
		pisync := s.Prog(ppc64.AISYNC)
		pisync.To.Type = obj.TYPE_NONE

	case ssa.OpPPC64LoweredAtomicLoad8,
		ssa.OpPPC64LoweredAtomicLoad32,
		ssa.OpPPC64LoweredAtomicLoad64,
		ssa.OpPPC64LoweredAtomicLoadPtr:
		// SYNC
		// MOVB/MOVD/MOVW (Rarg0), Rout
		// CMP Rout,Rout
		// BNE 1(PC)
		// ISYNC
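		//
		// The CMP/BNE/ISYNC after the load is the ISA's load-acquire
		// idiom: the never-taken branch depends on the loaded value,
		// and ISYNC keeps later memory accesses from being performed
		// until the load and branch have completed.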
		ld := ppc64.AMOVD
		cmp := ppc64.ACMP
		switch v.Op {
		case ssa.OpPPC64LoweredAtomicLoad8:
			ld = ppc64.AMOVBZ
		case ssa.OpPPC64LoweredAtomicLoad32:
			ld = ppc64.AMOVWZ
			cmp = ppc64.ACMPW
		}
		arg0 := v.Args[0].Reg()
		out := v.Reg0()
		// SYNC when AuxInt == 1; otherwise, load-acquire
		if v.AuxInt == 1 {
			psync := s.Prog(ppc64.ASYNC)
			psync.To.Type = obj.TYPE_NONE
		}
		// Load
		p := s.Prog(ld)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = arg0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = out
		// CMP
		p1 := s.Prog(cmp)
		p1.From.Type = obj.TYPE_REG
		p1.From.Reg = out
		p1.To.Type = obj.TYPE_REG
		p1.To.Reg = out
		// BNE
		p2 := s.Prog(ppc64.ABNE)
		p2.To.Type = obj.TYPE_BRANCH
		// ISYNC
		pisync := s.Prog(ppc64.AISYNC)
		pisync.To.Type = obj.TYPE_NONE
		p2.To.SetTarget(pisync)

	case ssa.OpPPC64LoweredAtomicStore8,
		ssa.OpPPC64LoweredAtomicStore32,
		ssa.OpPPC64LoweredAtomicStore64:
		// SYNC or LWSYNC
		// MOVB/MOVW/MOVD arg1,(arg0)
		st := ppc64.AMOVD
		switch v.Op {
		case ssa.OpPPC64LoweredAtomicStore8:
			st = ppc64.AMOVB
		case ssa.OpPPC64LoweredAtomicStore32:
			st = ppc64.AMOVW
		}
		arg0 := v.Args[0].Reg()
		arg1 := v.Args[1].Reg()
		// If AuxInt == 0, LWSYNC (Store-Release), else SYNC
		syncOp := ppc64.ASYNC
		if v.AuxInt == 0 {
			syncOp = ppc64.ALWSYNC
		}
		psync := s.Prog(syncOp)
		psync.To.Type = obj.TYPE_NONE
		// Store
		p := s.Prog(st)
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = arg0
		p.From.Type = obj.TYPE_REG
		p.From.Reg = arg1

	case ssa.OpPPC64LoweredAtomicCas64,
		ssa.OpPPC64LoweredAtomicCas32:
		// MOVD        $0, Rout
		// LWSYNC
		// loop:
		// LDAR        (Rarg0), MutexHint, Rtmp
		// CMP         Rarg1, Rtmp
		// BNE         end
		// STDCCC      Rarg2, (Rarg0)
		// BNE         loop
		// MOVD        $1, Rout
		// end:
		// LWSYNC      // Only for sequential consistency; not required in CasRel.
		ld := ppc64.ALDAR
		st := ppc64.ASTDCCC
		cmp := ppc64.ACMP
		if v.Op == ssa.OpPPC64LoweredAtomicCas32 {
			ld = ppc64.ALWAR
			st = ppc64.ASTWCCC
			cmp = ppc64.ACMPW
		}
		r0 := v.Args[0].Reg()
		r1 := v.Args[1].Reg()
		r2 := v.Args[2].Reg()
		out := v.Reg0()
		// Initialize return value to false
		p := s.Prog(ppc64.AMOVD)
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = 0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = out
		// LWSYNC - Assuming shared data not write-through-required nor
		// caching-inhibited. See Appendix B.2.2.2 in the ISA 2.07b.
		plwsync1 := s.Prog(ppc64.ALWSYNC)
		plwsync1.To.Type = obj.TYPE_NONE
		// LDAR or LWAR
		p0 := s.Prog(ld)
		p0.From.Type = obj.TYPE_MEM
		p0.From.Reg = r0
		p0.To.Type = obj.TYPE_REG
		p0.To.Reg = ppc64.REGTMP
		// If it is a Compare-and-Swap-Release operation, set the EH field with
		// the release hint.
		if v.AuxInt == 0 {
			p0.AddRestSourceConst(0)
		}
		// CMP reg1,reg2
		p1 := s.Prog(cmp)
		p1.From.Type = obj.TYPE_REG
		p1.From.Reg = r1
		p1.To.Reg = ppc64.REGTMP
		p1.To.Type = obj.TYPE_REG
		// BNE done with return value = false
		p2 := s.Prog(ppc64.ABNE)
		p2.To.Type = obj.TYPE_BRANCH
		// STDCCC or STWCCC
		p3 := s.Prog(st)
		p3.From.Type = obj.TYPE_REG
		p3.From.Reg = r2
		p3.To.Type = obj.TYPE_MEM
		p3.To.Reg = r0
		// BNE retry
		p4 := s.Prog(ppc64.ABNE)
		p4.To.Type = obj.TYPE_BRANCH
		p4.To.SetTarget(p0)
		// return value true
		p5 := s.Prog(ppc64.AMOVD)
		p5.From.Type = obj.TYPE_CONST
		p5.From.Offset = 1
		p5.To.Type = obj.TYPE_REG
		p5.To.Reg = out
		// LWSYNC - Assuming shared data not write-through-required nor
		// caching-inhibited. See Appendix B.2.1.1 in the ISA 2.07b.
		// If the operation is a CAS-Release, then synchronization is not necessary.
		if v.AuxInt != 0 {
			plwsync2 := s.Prog(ppc64.ALWSYNC)
			plwsync2.To.Type = obj.TYPE_NONE
			p2.To.SetTarget(plwsync2)
		} else {
			// done (label)
			p6 := s.Prog(obj.ANOP)
			p2.To.SetTarget(p6)
		}

	case ssa.OpPPC64LoweredPubBarrier:
		// LWSYNC
		s.Prog(v.Op.Asm())

	case ssa.OpPPC64LoweredGetClosurePtr:
		// Closure pointer is R11 (already)
		ssagen.CheckLoweredGetClosurePtr(v)

	case ssa.OpPPC64LoweredGetCallerSP:
		// caller's SP is FixedFrameSize below the address of the first arg
		p := s.Prog(ppc64.AMOVD)
		p.From.Type = obj.TYPE_ADDR
		p.From.Offset = -base.Ctxt.Arch.FixedFrameSize
		p.From.Name = obj.NAME_PARAM
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64LoweredGetCallerPC:
		p := s.Prog(obj.AGETCALLERPC)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64LoweredRound32F, ssa.OpPPC64LoweredRound64F:
		// input is already rounded

	case ssa.OpLoadReg:
		loadOp := loadByType(v.Type)
		p := s.Prog(loadOp)
		ssagen.AddrAuto(&p.From, v.Args[0])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpStoreReg:
		storeOp := storeByType(v.Type)
		p := s.Prog(storeOp)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		ssagen.AddrAuto(&p.To, v)

	case ssa.OpArgIntReg, ssa.OpArgFloatReg:
		// The assembler needs to wrap the entry safepoint/stack growth code with spill/unspill.
		// The loop only runs once.
		for _, a := range v.Block.Func.RegArgs {
			// Pass the spill/unspill information along to the assembler, offset by size of
			// the saved LR slot.
			addr := ssagen.SpillSlotAddr(a, ppc64.REGSP, base.Ctxt.Arch.FixedFrameSize)
			s.FuncInfo().AddSpill(
				obj.RegSpill{Reg: a.Reg, Addr: addr, Unspill: loadByType(a.Type), Spill: storeByType(a.Type)})
		}
		v.Block.Func.RegArgs = nil

		ssagen.CheckArgReg(v)

	case ssa.OpPPC64DIVD:
		// For now,
		//
		// cmp arg1, -1
		// be  ahead
		// v = arg0 / arg1
		// b over
		// ahead: v = - arg0
		// over: nop
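		//
		// Dividing by -1 is computed as a NEG instead of a divide:
		// the quotient equals -arg0 for every arg0, and this avoids
		// the overflow case when arg0 is the most negative value.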
		r := v.Reg()
		r0 := v.Args[0].Reg()
		r1 := v.Args[1].Reg()

		p := s.Prog(ppc64.ACMP)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r1
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = -1

		pbahead := s.Prog(ppc64.ABEQ)
		pbahead.To.Type = obj.TYPE_BRANCH

		p = s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r1
		p.Reg = r0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

		pbover := s.Prog(obj.AJMP)
		pbover.To.Type = obj.TYPE_BRANCH

		p = s.Prog(ppc64.ANEG)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r0
		pbahead.To.SetTarget(p)

		p = s.Prog(obj.ANOP)
		pbover.To.SetTarget(p)

	case ssa.OpPPC64DIVW:
		// word-width version of above
		r := v.Reg()
		r0 := v.Args[0].Reg()
		r1 := v.Args[1].Reg()

		p := s.Prog(ppc64.ACMPW)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r1
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = -1

		pbahead := s.Prog(ppc64.ABEQ)
		pbahead.To.Type = obj.TYPE_BRANCH

		p = s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r1
		p.Reg = r0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

		pbover := s.Prog(obj.AJMP)
		pbover.To.Type = obj.TYPE_BRANCH

		p = s.Prog(ppc64.ANEG)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r0
		pbahead.To.SetTarget(p)

		p = s.Prog(obj.ANOP)
		pbover.To.SetTarget(p)

	case ssa.OpPPC64CLRLSLWI:
		r := v.Reg()
		r1 := v.Args[0].Reg()
		shifts := v.AuxInt
		p := s.Prog(v.Op.Asm())
		// clrlslwi ra,rs,mb,sh will become rlwinm ra,rs,sh,mb-sh,31-sh as described in ISA
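		// For example, mb=16 and sh=4 assemble as rlwinm ra,rs,4,12,27
		// (rotate left 4, then keep bits 12..27).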
		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}
		p.AddRestSourceConst(ssa.GetPPC64Shiftsh(shifts))
		p.Reg = r1
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpPPC64CLRLSLDI:
		r := v.Reg()
		r1 := v.Args[0].Reg()
		shifts := v.AuxInt
		p := s.Prog(v.Op.Asm())
		// clrlsldi ra,rs,mb,sh will become rldic ra,rs,sh,mb-sh
		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}
		p.AddRestSourceConst(ssa.GetPPC64Shiftsh(shifts))
		p.Reg = r1
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpPPC64ADD, ssa.OpPPC64FADD, ssa.OpPPC64FADDS, ssa.OpPPC64SUB, ssa.OpPPC64FSUB, ssa.OpPPC64FSUBS,
		ssa.OpPPC64MULLD, ssa.OpPPC64MULLW, ssa.OpPPC64DIVDU, ssa.OpPPC64DIVWU,
		ssa.OpPPC64SRAD, ssa.OpPPC64SRAW, ssa.OpPPC64SRD, ssa.OpPPC64SRW, ssa.OpPPC64SLD, ssa.OpPPC64SLW,
		ssa.OpPPC64ROTL, ssa.OpPPC64ROTLW,
		ssa.OpPPC64MULHD, ssa.OpPPC64MULHW, ssa.OpPPC64MULHDU, ssa.OpPPC64MULHWU,
		ssa.OpPPC64FMUL, ssa.OpPPC64FMULS, ssa.OpPPC64FDIV, ssa.OpPPC64FDIVS, ssa.OpPPC64FCPSGN,
		ssa.OpPPC64AND, ssa.OpPPC64OR, ssa.OpPPC64ANDN, ssa.OpPPC64ORN, ssa.OpPPC64NOR, ssa.OpPPC64XOR, ssa.OpPPC64EQV,
		ssa.OpPPC64MODUD, ssa.OpPPC64MODSD, ssa.OpPPC64MODUW, ssa.OpPPC64MODSW, ssa.OpPPC64XSMINJDP, ssa.OpPPC64XSMAXJDP:
		r := v.Reg()
		r1 := v.Args[0].Reg()
		r2 := v.Args[1].Reg()
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r2
		p.Reg = r1
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpPPC64ADDCC, ssa.OpPPC64ANDCC, ssa.OpPPC64SUBCC, ssa.OpPPC64ORCC, ssa.OpPPC64XORCC, ssa.OpPPC64NORCC,
		ssa.OpPPC64ANDNCC:
		r1 := v.Args[0].Reg()
		r2 := v.Args[1].Reg()
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r2
		p.Reg = r1
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg0()

	case ssa.OpPPC64NEGCC, ssa.OpPPC64CNTLZDCC:
		p := s.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg0()
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()

	case ssa.OpPPC64ROTLconst, ssa.OpPPC64ROTLWconst:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

		// Auxint holds encoded rotate + mask
	case ssa.OpPPC64RLWINM, ssa.OpPPC64RLWMI:
		sh, mb, me, _ := ssa.DecodePPC64RotateMask(v.AuxInt)
		p := s.Prog(v.Op.Asm())
		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
		p.Reg = v.Args[0].Reg()
		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: int64(sh)}
		p.AddRestSourceArgs([]obj.Addr{{Type: obj.TYPE_CONST, Offset: mb}, {Type: obj.TYPE_CONST, Offset: me}})

		// Auxint holds mask
	case ssa.OpPPC64RLDICL, ssa.OpPPC64RLDICLCC, ssa.OpPPC64RLDICR:
		sh, mb, me, _ := ssa.DecodePPC64RotateMask(v.AuxInt)
		p := s.Prog(v.Op.Asm())
		p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: sh}
		switch v.Op {
		case ssa.OpPPC64RLDICL, ssa.OpPPC64RLDICLCC:
			p.AddRestSourceConst(mb)
		case ssa.OpPPC64RLDICR:
			p.AddRestSourceConst(me)
		}
		p.Reg = v.Args[0].Reg()
		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.ResultReg()}

	case ssa.OpPPC64RLWNM:
		_, mb, me, _ := ssa.DecodePPC64RotateMask(v.AuxInt)
		p := s.Prog(v.Op.Asm())
		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
		p.Reg = v.Args[0].Reg()
		p.From = obj.Addr{Type: obj.TYPE_REG, Reg: v.Args[1].Reg()}
		p.AddRestSourceArgs([]obj.Addr{{Type: obj.TYPE_CONST, Offset: mb}, {Type: obj.TYPE_CONST, Offset: me}})

	case ssa.OpPPC64MADDLD:
		r := v.Reg()
		r1 := v.Args[0].Reg()
		r2 := v.Args[1].Reg()
		r3 := v.Args[2].Reg()
		// r = r1*r2 + r3
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r1
		p.Reg = r2
		p.AddRestSourceReg(r3)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpPPC64FMADD, ssa.OpPPC64FMADDS, ssa.OpPPC64FMSUB, ssa.OpPPC64FMSUBS:
		r := v.Reg()
		r1 := v.Args[0].Reg()
		r2 := v.Args[1].Reg()
		r3 := v.Args[2].Reg()
		// r = r1*r2 ± r3
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r1
		p.Reg = r3
		p.AddRestSourceReg(r2)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FFLOOR, ssa.OpPPC64FTRUNC, ssa.OpPPC64FCEIL,
		ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FCFIDS, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW,
		ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB, ssa.OpPPC64MFVSRD, ssa.OpPPC64MTVSRD, ssa.OpPPC64FABS, ssa.OpPPC64FNABS,
		ssa.OpPPC64FROUND, ssa.OpPPC64CNTTZW, ssa.OpPPC64CNTTZD, ssa.OpPPC64BRH, ssa.OpPPC64BRW, ssa.OpPPC64BRD:
		r := v.Reg()
		p := s.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()

	case ssa.OpPPC64ADDconst, ssa.OpPPC64ORconst, ssa.OpPPC64XORconst,
		ssa.OpPPC64SRADconst, ssa.OpPPC64SRAWconst, ssa.OpPPC64SRDconst, ssa.OpPPC64SRWconst,
		ssa.OpPPC64SLDconst, ssa.OpPPC64SLWconst, ssa.OpPPC64EXTSWSLconst, ssa.OpPPC64MULLWconst, ssa.OpPPC64MULLDconst,
		ssa.OpPPC64ANDconst:
		p := s.Prog(v.Op.Asm())
		p.Reg = v.Args[0].Reg()
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64ADDC, ssa.OpPPC64ADDE, ssa.OpPPC64SUBC, ssa.OpPPC64SUBE:
		r := v.Reg0() // CA is the first, implied argument.
		r1 := v.Args[0].Reg()
		r2 := v.Args[1].Reg()
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r2
		p.Reg = r1
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpPPC64ADDZE:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg0()

	case ssa.OpPPC64ADDZEzero, ssa.OpPPC64SUBZEzero:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = ppc64.REG_R0
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64ADDCconst:
		p := s.Prog(v.Op.Asm())
		p.Reg = v.Args[0].Reg()
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		// Output is a pair, the second is the CA, which is implied.
		p.To.Reg = v.Reg0()

	case ssa.OpPPC64SUBCconst:
		p := s.Prog(v.Op.Asm())
		p.AddRestSourceConst(v.AuxInt)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg0()

	case ssa.OpPPC64SUBFCconst:
		p := s.Prog(v.Op.Asm())
		p.AddRestSourceConst(v.AuxInt)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64ADDCCconst, ssa.OpPPC64ANDCCconst:
		p := s.Prog(v.Op.Asm())
		p.Reg = v.Args[0].Reg()
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg0()

	case ssa.OpPPC64MOVDaddr:
		switch v.Aux.(type) {
		default:
			v.Fatalf("aux in MOVDaddr is of unknown type %T", v.Aux)
		case nil:
			// If aux offset and aux int are both 0, and the same
			// input and output regs are used, no instruction
			// needs to be generated, since it would just be
			// addi rx, rx, 0.
			if v.AuxInt != 0 || v.Args[0].Reg() != v.Reg() {
				p := s.Prog(ppc64.AMOVD)
				p.From.Type = obj.TYPE_ADDR
				p.From.Reg = v.Args[0].Reg()
				p.From.Offset = v.AuxInt
				p.To.Type = obj.TYPE_REG
				p.To.Reg = v.Reg()
			}

		case *obj.LSym, ir.Node:
			p := s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_ADDR
			p.From.Reg = v.Args[0].Reg()
			p.To.Type = obj.TYPE_REG
			p.To.Reg = v.Reg()
			ssagen.AddAux(&p.From, v)

		}

	case ssa.OpPPC64MOVDconst:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64FMOVDconst, ssa.OpPPC64FMOVSconst:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_FCONST
		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64FCMPU, ssa.OpPPC64CMP, ssa.OpPPC64CMPW, ssa.OpPPC64CMPU, ssa.OpPPC64CMPWU:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Args[1].Reg()

	case ssa.OpPPC64CMPconst, ssa.OpPPC64CMPUconst, ssa.OpPPC64CMPWconst, ssa.OpPPC64CMPWUconst:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = v.AuxInt

	case ssa.OpPPC64MOVBreg, ssa.OpPPC64MOVBZreg, ssa.OpPPC64MOVHreg, ssa.OpPPC64MOVHZreg, ssa.OpPPC64MOVWreg, ssa.OpPPC64MOVWZreg:
		// Shift in register to required size
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Reg = v.Reg()
		p.To.Type = obj.TYPE_REG

	case ssa.OpPPC64MOVDload, ssa.OpPPC64MOVWload:

		// MOVDload and MOVWload are DS form instructions that are restricted to
		// offsets that are a multiple of 4. If the offset is not a multiple of 4,
		// then the address of the symbol to be loaded is computed (base + offset)
		// and used as the new base register and the offset field in the instruction
		// can be set to zero.

		// This same problem can happen with gostrings since the final offset is not
		// known yet, but could be unaligned after the relocation is resolved.
		// So gostrings are handled the same way.

		// This allows the MOVDload and MOVWload to be generated in more cases and
		// eliminates some offset and alignment checking in the rules file.

		fromAddr := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
		ssagen.AddAux(&fromAddr, v)

		genAddr := false

		switch fromAddr.Name {
		case obj.NAME_EXTERN, obj.NAME_STATIC:
			// Special case for a rule that combines the bytes of gostring.
			// The v alignment might seem OK, but we don't want to load it
			// using an offset because relocation comes later.
			genAddr = strings.HasPrefix(fromAddr.Sym.Name, "go:string") || v.Type.Alignment()%4 != 0 || fromAddr.Offset%4 != 0
		default:
			genAddr = fromAddr.Offset%4 != 0
		}
		if genAddr {
			// Load full address into the temp register.
			p := s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_ADDR
			p.From.Reg = v.Args[0].Reg()
			ssagen.AddAux(&p.From, v)
			// Load target using temp as base register
			// and offset zero. Setting NAME_NONE
			// prevents any extra offsets from being
			// added.
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP
			fromAddr.Reg = ppc64.REGTMP
			// Clear the offset field and other
			// information that might be used
			// by the assembler to add to the
			// final offset value.
			fromAddr.Offset = 0
			fromAddr.Name = obj.NAME_NONE
			fromAddr.Sym = nil
		}
		p := s.Prog(v.Op.Asm())
		p.From = fromAddr
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64MOVHload, ssa.OpPPC64MOVWZload, ssa.OpPPC64MOVBZload, ssa.OpPPC64MOVHZload, ssa.OpPPC64FMOVDload, ssa.OpPPC64FMOVSload:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64MOVDBRload, ssa.OpPPC64MOVWBRload, ssa.OpPPC64MOVHBRload:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64MOVDBRstore, ssa.OpPPC64MOVWBRstore, ssa.OpPPC64MOVHBRstore:
		p := s.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[1].Reg()

	case ssa.OpPPC64MOVDloadidx, ssa.OpPPC64MOVWloadidx, ssa.OpPPC64MOVHloadidx, ssa.OpPPC64MOVWZloadidx,
		ssa.OpPPC64MOVBZloadidx, ssa.OpPPC64MOVHZloadidx, ssa.OpPPC64FMOVDloadidx, ssa.OpPPC64FMOVSloadidx,
		ssa.OpPPC64MOVDBRloadidx, ssa.OpPPC64MOVWBRloadidx, ssa.OpPPC64MOVHBRloadidx:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		p.From.Index = v.Args[1].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()

	case ssa.OpPPC64DCBT:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = v.AuxInt

	case ssa.OpPPC64MOVWstorezero, ssa.OpPPC64MOVHstorezero, ssa.OpPPC64MOVBstorezero:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = ppc64.REGZERO
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.To, v)

	case ssa.OpPPC64MOVDstore, ssa.OpPPC64MOVDstorezero:

		// MOVDstore and MOVDstorezero become DS form instructions that are restricted
		// to offset values that are a multiple of 4. If the offset field is not a
		// multiple of 4, then the full address of the store target is computed (base +
		// offset) and used as the new base register and the offset in the instruction
		// is set to 0.

		// This allows the MOVDstore and MOVDstorezero to be generated in more cases,
		// and prevents checking of the offset value and alignment in the rules.

		toAddr := obj.Addr{Type: obj.TYPE_MEM, Reg: v.Args[0].Reg()}
		ssagen.AddAux(&toAddr, v)

		if toAddr.Offset%4 != 0 {
			p := s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_ADDR
			p.From.Reg = v.Args[0].Reg()
			ssagen.AddAux(&p.From, v)
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP
			toAddr.Reg = ppc64.REGTMP
			// Clear the offset field and other
			// information that might be used
			// by the assembler to add to the
			// final offset value.
			toAddr.Offset = 0
			toAddr.Name = obj.NAME_NONE
			toAddr.Sym = nil
		}
		p := s.Prog(v.Op.Asm())
		p.To = toAddr
		p.From.Type = obj.TYPE_REG
		if v.Op == ssa.OpPPC64MOVDstorezero {
			p.From.Reg = ppc64.REGZERO
		} else {
			p.From.Reg = v.Args[1].Reg()
		}

	case ssa.OpPPC64MOVWstore, ssa.OpPPC64MOVHstore, ssa.OpPPC64MOVBstore, ssa.OpPPC64FMOVDstore, ssa.OpPPC64FMOVSstore:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[1].Reg()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()
		ssagen.AddAux(&p.To, v)

	case ssa.OpPPC64MOVDstoreidx, ssa.OpPPC64MOVWstoreidx, ssa.OpPPC64MOVHstoreidx, ssa.OpPPC64MOVBstoreidx,
		ssa.OpPPC64FMOVDstoreidx, ssa.OpPPC64FMOVSstoreidx, ssa.OpPPC64MOVDBRstoreidx, ssa.OpPPC64MOVWBRstoreidx,
		ssa.OpPPC64MOVHBRstoreidx:
		p := s.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[2].Reg()
		p.To.Index = v.Args[1].Reg()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = v.Args[0].Reg()

	case ssa.OpPPC64ISEL, ssa.OpPPC64ISELZ:
		// ISEL  AuxInt ? arg0 : arg1
		// ISELZ is a special case of ISEL where arg1 is implicitly $0.
		//
		// AuxInt value indicates conditions 0=LT 1=GT 2=EQ 3=SO 4=GE 5=LE 6=NE 7=NSO.
		// ISEL accepts a CR bit argument, not a condition as expressed by AuxInt.
		// Convert the condition to a CR bit argument by the following conversion:
		//
		// AuxInt&3 ? arg0 : arg1 for conditions LT, GT, EQ, SO
		// AuxInt&3 ? arg1 : arg0 for conditions GE, LE, NE, NSO
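		//
		// For example, AuxInt=5 (LE) swaps the operands and selects on
		// CR bit 1 (GT): out = GT ? arg1 : arg0, which is equivalent to
		// LE ? arg0 : arg1.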
		p := s.Prog(v.Op.Asm())
		p.To = obj.Addr{Type: obj.TYPE_REG, Reg: v.Reg()}
		p.Reg = v.Args[0].Reg()
		if v.Op == ssa.OpPPC64ISEL {
			p.AddRestSourceReg(v.Args[1].Reg())
		} else {
			p.AddRestSourceReg(ppc64.REG_R0)
		}
		// AuxInt values 4,5,6 implemented with reverse operand order from 0,1,2
		if v.AuxInt > 3 {
			p.Reg, p.GetFrom3().Reg = p.GetFrom3().Reg, p.Reg
		}
		p.From.SetConst(v.AuxInt & 3)

	case ssa.OpPPC64SETBC, ssa.OpPPC64SETBCR:
		p := s.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
		p.To.Reg = v.Reg()
		p.From.Type = obj.TYPE_REG
		p.From.Reg = int16(ppc64.REG_CR0LT + v.AuxInt)

	case ssa.OpPPC64LoweredQuadZero, ssa.OpPPC64LoweredQuadZeroShort:
		// The LoweredQuad code generates STXV
		// instructions on power9. The Short
		// variation is used if no loop is
		// generated.

		// sizes >= 64 generate a loop as follows:

		// Set up loop counter in CTR, used by BC
		// XXLXOR clears VS32
		//       XXLXOR VS32,VS32,VS32
		//       MOVD len/64,REG_TMP
		//       MOVD REG_TMP,CTR
		//       loop:
		//       STXV VS32,0(R20)
		//       STXV VS32,16(R20)
		//       STXV VS32,32(R20)
		//       STXV VS32,48(R20)
		//       ADD  $64,R20
		//       BC   16, 0, loop

		// Loop iterations (64 bytes each)
		ctr := v.AuxInt / 64

		// Remainder bytes
		rem := v.AuxInt % 64
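
		// For example, AuxInt=200 yields ctr=3 and rem=8: the loop
		// clears 192 bytes and a single MOVD clears the rest.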

		// Only generate a loop if there is more
		// than 1 iteration.
		if ctr > 1 {
			// Set up VS32 (V0) to hold 0s
			p := s.Prog(ppc64.AXXLXOR)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32
			p.Reg = ppc64.REG_VS32

			// Set up CTR loop counter
			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ctr
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP

			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REGTMP
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_CTR

			// Don't generate padding for
			// loops with few iterations.
			if ctr > 3 {
				p = s.Prog(obj.APCALIGN)
				p.From.Type = obj.TYPE_CONST
				p.From.Offset = 16
			}

			// generate 4 STXVs to zero 64 bytes
			var top *obj.Prog

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()

			// Save the top of loop
			if top == nil {
				top = p
			}
			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = 16

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = 32

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = 48

			// Increment address for the
			// 64 bytes just zeroed.
			p = s.Prog(ppc64.AADD)
			p.Reg = v.Args[0].Reg()
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = 64
			p.To.Type = obj.TYPE_REG
			p.To.Reg = v.Args[0].Reg()

			// Branch back to top of loop
			// based on CTR
			// BC with BO_BCTR generates bdnz
			p = s.Prog(ppc64.ABC)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ppc64.BO_BCTR
			p.Reg = ppc64.REG_CR0LT
			p.To.Type = obj.TYPE_BRANCH
			p.To.SetTarget(top)
		}
		// When ctr == 1 the loop was not generated but
		// there are at least 64 bytes to clear, so add
		// that to the remainder to generate the code
		// to clear those doublewords
		if ctr == 1 {
			rem += 64
		}

		// Clear the remainder starting at offset zero
		offset := int64(0)

		if rem >= 16 && ctr <= 1 {
			// If the XXLXOR hasn't already been
			// generated, do it here to initialize
			// VS32 (V0) to 0.
			p := s.Prog(ppc64.AXXLXOR)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32
			p.Reg = ppc64.REG_VS32
		}
		// Generate STXV for 32 or 64
		// bytes.
		for rem >= 32 {
			p := s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = offset

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = offset + 16
			offset += 32
			rem -= 32
		}
		// Generate 16 bytes
		if rem >= 16 {
			p := s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = offset
			offset += 16
			rem -= 16
		}

		// first clear as many doublewords as possible
		// then clear remaining sizes as available
		for rem > 0 {
			op, size := ppc64.AMOVB, int64(1)
			switch {
			case rem >= 8:
				op, size = ppc64.AMOVD, 8
			case rem >= 4:
				op, size = ppc64.AMOVW, 4
			case rem >= 2:
				op, size = ppc64.AMOVH, 2
			}
			p := s.Prog(op)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_R0
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = offset
			rem -= size
			offset += size
		}

	case ssa.OpPPC64LoweredZero, ssa.OpPPC64LoweredZeroShort:

		// Unaligned data doesn't hurt performance
		// for these instructions on power8.

		// For sizes >= 64 generate a loop as follows:

		// Set up loop counter in CTR, used by BC
		//       XXLXOR VS32,VS32,VS32
		//	 MOVD len/32,REG_TMP
		//	 MOVD REG_TMP,CTR
		//       MOVD $16,REG_TMP
		//	 loop:
		//	 STXVD2X VS32,(R0)(R20)
		//	 STXVD2X VS32,(R31)(R20)
		//	 ADD  $32,R20
		//	 BC   16, 0, loop
		//
		// any remainder is done as described below

		// for sizes < 64 bytes, first clear as many doublewords as possible,
		// then handle the remainder
		//	MOVD R0,(R20)
		//	MOVD R0,8(R20)
		// .... etc.
		//
		// the remainder bytes are cleared using one or more
		// of the following instructions with the appropriate
		// offsets depending which instructions are needed
		//
		//	MOVW R0,n1(R20)	4 bytes
		//	MOVH R0,n2(R20)	2 bytes
		//	MOVB R0,n3(R20)	1 byte
		//
		// 7 bytes: MOVW, MOVH, MOVB
		// 6 bytes: MOVW, MOVH
		// 5 bytes: MOVW, MOVB
		// 3 bytes: MOVH, MOVB

		// each loop iteration does 32 bytes
		ctr := v.AuxInt / 32

		// remainder bytes
		rem := v.AuxInt % 32
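
		// For example, AuxInt=79 yields ctr=2 and rem=15: the loop
		// clears 64 bytes, then MOVD, MOVW, MOVH and MOVB clear the
		// remaining 8+4+2+1.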

		// only generate a loop if there is more
		// than 1 iteration.
		if ctr > 1 {
			// Set up VS32 (V0) to hold 0s
			p := s.Prog(ppc64.AXXLXOR)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32
			p.Reg = ppc64.REG_VS32

			// Set up CTR loop counter
			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ctr
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP

			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REGTMP
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_CTR

			// Set up R31 to hold index value 16
			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = 16
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP

			// Don't add padding for alignment
			// with few loop iterations.
			if ctr > 3 {
				p = s.Prog(obj.APCALIGN)
				p.From.Type = obj.TYPE_CONST
				p.From.Offset = 16
			}

			// generate 2 STXVD2Xs, each storing 16 bytes;
			// when this is a loop then the top must be saved
			var top *obj.Prog
			// This is the top of loop

			p = s.Prog(ppc64.ASTXVD2X)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Index = ppc64.REGZERO
			// Save the top of loop
			if top == nil {
				top = p
			}
			p = s.Prog(ppc64.ASTXVD2X)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Index = ppc64.REGTMP

			// Increment address for the
			// 4 doublewords just zeroed.
			p = s.Prog(ppc64.AADD)
			p.Reg = v.Args[0].Reg()
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = 32
			p.To.Type = obj.TYPE_REG
			p.To.Reg = v.Args[0].Reg()

			// Branch back to top of loop
			// based on CTR
			// BC with BO_BCTR generates bdnz
			p = s.Prog(ppc64.ABC)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ppc64.BO_BCTR
			p.Reg = ppc64.REG_CR0LT
			p.To.Type = obj.TYPE_BRANCH
			p.To.SetTarget(top)
		}

		// when ctr == 1 the loop was not generated but
		// there are at least 32 bytes to clear, so add
		// that to the remainder to generate the code
		// to clear those doublewords
		if ctr == 1 {
			rem += 32
		}

		// clear the remainder starting at offset zero
		offset := int64(0)

		// first clear as many doublewords as possible
		// then clear remaining sizes as available
		for rem > 0 {
			op, size := ppc64.AMOVB, int64(1)
			switch {
			case rem >= 8:
				op, size = ppc64.AMOVD, 8
			case rem >= 4:
				op, size = ppc64.AMOVW, 4
			case rem >= 2:
				op, size = ppc64.AMOVH, 2
			}
			p := s.Prog(op)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_R0
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = v.Args[0].Reg()
			p.To.Offset = offset
			rem -= size
			offset += size
		}

	case ssa.OpPPC64LoweredMove, ssa.OpPPC64LoweredMoveShort:

		bytesPerLoop := int64(32)
		// This will be used when moving more
		// than 8 bytes.  Moves start with
		// as many 8 byte moves as possible, then
		// 4, 2, or 1 byte(s) as remaining.  This will
		// work and be efficient for power8 or later.
		// If there are 64 or more bytes, then a
		// loop is generated to move 32 bytes and
		// update the src and dst addresses on each
		// iteration. When < 64 bytes, the appropriate
		// number of moves are generated based on the
		// size.
		// When moving >= 64 bytes a loop is used
		//	MOVD len/32,REG_TMP
		//	MOVD REG_TMP,CTR
		//	MOVD $16,REG_TMP
		// top:
		//	LXVD2X (R0)(R21),VS32
		//	LXVD2X (R31)(R21),VS33
		//	ADD $32,R21
		//	STXVD2X VS32,(R0)(R20)
		//	STXVD2X VS33,(R31)(R20)
		//	ADD $32,R20
		//	BC 16,0,top
		// Bytes not moved by this loop are moved
		// with a combination of the following instructions,
		// starting with the largest sizes and generating as
		// many as needed, using the appropriate offset value.
		//	MOVD  n(R21),R31
		//	MOVD  R31,n(R20)
		//	MOVW  n1(R21),R31
		//	MOVW  R31,n1(R20)
		//	MOVH  n2(R21),R31
		//	MOVH  R31,n2(R20)
		//	MOVB  n3(R21),R31
		//	MOVB  R31,n3(R20)

		// Each loop iteration moves 32 bytes
		ctr := v.AuxInt / bytesPerLoop

		// Remainder after the loop
		rem := v.AuxInt % bytesPerLoop
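
		// For example, AuxInt=100 yields ctr=3 and rem=4: the loop moves
		// 96 bytes and a single MOVWZ load/store pair moves the rest.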

		dstReg := v.Args[0].Reg()
		srcReg := v.Args[1].Reg()

		// The set of registers used here, must match the clobbered reg list
		// in PPC64Ops.go.
		offset := int64(0)

		// top of the loop
		var top *obj.Prog
		// Only generate looping code when loop counter is > 1 for >= 64 bytes
		if ctr > 1 {
			// Set up the CTR
			p := s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ctr
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP

			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REGTMP
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_CTR

			// Use REGTMP as index reg
			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = 16
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP

			// Don't add padding for
			// alignment with small iteration
			// counts.
			if ctr > 3 {
				p = s.Prog(obj.APCALIGN)
				p.From.Type = obj.TYPE_CONST
				p.From.Offset = 16
			}

			// Generate 16 byte loads and stores.
			// Use temp register for index (16)
			// on the second one.

			p = s.Prog(ppc64.ALXVD2X)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Index = ppc64.REGZERO
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32
			if top == nil {
				top = p
			}
			p = s.Prog(ppc64.ALXVD2X)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Index = ppc64.REGTMP
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS33

			// increment the src reg for next iteration
			p = s.Prog(ppc64.AADD)
			p.Reg = srcReg
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = bytesPerLoop
			p.To.Type = obj.TYPE_REG
			p.To.Reg = srcReg

			// generate 16 byte stores
			p = s.Prog(ppc64.ASTXVD2X)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Index = ppc64.REGZERO

			p = s.Prog(ppc64.ASTXVD2X)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS33
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Index = ppc64.REGTMP

			// increment the dst reg for next iteration
			p = s.Prog(ppc64.AADD)
			p.Reg = dstReg
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = bytesPerLoop
			p.To.Type = obj.TYPE_REG
			p.To.Reg = dstReg

			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
			// to loop top.
			p = s.Prog(ppc64.ABC)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ppc64.BO_BCTR
			p.Reg = ppc64.REG_CR0LT
			p.To.Type = obj.TYPE_BRANCH
			p.To.SetTarget(top)

			// srcReg and dstReg were incremented in the loop, so
			// later instructions start with offset 0.
			offset = int64(0)
		}

		// No loop was generated for one iteration, so
		// add 32 bytes to the remainder to move those bytes.
		if ctr == 1 {
			rem += bytesPerLoop
		}

		if rem >= 16 {
			// Generate 16 byte loads and stores.
			// Use temp register for index (value 16)
			// on the second one.
			p := s.Prog(ppc64.ALXVD2X)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Index = ppc64.REGZERO
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32

			p = s.Prog(ppc64.ASTXVD2X)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Index = ppc64.REGZERO

			offset = 16
			rem -= 16

			if rem >= 16 {
				// Use REGTMP as index reg
				p := s.Prog(ppc64.AMOVD)
				p.From.Type = obj.TYPE_CONST
				p.From.Offset = 16
				p.To.Type = obj.TYPE_REG
				p.To.Reg = ppc64.REGTMP

				p = s.Prog(ppc64.ALXVD2X)
				p.From.Type = obj.TYPE_MEM
				p.From.Reg = srcReg
				p.From.Index = ppc64.REGTMP
				p.To.Type = obj.TYPE_REG
				p.To.Reg = ppc64.REG_VS32

				p = s.Prog(ppc64.ASTXVD2X)
				p.From.Type = obj.TYPE_REG
				p.From.Reg = ppc64.REG_VS32
				p.To.Type = obj.TYPE_MEM
				p.To.Reg = dstReg
				p.To.Index = ppc64.REGTMP

				offset = 32
				rem -= 16
			}
		}

		// Generate all the remaining load and store pairs, starting with
		// as many 8 byte moves as possible, then 4, 2, 1.
		for rem > 0 {
			op, size := ppc64.AMOVB, int64(1)
			switch {
			case rem >= 8:
				op, size = ppc64.AMOVD, 8
			case rem >= 4:
				op, size = ppc64.AMOVWZ, 4
			case rem >= 2:
				op, size = ppc64.AMOVH, 2
			}
			// Load
			p := s.Prog(op)
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = offset

			// Store
			p = s.Prog(op)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REGTMP
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = offset
			rem -= size
			offset += size
		}

	case ssa.OpPPC64LoweredQuadMove, ssa.OpPPC64LoweredQuadMoveShort:
		bytesPerLoop := int64(64)
		// This is used when moving more
		// than 8 bytes on power9.  Moves start with
		// as many 8 byte moves as possible, then
		// 4, 2, or 1 byte(s) as remaining.  This will
		// work and be efficient for power9 or later.
		// If there are 64 or more bytes, then a
		// loop is generated to move 64 bytes and
		// update the src and dst addresses on each
		// iteration. When < 64 bytes, the appropriate
		// number of moves are generated based on the
		// size.
		// When moving >= 64 bytes a loop is used
		//      MOVD len/64,REG_TMP
		//      MOVD REG_TMP,CTR
		// top:
		//      LXV 0(R21),VS32
		//      LXV 16(R21),VS33
		//      ADD $32,R21
		//      STXV VS32,0(R20)
		//      STXV VS33,16(R20)
		//      ADD $32,R20
		//      BC 16,0,top
		// Bytes not moved by this loop are moved
		// with a combination of the following instructions,
		// starting with the largest sizes and generating as
		// many as needed, using the appropriate offset value.
		//      MOVD  n(R21),R31
		//      MOVD  R31,n(R20)
		//      MOVW  n1(R21),R31
		//      MOVW  R31,n1(R20)
		//      MOVH  n2(R21),R31
		//      MOVH  R31,n2(R20)
		//      MOVB  n3(R21),R31
		//      MOVB  R31,n3(R20)

		// Each loop iteration moves 64 bytes
		ctr := v.AuxInt / bytesPerLoop

		// Remainder after the loop
		rem := v.AuxInt % bytesPerLoop
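
		// For example, AuxInt=190 yields ctr=2 and rem=62: the loop
		// moves 128 bytes, then a 32-byte and a 16-byte LXV/STXV copy
		// plus MOVD, MOVW, MOVH pairs move the remaining 62 (32+16+8+4+2).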

		dstReg := v.Args[0].Reg()
		srcReg := v.Args[1].Reg()

		offset := int64(0)

		// top of the loop
		var top *obj.Prog

		// Only generate looping code when loop counter is > 1 for >= 64 bytes
		if ctr > 1 {
			// Set up the CTR
			p := s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ctr
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP

			p = s.Prog(ppc64.AMOVD)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REGTMP
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_CTR

			p = s.Prog(obj.APCALIGN)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = 16

			// Generate 16 byte loads and stores.
			p = s.Prog(ppc64.ALXV)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = offset
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32
			if top == nil {
				top = p
			}
			p = s.Prog(ppc64.ALXV)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = offset + 16
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS33

			// generate 16 byte stores
			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = offset

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS33
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = offset + 16

			// Generate 16 byte loads and stores.
			p = s.Prog(ppc64.ALXV)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = offset + 32
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32

			p = s.Prog(ppc64.ALXV)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = offset + 48
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS33

			// generate 16 byte stores
			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = offset + 32

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS33
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = offset + 48

			// increment the src reg for next iteration
			p = s.Prog(ppc64.AADD)
			p.Reg = srcReg
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = bytesPerLoop
			p.To.Type = obj.TYPE_REG
			p.To.Reg = srcReg

			// increment the dst reg for next iteration
			p = s.Prog(ppc64.AADD)
			p.Reg = dstReg
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = bytesPerLoop
			p.To.Type = obj.TYPE_REG
			p.To.Reg = dstReg

			// BC with BO_BCTR generates bdnz to branch on nonzero CTR
			// to loop top.
			p = s.Prog(ppc64.ABC)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = ppc64.BO_BCTR
			p.Reg = ppc64.REG_CR0LT
			p.To.Type = obj.TYPE_BRANCH
			p.To.SetTarget(top)

			// srcReg and dstReg were incremented in the loop, so
			// later instructions start with offset 0.
			offset = int64(0)
		}

		// No loop was generated for one iteration, so
		// add 64 bytes to the remainder to move those bytes.
		if ctr == 1 {
			rem += bytesPerLoop
		}
		if rem >= 32 {
			p := s.Prog(ppc64.ALXV)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32

			p = s.Prog(ppc64.ALXV)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = 16
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS33

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS33
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = 16

			offset = 32
			rem -= 32
		}

		if rem >= 16 {
			// Generate 16 byte loads and stores.
			p := s.Prog(ppc64.ALXV)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = offset
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REG_VS32

			p = s.Prog(ppc64.ASTXV)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_VS32
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = offset

			offset += 16
			rem -= 16

			if rem >= 16 {
				p := s.Prog(ppc64.ALXV)
				p.From.Type = obj.TYPE_MEM
				p.From.Reg = srcReg
				p.From.Offset = offset
				p.To.Type = obj.TYPE_REG
				p.To.Reg = ppc64.REG_VS32

				p = s.Prog(ppc64.ASTXV)
				p.From.Type = obj.TYPE_REG
				p.From.Reg = ppc64.REG_VS32
				p.To.Type = obj.TYPE_MEM
				p.To.Reg = dstReg
				p.To.Offset = offset

				offset += 16
				rem -= 16
			}
		}
		// Generate all the remaining load and store pairs, starting with
		// as many 8 byte moves as possible, then 4, 2, 1.
		for rem > 0 {
			op, size := ppc64.AMOVB, int64(1)
			switch {
			case rem >= 8:
				op, size = ppc64.AMOVD, 8
			case rem >= 4:
				op, size = ppc64.AMOVWZ, 4
			case rem >= 2:
				op, size = ppc64.AMOVH, 2
			}
			// Load
			p := s.Prog(op)
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = srcReg
			p.From.Offset = offset

			// Store
			p = s.Prog(op)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REGTMP
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = dstReg
			p.To.Offset = offset
			rem -= size
			offset += size
		}

	case ssa.OpPPC64CALLstatic:
		s.Call(v)

	case ssa.OpPPC64CALLtail:
		s.TailCall(v)

	case ssa.OpPPC64CALLclosure, ssa.OpPPC64CALLinter:
		p := s.Prog(ppc64.AMOVD)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = v.Args[0].Reg()
		p.To.Type = obj.TYPE_REG
		p.To.Reg = ppc64.REG_LR

		if v.Args[0].Reg() != ppc64.REG_R12 {
			v.Fatalf("Function address for %v should be in R12 %d but is in %d", v.LongString(), ppc64.REG_R12, p.From.Reg)
		}

		pp := s.Call(v)

		// Convert the call into a blrl with hint this is not a subroutine return.
		// The full bclrl opcode must be specified when passing a hint.
		pp.As = ppc64.ABCL
		pp.From.Type = obj.TYPE_CONST
		pp.From.Offset = ppc64.BO_ALWAYS
		pp.Reg = ppc64.REG_CR0LT // The preferred value if BI is ignored.
		pp.To.Reg = ppc64.REG_LR
		pp.AddRestSourceConst(1)
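		// The rest-arg constant 1 becomes the BH field of the bclrl;
		// BH=0b01 hints that this branch is not a subroutine return
		// (per the ISA's branch-hint encoding).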

		if ppc64.NeedTOCpointer(base.Ctxt) {
			// When compiling Go into PIC, the function we just
			// called via pointer might have been implemented in
			// a separate module and so overwritten the TOC
			// pointer in R2; reload it.
			q := s.Prog(ppc64.AMOVD)
			q.From.Type = obj.TYPE_MEM
			q.From.Offset = 24
			q.From.Reg = ppc64.REGSP
			q.To.Type = obj.TYPE_REG
			q.To.Reg = ppc64.REG_R2
		}

	case ssa.OpPPC64LoweredWB:
		p := s.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		// AuxInt encodes how many buffer entries we need.
		p.To.Sym = ir.Syms.GCWriteBarrier[v.AuxInt-1]

	case ssa.OpPPC64LoweredPanicBoundsA, ssa.OpPPC64LoweredPanicBoundsB, ssa.OpPPC64LoweredPanicBoundsC:
		p := s.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = ssagen.BoundsCheckFunc[v.AuxInt]
		s.UseArgs(16) // space used in callee args area by assembly stubs

	case ssa.OpPPC64LoweredNilCheck:
		if buildcfg.GOOS == "aix" {
			// CMP Rarg0, $0
			// BNE 2(PC)
			// STW R0, 0(R0)
			// NOP (so the BNE has somewhere to land)

			// CMP Rarg0, $0
			p := s.Prog(ppc64.ACMP)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = v.Args[0].Reg()
			p.To.Type = obj.TYPE_CONST
			p.To.Offset = 0

			// BNE 2(PC)
			p2 := s.Prog(ppc64.ABNE)
			p2.To.Type = obj.TYPE_BRANCH

			// STW R0, 0(R0)
			// Write at 0 is forbidden and will trigger a SIGSEGV
			p = s.Prog(ppc64.AMOVW)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = ppc64.REG_R0
			p.To.Type = obj.TYPE_MEM
			p.To.Reg = ppc64.REG_R0

			// NOP (so the BNE has somewhere to land)
			nop := s.Prog(obj.ANOP)
			p2.To.SetTarget(nop)

		} else {
			// Issue a load which will fault if arg is nil.
			p := s.Prog(ppc64.AMOVBZ)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = v.Args[0].Reg()
			ssagen.AddAux(&p.From, v)
			p.To.Type = obj.TYPE_REG
			p.To.Reg = ppc64.REGTMP
		}
		if logopt.Enabled() {
			logopt.LogOpt(v.Pos, "nilcheck", "genssa", v.Block.Func.Name)
		}
		if base.Debug.Nil != 0 && v.Pos.Line() > 1 { // v.Pos.Line()==1 in generated wrappers
			base.WarnfAt(v.Pos, "generated nil check")
		}

	// These should be resolved by rules and not make it here.
	case ssa.OpPPC64Equal, ssa.OpPPC64NotEqual, ssa.OpPPC64LessThan, ssa.OpPPC64FLessThan,
		ssa.OpPPC64LessEqual, ssa.OpPPC64GreaterThan, ssa.OpPPC64FGreaterThan, ssa.OpPPC64GreaterEqual,
		ssa.OpPPC64FLessEqual, ssa.OpPPC64FGreaterEqual:
		v.Fatalf("Pseudo-op should not make it to codegen: %s ###\n", v.LongString())
	case ssa.OpPPC64InvertFlags:
		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())
	case ssa.OpPPC64FlagEQ, ssa.OpPPC64FlagLT, ssa.OpPPC64FlagGT:
		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())
	case ssa.OpClobber, ssa.OpClobberReg:
		// TODO: implement for clobberdead experiment. Nop is ok for now.
	default:
		v.Fatalf("genValue not implemented: %s", v.LongString())
	}
}

var blockJump = [...]struct {
	asm, invasm     obj.As
	asmeq, invasmun bool
}{
	ssa.BlockPPC64EQ: {ppc64.ABEQ, ppc64.ABNE, false, false},
	ssa.BlockPPC64NE: {ppc64.ABNE, ppc64.ABEQ, false, false},

	ssa.BlockPPC64LT: {ppc64.ABLT, ppc64.ABGE, false, false},
	ssa.BlockPPC64GE: {ppc64.ABGE, ppc64.ABLT, false, false},
	ssa.BlockPPC64LE: {ppc64.ABLE, ppc64.ABGT, false, false},
	ssa.BlockPPC64GT: {ppc64.ABGT, ppc64.ABLE, false, false},

	// TODO: need to work FP comparisons into block jumps
	ssa.BlockPPC64FLT: {ppc64.ABLT, ppc64.ABGE, false, false},
	ssa.BlockPPC64FGE: {ppc64.ABGT, ppc64.ABLT, true, true}, // GE = GT or EQ; !GE = LT or UN
	ssa.BlockPPC64FLE: {ppc64.ABLT, ppc64.ABGT, true, true}, // LE = LT or EQ; !LE = GT or UN
	ssa.BlockPPC64FGT: {ppc64.ABGT, ppc64.ABLE, false, false},
}
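
// For example, with FLE: if b.Succs[1] is next, ssaGenBlock emits
// "BLT succ0; BEQ succ0" (asmeq); if b.Succs[0] is next, it emits
// "BGT succ1; BVS succ1" (invasmun) so unordered results take the
// not-taken path.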

func ssaGenBlock(s *ssagen.State, b, next *ssa.Block) {
	switch b.Kind {
	case ssa.BlockDefer:
		// defer returns in R3:
		// 0 if we should continue executing
		// 1 if we should jump to deferreturn call
		p := s.Prog(ppc64.ACMP)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = ppc64.REG_R3
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = 0

		p = s.Prog(ppc64.ABNE)
		p.To.Type = obj.TYPE_BRANCH
		s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[1].Block()})
		if b.Succs[0].Block() != next {
			p := s.Prog(obj.AJMP)
			p.To.Type = obj.TYPE_BRANCH
			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
		}

	case ssa.BlockPlain:
		if b.Succs[0].Block() != next {
			p := s.Prog(obj.AJMP)
			p.To.Type = obj.TYPE_BRANCH
			s.Branches = append(s.Branches, ssagen.Branch{P: p, B: b.Succs[0].Block()})
		}
	case ssa.BlockExit, ssa.BlockRetJmp:
	case ssa.BlockRet:
		s.Prog(obj.ARET)

	case ssa.BlockPPC64EQ, ssa.BlockPPC64NE,
		ssa.BlockPPC64LT, ssa.BlockPPC64GE,
		ssa.BlockPPC64LE, ssa.BlockPPC64GT,
		ssa.BlockPPC64FLT, ssa.BlockPPC64FGE,
		ssa.BlockPPC64FLE, ssa.BlockPPC64FGT:
		jmp := blockJump[b.Kind]
		switch next {
		case b.Succs[0].Block():
			s.Br(jmp.invasm, b.Succs[1].Block())
			if jmp.invasmun {
				// TODO: The second branch is probably predict-not-taken since it is for FP unordered
				s.Br(ppc64.ABVS, b.Succs[1].Block())
			}
		case b.Succs[1].Block():
			s.Br(jmp.asm, b.Succs[0].Block())
			if jmp.asmeq {
				s.Br(ppc64.ABEQ, b.Succs[0].Block())
			}
		default:
			if b.Likely != ssa.BranchUnlikely {
				s.Br(jmp.asm, b.Succs[0].Block())
				if jmp.asmeq {
					s.Br(ppc64.ABEQ, b.Succs[0].Block())
				}
				s.Br(obj.AJMP, b.Succs[1].Block())
			} else {
				s.Br(jmp.invasm, b.Succs[1].Block())
				if jmp.invasmun {
					// TODO: The second branch is probably predict-not-taken since it is for FP unordered
					s.Br(ppc64.ABVS, b.Succs[1].Block())
				}
				s.Br(obj.AJMP, b.Succs[0].Block())
			}
		}
	default:
		b.Fatalf("branch not implemented: %s", b.LongString())
	}
}

func loadRegResult(s *ssagen.State, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
	p := s.Prog(loadByType(t))
	p.From.Type = obj.TYPE_MEM
	p.From.Name = obj.NAME_AUTO
	p.From.Sym = n.Linksym()
	p.From.Offset = n.FrameOffset() + off
	p.To.Type = obj.TYPE_REG
	p.To.Reg = reg
	return p
}

func spillArgReg(pp *objw.Progs, p *obj.Prog, f *ssa.Func, t *types.Type, reg int16, n *ir.Name, off int64) *obj.Prog {
	p = pp.Append(p, storeByType(t), obj.TYPE_REG, reg, 0, obj.TYPE_MEM, 0, n.FrameOffset()+off)
	p.To.Name = obj.NAME_PARAM
	p.To.Sym = n.Linksym()
	p.Pos = p.Pos.WithNotStmt()
	return p
}