func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
	s.SetLineno(v.Line)
	switch v.Op {
	case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL:
		r := gc.SSARegNum(v)
		r1 := gc.SSARegNum(v.Args[0])
		r2 := gc.SSARegNum(v.Args[1])
		switch {
		case r == r1:
			p := gc.Prog(v.Op.Asm())
			p.From.Type = obj.TYPE_REG
			p.From.Reg = r2
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
		case r == r2:
			p := gc.Prog(v.Op.Asm())
			p.From.Type = obj.TYPE_REG
			p.From.Reg = r1
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
		default:
			var asm obj.As
			if v.Op == ssa.OpAMD64ADDQ {
				asm = x86.ALEAQ
			} else {
				asm = x86.ALEAL
			}
			p := gc.Prog(asm)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = r1
			p.From.Scale = 1
			p.From.Index = r2
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
		}

	// 2-address opcode arithmetic
	case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL,
		ssa.OpAMD64MULQ, ssa.OpAMD64MULL,
		ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL,
		ssa.OpAMD64ORQ, ssa.OpAMD64ORL,
		ssa.OpAMD64XORQ, ssa.OpAMD64XORL,
		ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL,
		ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB,
		ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB,
		ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD,
		ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD,
		ssa.OpAMD64PXOR:
		r := gc.SSARegNum(v)
		if r != gc.SSARegNum(v.Args[0]) {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1]))

	case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW,
		ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU,
		ssa.OpAMD64MODQ, ssa.OpAMD64MODL, ssa.OpAMD64MODW,
		ssa.OpAMD64MODQU, ssa.OpAMD64MODLU, ssa.OpAMD64MODWU:

		// Arg[0] is already in AX as it's the only register we allow
		// and AX is the only output
		x := gc.SSARegNum(v.Args[1])

		// CPU faults upon signed overflow, which occurs when most
		// negative int is divided by -1.
		var j *obj.Prog
		if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL ||
			v.Op == ssa.OpAMD64DIVW || v.Op == ssa.OpAMD64MODQ ||
			v.Op == ssa.OpAMD64MODL || v.Op == ssa.OpAMD64MODW {

			var c *obj.Prog
			switch v.Op {
			case ssa.OpAMD64DIVQ, ssa.OpAMD64MODQ:
				c = gc.Prog(x86.ACMPQ)
				j = gc.Prog(x86.AJEQ)
				// go ahead and sign extend to save doing it later
				gc.Prog(x86.ACQO)

			case ssa.OpAMD64DIVL, ssa.OpAMD64MODL:
				c = gc.Prog(x86.ACMPL)
				j = gc.Prog(x86.AJEQ)
				gc.Prog(x86.ACDQ)

			case ssa.OpAMD64DIVW, ssa.OpAMD64MODW:
				c = gc.Prog(x86.ACMPW)
				j = gc.Prog(x86.AJEQ)
				gc.Prog(x86.ACWD)
			}
			c.From.Type = obj.TYPE_REG
			c.From.Reg = x
			c.To.Type = obj.TYPE_CONST
			c.To.Offset = -1
			j.To.Type = obj.TYPE_BRANCH
		}

		// for unsigned ints, we sign extend by setting DX = 0
		// signed ints were sign extended above
		if v.Op == ssa.OpAMD64DIVQU || v.Op == ssa.OpAMD64MODQU ||
			v.Op == ssa.OpAMD64DIVLU || v.Op == ssa.OpAMD64MODLU ||
			v.Op == ssa.OpAMD64DIVWU || v.Op == ssa.OpAMD64MODWU {
			c := gc.Prog(x86.AXORQ)
			c.From.Type = obj.TYPE_REG
			c.From.Reg = x86.REG_DX
			c.To.Type = obj.TYPE_REG
			c.To.Reg = x86.REG_DX
		}

		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = x

		// signed division, rest of the check for -1 case
		if j != nil {
			j2 := gc.Prog(obj.AJMP)
			j2.To.Type = obj.TYPE_BRANCH

			var n *obj.Prog
			if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL ||
				v.Op == ssa.OpAMD64DIVW {
				// n * -1 = -n
				n = gc.Prog(x86.ANEGQ)
				n.To.Type = obj.TYPE_REG
				n.To.Reg = x86.REG_AX
			} else {
				// n % -1 == 0
				n = gc.Prog(x86.AXORQ)
				n.From.Type = obj.TYPE_REG
				n.From.Reg = x86.REG_DX
				n.To.Type = obj.TYPE_REG
				n.To.Reg = x86.REG_DX
			}

			j.To.Val = n
			j2.To.Val = s.Pc()
		}

	case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULW, ssa.OpAMD64HMULB,
		ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU, ssa.OpAMD64HMULWU, ssa.OpAMD64HMULBU:
		// the frontend rewrites constant division by 8/16/32 bit integers into
		// HMUL by a constant
		// SSA rewrites generate the 64 bit versions

		// Arg[0] is already in AX as it's the only register we allow
		// and DX is the only output we care about (the high bits)
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[1])

		// IMULB puts the high portion in AH instead of DL,
		// so move it to DL for consistency
		if v.Type.Size() == 1 {
			m := gc.Prog(x86.AMOVB)
			m.From.Type = obj.TYPE_REG
			m.From.Reg = x86.REG_AH
			m.To.Type = obj.TYPE_REG
			m.To.Reg = x86.REG_DX
		}

	case ssa.OpAMD64AVGQU:
		// compute (x+y)/2 unsigned.
		// Do a 64-bit add, the overflow goes into the carry.
		// Shift right once and pull the carry back into the 63rd bit.
		r := gc.SSARegNum(v)
		if r != gc.SSARegNum(v.Args[0]) {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := gc.Prog(x86.AADDQ)
		p.From.Type = obj.TYPE_REG
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
		p.From.Reg = gc.SSARegNum(v.Args[1])
		p = gc.Prog(x86.ARCRQ)
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = 1
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst:
		r := gc.SSARegNum(v)
		a := gc.SSARegNum(v.Args[0])
		if r == a {
			if v.AuxInt == 1 {
				var asm obj.As
				// Software optimization manual recommends add $1,reg.
				// But inc/dec is 1 byte smaller. ICC always uses inc
				// Clang/GCC choose depending on flags, but prefer add.
				// Experiments show that inc/dec is both a little faster
				// and make a binary a little smaller.
				if v.Op == ssa.OpAMD64ADDQconst {
					asm = x86.AINCQ
				} else {
					asm = x86.AINCL
				}
				p := gc.Prog(asm)
				p.To.Type = obj.TYPE_REG
				p.To.Reg = r
				return
			}
			if v.AuxInt == -1 {
				var asm obj.As
				if v.Op == ssa.OpAMD64ADDQconst {
					asm = x86.ADECQ
				} else {
					asm = x86.ADECL
				}
				p := gc.Prog(asm)
				p.To.Type = obj.TYPE_REG
				p.To.Reg = r
				return
			}
			p := gc.Prog(v.Op.Asm())
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = v.AuxInt
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
			return
		}
		var asm obj.As
		if v.Op == ssa.OpAMD64ADDQconst {
			asm = x86.ALEAQ
		} else {
			asm = x86.ALEAL
		}
		p := gc.Prog(asm)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = a
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpAMD64CMOVQEQconst, ssa.OpAMD64CMOVLEQconst, ssa.OpAMD64CMOVWEQconst,
		ssa.OpAMD64CMOVQNEconst, ssa.OpAMD64CMOVLNEconst, ssa.OpAMD64CMOVWNEconst:
		r := gc.SSARegNum(v)
		if r != gc.SSARegNum(v.Args[0]) {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}

		// Constant into AX
		p := gc.Prog(moveByType(v.Type))
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = x86.REG_AX

		p = gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = x86.REG_AX
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst:
		r := gc.SSARegNum(v)
		if r != gc.SSARegNum(v.Args[0]) {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
		// TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2
		// then we don't need to use resultInArg0 for these ops.
		//p.From3 = new(obj.Addr)
		//p.From3.Type = obj.TYPE_REG
		//p.From3.Reg = gc.SSARegNum(v.Args[0])

	case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst,
		ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst,
		ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst,
		ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst,
		ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst,
		ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst,
		ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst,
		ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst:
		r := gc.SSARegNum(v)
		if r != gc.SSARegNum(v.Args[0]) {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask:
		r := gc.SSARegNum(v)
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8:
		r := gc.SSARegNum(v.Args[0])
		i := gc.SSARegNum(v.Args[1])
		p := gc.Prog(x86.ALEAQ)
		switch v.Op {
		case ssa.OpAMD64LEAQ1:
			p.From.Scale = 1
			if i == x86.REG_SP {
				r, i = i, r
			}
		case ssa.OpAMD64LEAQ2:
			p.From.Scale = 2
		case ssa.OpAMD64LEAQ4:
			p.From.Scale = 4
		case ssa.OpAMD64LEAQ8:
			p.From.Scale = 8
		}
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = r
		p.From.Index = i
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64LEAQ:
		p := gc.Prog(x86.ALEAQ)
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB,
		ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB:
		opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[0]))
	case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD:
		// Go assembler has swapped operands for UCOMISx relative to CMP,
		// must account for that right here.
		opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1]))

	case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[0])
		p.To.Type = obj.TYPE_CONST
		p.To.Offset = v.AuxInt

	case ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v.Args[0])

	case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst:
		x := gc.SSARegNum(v)
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = x
		// If flags are live at this instruction, suppress the
		// MOV $0,AX -> XOR AX,AX optimization.
		if v.Aux != nil {
			p.Mark |= x86.PRESERVEFLAGS
		}

	case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst:
		x := gc.SSARegNum(v)
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_FCONST
		p.From.Val = math.Float64frombits(uint64(v.AuxInt))
		p.To.Type = obj.TYPE_REG
		p.To.Reg = x

	case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload,
		ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload,
		ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.From, v)
		p.From.Scale = 8
		p.From.Index = gc.SSARegNum(v.Args[1])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.From, v)
		p.From.Scale = 4
		p.From.Index = gc.SSARegNum(v.Args[1])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64MOVWloadidx2:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.From, v)
		p.From.Scale = 2
		p.From.Index = gc.SSARegNum(v.Args[1])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1,
		ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1:
		r := gc.SSARegNum(v.Args[0])
		i := gc.SSARegNum(v.Args[1])
		if i == x86.REG_SP {
			r, i = i, r
		}
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = r
		p.From.Scale = 1
		p.From.Index = i
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore,
		ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[1])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.To, v)

	case ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[2])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = gc.SSARegNum(v.Args[0])
		p.To.Scale = 8
		p.To.Index = gc.SSARegNum(v.Args[1])
		gc.AddAux(&p.To, v)

	case ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[2])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = gc.SSARegNum(v.Args[0])
		p.To.Scale = 4
		p.To.Index = gc.SSARegNum(v.Args[1])
		gc.AddAux(&p.To, v)

	case ssa.OpAMD64MOVWstoreidx2:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[2])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = gc.SSARegNum(v.Args[0])
		p.To.Scale = 2
		p.To.Index = gc.SSARegNum(v.Args[1])
		gc.AddAux(&p.To, v)

	case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1,
		ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1:
		r := gc.SSARegNum(v.Args[0])
		i := gc.SSARegNum(v.Args[1])
		if i == x86.REG_SP {
			r, i = i, r
		}
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[2])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = r
		p.To.Scale = 1
		p.To.Index = i
		gc.AddAux(&p.To, v)

	case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		sc := v.AuxValAndOff()
		p.From.Offset = sc.Val()
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux2(&p.To, v, sc.Off())

	case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1,
		ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2,
		ssa.OpAMD64MOVBstoreconstidx1:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		sc := v.AuxValAndOff()
		p.From.Offset = sc.Val()
		r := gc.SSARegNum(v.Args[0])
		i := gc.SSARegNum(v.Args[1])
		switch v.Op {
		case ssa.OpAMD64MOVBstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx1:
			p.To.Scale = 1
			if i == x86.REG_SP {
				r, i = i, r
			}
		case ssa.OpAMD64MOVWstoreconstidx2:
			p.To.Scale = 2
		case ssa.OpAMD64MOVLstoreconstidx4:
			p.To.Scale = 4
		case ssa.OpAMD64MOVQstoreconstidx8:
			p.To.Scale = 8
		}
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = r
		p.To.Index = i
		gc.AddAux2(&p.To, v, sc.Off())

	case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX,
		ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX,
		ssa.OpAMD64CVTSL2SS, ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSQ2SD,
		ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ,
		ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS:
		opregreg(v.Op.Asm(), gc.SSARegNum(v), gc.SSARegNum(v.Args[0]))

	case ssa.OpAMD64DUFFZERO:
		off := duffStart(v.AuxInt)
		adj := duffAdj(v.AuxInt)
		var p *obj.Prog
		if adj != 0 {
			p = gc.Prog(x86.AADDQ)
			p.From.Type = obj.TYPE_CONST
			p.From.Offset = adj
			p.To.Type = obj.TYPE_REG
			p.To.Reg = x86.REG_DI
		}
		p = gc.Prog(obj.ADUFFZERO)
		p.To.Type = obj.TYPE_ADDR
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
		p.To.Offset = off

	case ssa.OpAMD64MOVOconst:
		if v.AuxInt != 0 {
			v.Unimplementedf("MOVOconst can only do constant=0")
		}
		r := gc.SSARegNum(v)
		opregreg(x86.AXORPS, r, r)

	case ssa.OpAMD64DUFFCOPY:
		p := gc.Prog(obj.ADUFFCOPY)
		p.To.Type = obj.TYPE_ADDR
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))
		p.To.Offset = v.AuxInt

	case ssa.OpCopy, ssa.OpAMD64MOVQconvert: // TODO: use MOVQreg for reg->reg copies instead of OpCopy?
		if v.Type.IsMemory() {
			return
		}
		x := gc.SSARegNum(v.Args[0])
		y := gc.SSARegNum(v)
		if x != y {
			opregreg(moveByType(v.Type), y, x)
		}

	case ssa.OpLoadReg:
		if v.Type.IsFlags() {
			v.Unimplementedf("load flags not implemented: %v", v.LongString())
			return
		}
		p := gc.Prog(loadByType(v.Type))
		n, off := gc.AutoVar(v.Args[0])
		p.From.Type = obj.TYPE_MEM
		p.From.Node = n
		p.From.Sym = gc.Linksym(n.Sym)
		p.From.Offset = off
		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
			p.From.Name = obj.NAME_PARAM
			p.From.Offset += n.Xoffset
		} else {
			p.From.Name = obj.NAME_AUTO
		}
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpStoreReg:
		if v.Type.IsFlags() {
			v.Unimplementedf("store flags not implemented: %v", v.LongString())
			return
		}
		p := gc.Prog(storeByType(v.Type))
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[0])
		n, off := gc.AutoVar(v)
		p.To.Type = obj.TYPE_MEM
		p.To.Node = n
		p.To.Sym = gc.Linksym(n.Sym)
		p.To.Offset = off
		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
			p.To.Name = obj.NAME_PARAM
			p.To.Offset += n.Xoffset
		} else {
			p.To.Name = obj.NAME_AUTO
		}

	case ssa.OpPhi:
		// just check to make sure regalloc and stackalloc did it right
		if v.Type.IsMemory() {
			return
		}
		f := v.Block.Func
		loc := f.RegAlloc[v.ID]
		for _, a := range v.Args {
			if aloc := f.RegAlloc[a.ID]; aloc != loc { // TODO: .Equal() instead?
				v.Fatalf("phi arg at different location than phi: %v @ %v, but arg %v @ %v\n%s\n", v, loc, a, aloc, v.Block.Func)
			}
		}

	case ssa.OpInitMem:
		// memory arg needs no code

	case ssa.OpArg:
		// input args need no code

	case ssa.OpAMD64LoweredGetClosurePtr:
		// Output is hardwired to DX only,
		// and DX contains the closure pointer on
		// closure entry, and this "instruction"
		// is scheduled to the very beginning
		// of the entry block.

	case ssa.OpAMD64LoweredGetG:
		r := gc.SSARegNum(v)
		// See the comments in cmd/internal/obj/x86/obj6.go
		// near CanUse1InsnTLS for a detailed explanation of these instructions.
		if x86.CanUse1InsnTLS(gc.Ctxt) {
			// MOVQ (TLS), r
			p := gc.Prog(x86.AMOVQ)
			p.From.Type = obj.TYPE_MEM
			p.From.Reg = x86.REG_TLS
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
		} else {
			// MOVQ TLS, r
			// MOVQ (r)(TLS*1), r
			p := gc.Prog(x86.AMOVQ)
			p.From.Type = obj.TYPE_REG
			p.From.Reg = x86.REG_TLS
			p.To.Type = obj.TYPE_REG
			p.To.Reg = r
			q := gc.Prog(x86.AMOVQ)
			q.From.Type = obj.TYPE_MEM
			q.From.Reg = r
			q.From.Index = x86.REG_TLS
			q.From.Scale = 1
			q.To.Type = obj.TYPE_REG
			q.To.Reg = r
		}

	case ssa.OpAMD64CALLstatic:
		if v.Aux.(*gc.Sym) == gc.Deferreturn.Sym {
			// Deferred calls will appear to be returning to
			// the CALL deferreturn(SB) that we are about to emit.
			// However, the stack trace code will show the line
			// of the instruction byte before the return PC.
			// To avoid that being an unrelated instruction,
			// insert an actual hardware NOP that will have the right line number.
			// This is different from obj.ANOP, which is a virtual no-op
			// that doesn't make it into the instruction stream.
			ginsnop()
		}
		p := gc.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym))
		if gc.Maxarg < v.AuxInt {
			gc.Maxarg = v.AuxInt
		}

	case ssa.OpAMD64CALLclosure:
		p := gc.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v.Args[0])
		if gc.Maxarg < v.AuxInt {
			gc.Maxarg = v.AuxInt
		}

	case ssa.OpAMD64CALLdefer:
		p := gc.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = gc.Linksym(gc.Deferproc.Sym)
		if gc.Maxarg < v.AuxInt {
			gc.Maxarg = v.AuxInt
		}

	case ssa.OpAMD64CALLgo:
		p := gc.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = gc.Linksym(gc.Newproc.Sym)
		if gc.Maxarg < v.AuxInt {
			gc.Maxarg = v.AuxInt
		}

	case ssa.OpAMD64CALLinter:
		p := gc.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v.Args[0])
		if gc.Maxarg < v.AuxInt {
			gc.Maxarg = v.AuxInt
		}

	case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL,
		ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL,
		ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL:
		r := gc.SSARegNum(v)
		if r != gc.SSARegNum(v.Args[0]) {
			v.Fatalf("input[0] and output not in same register %s", v.LongString())
		}
		p := gc.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r

	case ssa.OpAMD64BSFQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSFW,
		ssa.OpAMD64BSRQ, ssa.OpAMD64BSRL, ssa.OpAMD64BSRW,
		ssa.OpAMD64SQRTSD:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[0])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpSP, ssa.OpSB:
		// nothing to do

	case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE,
		ssa.OpAMD64SETL, ssa.OpAMD64SETLE,
		ssa.OpAMD64SETG, ssa.OpAMD64SETGE,
		ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF,
		ssa.OpAMD64SETB, ssa.OpAMD64SETBE,
		ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN,
		ssa.OpAMD64SETA, ssa.OpAMD64SETAE:
		p := gc.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)

	case ssa.OpAMD64SETNEF:
		p := gc.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)
		q := gc.Prog(x86.ASETPS)
		q.To.Type = obj.TYPE_REG
		q.To.Reg = x86.REG_AX
		// ORL avoids partial register write and is smaller than ORQ, used by old compiler
		opregreg(x86.AORL, gc.SSARegNum(v), x86.REG_AX)

	case ssa.OpAMD64SETEQF:
		p := gc.Prog(v.Op.Asm())
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)
		q := gc.Prog(x86.ASETPC)
		q.To.Type = obj.TYPE_REG
		q.To.Reg = x86.REG_AX
		// ANDL avoids partial register write and is smaller than ANDQ, used by old compiler
		opregreg(x86.AANDL, gc.SSARegNum(v), x86.REG_AX)

	case ssa.OpAMD64InvertFlags:
		v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString())

	case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT:
		v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString())

	case ssa.OpAMD64REPSTOSQ:
		gc.Prog(x86.AREP)
		gc.Prog(x86.ASTOSQ)

	case ssa.OpAMD64REPMOVSQ:
		gc.Prog(x86.AREP)
		gc.Prog(x86.AMOVSQ)

	case ssa.OpVarDef:
		gc.Gvardef(v.Aux.(*gc.Node))

	case ssa.OpVarKill:
		gc.Gvarkill(v.Aux.(*gc.Node))

	case ssa.OpVarLive:
		gc.Gvarlive(v.Aux.(*gc.Node))

	case ssa.OpKeepAlive:
		if !v.Args[0].Type.IsPtrShaped() {
			v.Fatalf("keeping non-pointer alive %v", v.Args[0])
		}
		n, off := gc.AutoVar(v.Args[0])
		if n == nil {
			v.Fatalf("KeepLive with non-spilled value %s %s", v, v.Args[0])
		}
		if off != 0 {
			v.Fatalf("KeepLive with non-zero offset spill location %s:%d", n, off)
		}
		gc.Gvarlive(n)

	case ssa.OpAMD64LoweredNilCheck:
		// Optimization - if the subsequent block has a load or store
		// at the same address, we don't need to issue this instruction.
		mem := v.Args[1]
		for _, w := range v.Block.Succs[0].Block().Values {
			if w.Op == ssa.OpPhi {
				if w.Type.IsMemory() {
					mem = w
				}
				continue
			}
			if len(w.Args) == 0 || !w.Args[len(w.Args)-1].Type.IsMemory() {
				// w doesn't use a store - can't be a memory op.
				continue
			}
			if w.Args[len(w.Args)-1] != mem {
				v.Fatalf("wrong store after nilcheck v=%s w=%s", v, w)
			}
			switch w.Op {
			case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload,
				ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore,
				ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload,
				ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVOload,
				ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVOstore:
				if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
						gc.Warnl(v.Line, "removed nil check")
					}
					return
				}
			case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst:
				off := ssa.ValAndOff(v.AuxInt).Off()
				if w.Args[0] == v.Args[0] && w.Aux == nil && off >= 0 && off < minZeroPage {
					if gc.Debug_checknil != 0 && int(v.Line) > 1 {
						gc.Warnl(v.Line, "removed nil check")
					}
					return
				}
			}
			if w.Type.IsMemory() {
				if w.Op == ssa.OpVarDef || w.Op == ssa.OpVarKill || w.Op == ssa.OpVarLive {
					// these ops are OK
					mem = w
					continue
				}
				// We can't delay the nil check past the next store.
				break
			}
		}
		// Issue a load which will fault if the input is nil.
		// TODO: We currently use the 2-byte instruction TESTB AX, (reg).
		// Should we use the 3-byte TESTB $0, (reg) instead? It is larger
		// but it doesn't have false dependency on AX.
		// Or maybe allocate an output register and use MOVL (reg),reg2 ?
		// That trades clobbering flags for clobbering a register.
		p := gc.Prog(x86.ATESTB)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = x86.REG_AX
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.To, v)
		if gc.Debug_checknil != 0 && v.Line > 1 { // v.Line==1 in generated wrappers
			gc.Warnl(v.Line, "generated nil check")
		}

	default:
		v.Unimplementedf("genValue not implemented: %s", v.LongString())
	}
}
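// Illustrative sketch, not part of the compiler: the OpAMD64AVGQU case above
// computes an unsigned (x+y)/2 without losing the bit that overflows out of
// the 64-bit add (ADDQ leaves it in the carry flag, RCRQ shifts it back in).
// The same value can be written in portable Go as below; the helper name
// avgU64 is hypothetical and only illustrates the identity being used.
func avgU64(x, y uint64) uint64 {
	// x/2 + y/2 drops the low bit of each operand; when both operands are
	// odd the true sum carries an extra 1, which x&y&1 restores.
	return x/2 + y/2 + (x & y & 1)
}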
// blockcopy copies w bytes from &n to &res func blockcopy(n, res *gc.Node, osrc, odst, w int64) { var dst gc.Node var src gc.Node if n.Ullman >= res.Ullman { gc.Agenr(n, &dst, res) // temporarily use dst gc.Regalloc(&src, gc.Types[gc.Tptr], nil) gins(s390x.AMOVD, &dst, &src) if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agen(res, &dst) } else { if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agenr(res, &dst, res) gc.Agenr(n, &src, nil) } defer gc.Regfree(&src) defer gc.Regfree(&dst) var tmp gc.Node gc.Regalloc(&tmp, gc.Types[gc.Tptr], nil) defer gc.Regfree(&tmp) offset := int64(0) dir := _FORWARDS if osrc < odst && odst < osrc+w { // Reverse. Can't use MVC, fall back onto basic moves. dir = _BACKWARDS const copiesPerIter = 2 if w >= 8*copiesPerIter { cnt := w - (w % (8 * copiesPerIter)) ginscon(s390x.AADD, w, &src) ginscon(s390x.AADD, w, &dst) var end gc.Node gc.Regalloc(&end, gc.Types[gc.Tptr], nil) p := gins(s390x.ASUB, nil, &end) p.From.Type = obj.TYPE_CONST p.From.Offset = cnt p.Reg = src.Reg var label *obj.Prog for i := 0; i < copiesPerIter; i++ { offset := int64(-8 * (i + 1)) p := gins(s390x.AMOVD, &src, &tmp) p.From.Type = obj.TYPE_MEM p.From.Offset = offset if i == 0 { label = p } p = gins(s390x.AMOVD, &tmp, &dst) p.To.Type = obj.TYPE_MEM p.To.Offset = offset } ginscon(s390x.ASUB, 8*copiesPerIter, &src) ginscon(s390x.ASUB, 8*copiesPerIter, &dst) gins(s390x.ACMP, &src, &end) gc.Patch(gc.Gbranch(s390x.ABNE, nil, 0), label) gc.Regfree(&end) w -= cnt } else { offset = w } } if dir == _FORWARDS && w > 1024 { // Loop over MVCs cnt := w - (w % 256) var end gc.Node gc.Regalloc(&end, gc.Types[gc.Tptr], nil) add := gins(s390x.AADD, nil, &end) add.From.Type = obj.TYPE_CONST add.From.Offset = cnt add.Reg = src.Reg mvc := gins(s390x.AMVC, &src, &dst) mvc.From.Type = obj.TYPE_MEM mvc.From.Offset = 0 mvc.To.Type = obj.TYPE_MEM mvc.To.Offset = 0 mvc.From3 = new(obj.Addr) mvc.From3.Type = obj.TYPE_CONST mvc.From3.Offset = 256 ginscon(s390x.AADD, 256, &src) ginscon(s390x.AADD, 256, &dst) gins(s390x.ACMP, &src, &end) gc.Patch(gc.Gbranch(s390x.ABNE, nil, 0), mvc) gc.Regfree(&end) w -= cnt } for w > 0 { cnt := w // If in reverse we can only do 8, 4, 2 or 1 bytes at a time. if dir == _BACKWARDS { switch { case cnt >= 8: cnt = 8 case cnt >= 4: cnt = 4 case cnt >= 2: cnt = 2 } } else if cnt > 256 { cnt = 256 } switch cnt { case 8, 4, 2, 1: op := s390x.AMOVB switch cnt { case 8: op = s390x.AMOVD case 4: op = s390x.AMOVW case 2: op = s390x.AMOVH } load := gins(op, &src, &tmp) load.From.Type = obj.TYPE_MEM load.From.Offset = offset store := gins(op, &tmp, &dst) store.To.Type = obj.TYPE_MEM store.To.Offset = offset if dir == _BACKWARDS { load.From.Offset -= cnt store.To.Offset -= cnt } default: p := gins(s390x.AMVC, &src, &dst) p.From.Type = obj.TYPE_MEM p.From.Offset = offset p.To.Type = obj.TYPE_MEM p.To.Offset = offset p.From3 = new(obj.Addr) p.From3.Type = obj.TYPE_CONST p.From3.Offset = cnt } switch dir { case _FORWARDS: offset += cnt case _BACKWARDS: offset -= cnt } w -= cnt } }
func blockcopy(n, res *gc.Node, osrc, odst, w int64) { // determine alignment. // want to avoid unaligned access, so have to use // smaller operations for less aligned types. // for example moving [4]byte must use 4 MOVB not 1 MOVW. align := int(n.Type.Align) var op obj.As switch align { default: gc.Fatalf("sgen: invalid alignment %d for %v", align, n.Type) case 1: op = ppc64.AMOVBU case 2: op = ppc64.AMOVHU case 4: op = ppc64.AMOVWZU // there is no lwau, only lwaux case 8: op = ppc64.AMOVDU } if w%int64(align) != 0 { gc.Fatalf("sgen: unaligned size %d (align=%d) for %v", w, align, n.Type) } c := int32(w / int64(align)) // if we are copying forward on the stack and // the src and dst overlap, then reverse direction dir := align if osrc < odst && odst < osrc+w { dir = -dir } var dst gc.Node var src gc.Node if n.Ullman >= res.Ullman { gc.Agenr(n, &dst, res) // temporarily use dst gc.Regalloc(&src, gc.Types[gc.Tptr], nil) gins(ppc64.AMOVD, &dst, &src) if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agen(res, &dst) } else { if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agenr(res, &dst, res) gc.Agenr(n, &src, nil) } var tmp gc.Node gc.Regalloc(&tmp, gc.Types[gc.Tptr], nil) // set up end marker var nend gc.Node // move src and dest to the end of block if necessary if dir < 0 { if c >= 4 { gc.Regalloc(&nend, gc.Types[gc.Tptr], nil) gins(ppc64.AMOVD, &src, &nend) } p := gins(ppc64.AADD, nil, &src) p.From.Type = obj.TYPE_CONST p.From.Offset = w p = gins(ppc64.AADD, nil, &dst) p.From.Type = obj.TYPE_CONST p.From.Offset = w } else { p := gins(ppc64.AADD, nil, &src) p.From.Type = obj.TYPE_CONST p.From.Offset = int64(-dir) p = gins(ppc64.AADD, nil, &dst) p.From.Type = obj.TYPE_CONST p.From.Offset = int64(-dir) if c >= 4 { gc.Regalloc(&nend, gc.Types[gc.Tptr], nil) p := gins(ppc64.AMOVD, &src, &nend) p.From.Type = obj.TYPE_ADDR p.From.Offset = w } } // move // TODO: enable duffcopy for larger copies. if c >= 4 { p := gins(op, &src, &tmp) p.From.Type = obj.TYPE_MEM p.From.Offset = int64(dir) ploop := p p = gins(op, &tmp, &dst) p.To.Type = obj.TYPE_MEM p.To.Offset = int64(dir) p = gins(ppc64.ACMP, &src, &nend) gc.Patch(gc.Gbranch(ppc64.ABNE, nil, 0), ploop) gc.Regfree(&nend) } else { // TODO(austin): Instead of generating ADD $-8,R8; ADD // $-8,R7; n*(MOVDU 8(R8),R9; MOVDU R9,8(R7);) just // generate the offsets directly and eliminate the // ADDs. That will produce shorter, more // pipeline-able code. var p *obj.Prog for ; c > 0; c-- { p = gins(op, &src, &tmp) p.From.Type = obj.TYPE_MEM p.From.Offset = int64(dir) p = gins(op, &tmp, &dst) p.To.Type = obj.TYPE_MEM p.To.Offset = int64(dir) } } gc.Regfree(&dst) gc.Regfree(&src) gc.Regfree(&tmp) }
func blockcopy(n, ns *gc.Node, osrc, odst, w int64) { var noddi gc.Node gc.Nodreg(&noddi, gc.Types[gc.Tptr], x86.REG_DI) var nodsi gc.Node gc.Nodreg(&nodsi, gc.Types[gc.Tptr], x86.REG_SI) var nodl gc.Node var nodr gc.Node if n.Ullman >= ns.Ullman { gc.Agenr(n, &nodr, &nodsi) if ns.Op == gc.ONAME { gc.Gvardef(ns) } gc.Agenr(ns, &nodl, &noddi) } else { if ns.Op == gc.ONAME { gc.Gvardef(ns) } gc.Agenr(ns, &nodl, &noddi) gc.Agenr(n, &nodr, &nodsi) } if nodl.Reg != x86.REG_DI { gmove(&nodl, &noddi) } if nodr.Reg != x86.REG_SI { gmove(&nodr, &nodsi) } gc.Regfree(&nodl) gc.Regfree(&nodr) c := w % 8 // bytes q := w / 8 // quads var oldcx gc.Node var cx gc.Node savex(x86.REG_CX, &cx, &oldcx, nil, gc.Types[gc.TINT64]) // if we are copying forward on the stack and // the src and dst overlap, then reverse direction if osrc < odst && odst < osrc+w { // reverse direction gins(x86.ASTD, nil, nil) // set direction flag if c > 0 { gconreg(addptr, w-1, x86.REG_SI) gconreg(addptr, w-1, x86.REG_DI) gconreg(movptr, c, x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSB, nil, nil) // MOVB *(SI)-,*(DI)- } if q > 0 { if c > 0 { gconreg(addptr, -7, x86.REG_SI) gconreg(addptr, -7, x86.REG_DI) } else { gconreg(addptr, w-8, x86.REG_SI) gconreg(addptr, w-8, x86.REG_DI) } gconreg(movptr, q, x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)-,*(DI)- } // we leave with the flag clear gins(x86.ACLD, nil, nil) } else { // normal direction if q > 128 || (gc.Nacl && q >= 4) || (obj.Getgoos() == "plan9" && q >= 4) { gconreg(movptr, q, x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+ } else if q >= 4 { var oldx0 gc.Node var x0 gc.Node savex(x86.REG_X0, &x0, &oldx0, nil, gc.Types[gc.TFLOAT64]) p := gins(obj.ADUFFCOPY, nil, nil) p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg)) // 64 blocks taking 14 bytes each // see ../../../../runtime/mkduff.go p.To.Offset = 14 * (64 - q/2) restx(&x0, &oldx0) if q%2 != 0 { gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+ } } else if !gc.Nacl && c == 0 { // We don't need the MOVSQ side-effect of updating SI and DI, // and issuing a sequence of MOVQs directly is faster. nodsi.Op = gc.OINDREG noddi.Op = gc.OINDREG for q > 0 { gmove(&nodsi, &cx) // MOVQ x+(SI),CX gmove(&cx, &noddi) // MOVQ CX,x+(DI) nodsi.Xoffset += 8 noddi.Xoffset += 8 q-- } } else { for q > 0 { gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+ q-- } } // copy the remaining c bytes if w < 4 || c <= 1 || (odst < osrc && osrc < odst+w) { for c > 0 { gins(x86.AMOVSB, nil, nil) // MOVB *(SI)+,*(DI)+ c-- } } else if w < 8 || c <= 4 { nodsi.Op = gc.OINDREG noddi.Op = gc.OINDREG cx.Type = gc.Types[gc.TINT32] nodsi.Type = gc.Types[gc.TINT32] noddi.Type = gc.Types[gc.TINT32] if c > 4 { nodsi.Xoffset = 0 noddi.Xoffset = 0 gmove(&nodsi, &cx) gmove(&cx, &noddi) } nodsi.Xoffset = c - 4 noddi.Xoffset = c - 4 gmove(&nodsi, &cx) gmove(&cx, &noddi) } else { nodsi.Op = gc.OINDREG noddi.Op = gc.OINDREG cx.Type = gc.Types[gc.TINT64] nodsi.Type = gc.Types[gc.TINT64] noddi.Type = gc.Types[gc.TINT64] nodsi.Xoffset = c - 8 noddi.Xoffset = c - 8 gmove(&nodsi, &cx) gmove(&cx, &noddi) } } restx(&cx, &oldcx) }
func blockcopy(n, res *gc.Node, osrc, odst, w int64) { var dst gc.Node gc.Nodreg(&dst, gc.Types[gc.Tptr], x86.REG_DI) var src gc.Node gc.Nodreg(&src, gc.Types[gc.Tptr], x86.REG_SI) var tsrc gc.Node gc.Tempname(&tsrc, gc.Types[gc.Tptr]) var tdst gc.Node gc.Tempname(&tdst, gc.Types[gc.Tptr]) if !n.Addable { gc.Agen(n, &tsrc) } if !res.Addable { gc.Agen(res, &tdst) } if n.Addable { gc.Agen(n, &src) } else { gmove(&tsrc, &src) } if res.Op == gc.ONAME { gc.Gvardef(res) } if res.Addable { gc.Agen(res, &dst) } else { gmove(&tdst, &dst) } c := int32(w % 4) // bytes q := int32(w / 4) // doublewords // if we are copying forward on the stack and // the src and dst overlap, then reverse direction if osrc < odst && odst < osrc+w { // reverse direction gins(x86.ASTD, nil, nil) // set direction flag if c > 0 { gconreg(x86.AADDL, w-1, x86.REG_SI) gconreg(x86.AADDL, w-1, x86.REG_DI) gconreg(x86.AMOVL, int64(c), x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSB, nil, nil) // MOVB *(SI)-,*(DI)- } if q > 0 { if c > 0 { gconreg(x86.AADDL, -3, x86.REG_SI) gconreg(x86.AADDL, -3, x86.REG_DI) } else { gconreg(x86.AADDL, w-4, x86.REG_SI) gconreg(x86.AADDL, w-4, x86.REG_DI) } gconreg(x86.AMOVL, int64(q), x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSL, nil, nil) // MOVL *(SI)-,*(DI)- } // we leave with the flag clear gins(x86.ACLD, nil, nil) } else { gins(x86.ACLD, nil, nil) // paranoia. TODO(rsc): remove? // normal direction if q > 128 || (q >= 4 && gc.Nacl) { gconreg(x86.AMOVL, int64(q), x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSL, nil, nil) // MOVL *(SI)+,*(DI)+ } else if q >= 4 { p := gins(obj.ADUFFCOPY, nil, nil) p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg)) // 10 and 128 = magic constants: see ../../runtime/asm_386.s p.To.Offset = 10 * (128 - int64(q)) } else if !gc.Nacl && c == 0 { var cx gc.Node gc.Nodreg(&cx, gc.Types[gc.TINT32], x86.REG_CX) // We don't need the MOVSL side-effect of updating SI and DI, // and issuing a sequence of MOVLs directly is faster. src.Op = gc.OINDREG dst.Op = gc.OINDREG for q > 0 { gmove(&src, &cx) // MOVL x+(SI),CX gmove(&cx, &dst) // MOVL CX,x+(DI) src.Xoffset += 4 dst.Xoffset += 4 q-- } } else { for q > 0 { gins(x86.AMOVSL, nil, nil) // MOVL *(SI)+,*(DI)+ q-- } } for c > 0 { gins(x86.AMOVSB, nil, nil) // MOVB *(SI)+,*(DI)+ c-- } } }
func blockcopy(n, res *gc.Node, osrc, odst, w int64) { // determine alignment. // want to avoid unaligned access, so have to use // smaller operations for less aligned types. // for example moving [4]byte must use 4 MOVB not 1 MOVW. align := int(n.Type.Align) var op obj.As switch align { default: gc.Fatalf("sgen: invalid alignment %d for %v", align, n.Type) case 1: op = arm.AMOVB case 2: op = arm.AMOVH case 4: op = arm.AMOVW } if w%int64(align) != 0 { gc.Fatalf("sgen: unaligned size %d (align=%d) for %v", w, align, n.Type) } c := int32(w / int64(align)) if osrc%int64(align) != 0 || odst%int64(align) != 0 { gc.Fatalf("sgen: unaligned offset src %d or dst %d (align %d)", osrc, odst, align) } // if we are copying forward on the stack and // the src and dst overlap, then reverse direction dir := align if osrc < odst && odst < osrc+w { dir = -dir } if op == arm.AMOVW && !gc.Nacl && dir > 0 && c >= 4 && c <= 128 { var r0 gc.Node r0.Op = gc.OREGISTER r0.Reg = arm.REG_R0 var r1 gc.Node r1.Op = gc.OREGISTER r1.Reg = arm.REG_R0 + 1 var r2 gc.Node r2.Op = gc.OREGISTER r2.Reg = arm.REG_R0 + 2 var src gc.Node gc.Regalloc(&src, gc.Types[gc.Tptr], &r1) var dst gc.Node gc.Regalloc(&dst, gc.Types[gc.Tptr], &r2) if n.Ullman >= res.Ullman { // eval n first gc.Agen(n, &src) if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agen(res, &dst) } else { // eval res first if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agen(res, &dst) gc.Agen(n, &src) } var tmp gc.Node gc.Regalloc(&tmp, gc.Types[gc.Tptr], &r0) f := gc.Sysfunc("duffcopy") p := gins(obj.ADUFFCOPY, nil, f) gc.Afunclit(&p.To, f) // 8 and 128 = magic constants: see ../../runtime/asm_arm.s p.To.Offset = 8 * (128 - int64(c)) gc.Regfree(&tmp) gc.Regfree(&src) gc.Regfree(&dst) return } var dst gc.Node var src gc.Node if n.Ullman >= res.Ullman { gc.Agenr(n, &dst, res) // temporarily use dst gc.Regalloc(&src, gc.Types[gc.Tptr], nil) gins(arm.AMOVW, &dst, &src) if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agen(res, &dst) } else { if res.Op == gc.ONAME { gc.Gvardef(res) } gc.Agenr(res, &dst, res) gc.Agenr(n, &src, nil) } var tmp gc.Node gc.Regalloc(&tmp, gc.Types[gc.TUINT32], nil) // set up end marker var nend gc.Node if c >= 4 { gc.Regalloc(&nend, gc.Types[gc.TUINT32], nil) p := gins(arm.AMOVW, &src, &nend) p.From.Type = obj.TYPE_ADDR if dir < 0 { p.From.Offset = int64(dir) } else { p.From.Offset = w } } // move src and dest to the end of block if necessary if dir < 0 { p := gins(arm.AMOVW, &src, &src) p.From.Type = obj.TYPE_ADDR p.From.Offset = w + int64(dir) p = gins(arm.AMOVW, &dst, &dst) p.From.Type = obj.TYPE_ADDR p.From.Offset = w + int64(dir) } // move if c >= 4 { p := gins(op, &src, &tmp) p.From.Type = obj.TYPE_MEM p.From.Offset = int64(dir) p.Scond |= arm.C_PBIT ploop := p p = gins(op, &tmp, &dst) p.To.Type = obj.TYPE_MEM p.To.Offset = int64(dir) p.Scond |= arm.C_PBIT p = gins(arm.ACMP, &src, nil) raddr(&nend, p) gc.Patch(gc.Gbranch(arm.ABNE, nil, 0), ploop) gc.Regfree(&nend) } else { var p *obj.Prog for ; c > 0; c-- { p = gins(op, &src, &tmp) p.From.Type = obj.TYPE_MEM p.From.Offset = int64(dir) p.Scond |= arm.C_PBIT p = gins(op, &tmp, &dst) p.To.Type = obj.TYPE_MEM p.To.Offset = int64(dir) p.Scond |= arm.C_PBIT } } gc.Regfree(&dst) gc.Regfree(&src) gc.Regfree(&tmp) }
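// Illustrative sketch, not part of the compiler: every blockcopy above decides
// between a forward and a backward copy with the same overlap test. A forward
// copy is only unsafe when the destination starts inside the source range, so
// the single comparison below suffices; the helper name mustCopyBackwards is
// hypothetical and shown only to make the condition explicit.
func mustCopyBackwards(osrc, odst, w int64) bool {
	// Overlap that forces a reverse copy: dst begins within [osrc, osrc+w).
	return osrc < odst && odst < osrc+w
}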
func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
	s.SetLineno(v.Line)
	switch v.Op {
	case ssa.OpInitMem:
		// memory arg needs no code
	case ssa.OpArg:
		// input args need no code
	case ssa.OpSP, ssa.OpSB:
		// nothing to do
	case ssa.OpCopy:
	case ssa.OpLoadReg:
		// TODO: by type
		p := gc.Prog(arm.AMOVW)
		n, off := gc.AutoVar(v.Args[0])
		p.From.Type = obj.TYPE_MEM
		p.From.Node = n
		p.From.Sym = gc.Linksym(n.Sym)
		p.From.Offset = off
		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
			p.From.Name = obj.NAME_PARAM
			p.From.Offset += n.Xoffset
		} else {
			p.From.Name = obj.NAME_AUTO
		}
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)
	case ssa.OpStoreReg:
		// TODO: by type
		p := gc.Prog(arm.AMOVW)
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[0])
		n, off := gc.AutoVar(v)
		p.To.Type = obj.TYPE_MEM
		p.To.Node = n
		p.To.Sym = gc.Linksym(n.Sym)
		p.To.Offset = off
		if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT {
			p.To.Name = obj.NAME_PARAM
			p.To.Offset += n.Xoffset
		} else {
			p.To.Name = obj.NAME_AUTO
		}
	case ssa.OpARMADD:
		r := gc.SSARegNum(v)
		r1 := gc.SSARegNum(v.Args[0])
		r2 := gc.SSARegNum(v.Args[1])
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = r1
		p.Reg = r2
		p.To.Type = obj.TYPE_REG
		p.To.Reg = r
	case ssa.OpARMADDconst:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		if v.Aux != nil {
			panic("can't handle symbolic constant yet")
		}
		p.Reg = gc.SSARegNum(v.Args[0])
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)
	case ssa.OpARMMOVWconst:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_CONST
		p.From.Offset = v.AuxInt
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)
	case ssa.OpARMCMP:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		// Special layout in ARM assembly
		// Comparing to x86, the operands of ARM's CMP are reversed.
		p.From.Reg = gc.SSARegNum(v.Args[1])
		p.Reg = gc.SSARegNum(v.Args[0])
	case ssa.OpARMMOVWload:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_MEM
		p.From.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.From, v)
		p.To.Type = obj.TYPE_REG
		p.To.Reg = gc.SSARegNum(v)
	case ssa.OpARMMOVWstore:
		p := gc.Prog(v.Op.Asm())
		p.From.Type = obj.TYPE_REG
		p.From.Reg = gc.SSARegNum(v.Args[1])
		p.To.Type = obj.TYPE_MEM
		p.To.Reg = gc.SSARegNum(v.Args[0])
		gc.AddAux(&p.To, v)
	case ssa.OpARMCALLstatic:
		// TODO: deferreturn
		p := gc.Prog(obj.ACALL)
		p.To.Type = obj.TYPE_MEM
		p.To.Name = obj.NAME_EXTERN
		p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym))
		if gc.Maxarg < v.AuxInt {
			gc.Maxarg = v.AuxInt
		}
	case ssa.OpVarDef:
		gc.Gvardef(v.Aux.(*gc.Node))
	case ssa.OpVarKill:
		gc.Gvarkill(v.Aux.(*gc.Node))
	case ssa.OpVarLive:
		gc.Gvarlive(v.Aux.(*gc.Node))
	case ssa.OpARMLessThan:
		v.Fatalf("pseudo-op made it to output: %s", v.LongString())
	default:
		v.Unimplementedf("genValue not implemented: %s", v.LongString())
	}
}