func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32) *obj.Prog { cnt := hi - lo if cnt == 0 { return p } if *ax == 0 { p = appendpp(p, x86.AMOVL, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) *ax = 1 } if cnt <= int64(4*gc.Widthreg) { for i := int64(0); i < cnt; i += int64(gc.Widthreg) { p = appendpp(p, x86.AMOVL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i) } } else if !gc.Nacl && cnt <= int64(128*gc.Widthreg) { p = appendpp(p, x86.ALEAL, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0) p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, 1*(128-cnt/int64(gc.Widthreg))) p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) } else { p = appendpp(p, x86.AMOVL, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0) p = appendpp(p, x86.ALEAL, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0) p = appendpp(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) p = appendpp(p, x86.ASTOSL, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) } return p }
func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { s.SetLineno(v.Line) switch v.Op { case ssa.OpAMD64ADDQ, ssa.OpAMD64ADDL: r := gc.SSARegNum(v) r1 := gc.SSARegNum(v.Args[0]) r2 := gc.SSARegNum(v.Args[1]) switch { case r == r1: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = r2 p.To.Type = obj.TYPE_REG p.To.Reg = r case r == r2: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = r1 p.To.Type = obj.TYPE_REG p.To.Reg = r default: var asm obj.As if v.Op == ssa.OpAMD64ADDQ { asm = x86.ALEAQ } else { asm = x86.ALEAL } p := gc.Prog(asm) p.From.Type = obj.TYPE_MEM p.From.Reg = r1 p.From.Scale = 1 p.From.Index = r2 p.To.Type = obj.TYPE_REG p.To.Reg = r } // 2-address opcode arithmetic case ssa.OpAMD64SUBQ, ssa.OpAMD64SUBL, ssa.OpAMD64MULQ, ssa.OpAMD64MULL, ssa.OpAMD64ANDQ, ssa.OpAMD64ANDL, ssa.OpAMD64ORQ, ssa.OpAMD64ORL, ssa.OpAMD64XORQ, ssa.OpAMD64XORL, ssa.OpAMD64SHLQ, ssa.OpAMD64SHLL, ssa.OpAMD64SHRQ, ssa.OpAMD64SHRL, ssa.OpAMD64SHRW, ssa.OpAMD64SHRB, ssa.OpAMD64SARQ, ssa.OpAMD64SARL, ssa.OpAMD64SARW, ssa.OpAMD64SARB, ssa.OpAMD64ADDSS, ssa.OpAMD64ADDSD, ssa.OpAMD64SUBSS, ssa.OpAMD64SUBSD, ssa.OpAMD64MULSS, ssa.OpAMD64MULSD, ssa.OpAMD64DIVSS, ssa.OpAMD64DIVSD, ssa.OpAMD64PXOR: r := gc.SSARegNum(v) if r != gc.SSARegNum(v.Args[0]) { v.Fatalf("input[0] and output not in same register %s", v.LongString()) } opregreg(v.Op.Asm(), r, gc.SSARegNum(v.Args[1])) case ssa.OpAMD64DIVQ, ssa.OpAMD64DIVL, ssa.OpAMD64DIVW, ssa.OpAMD64DIVQU, ssa.OpAMD64DIVLU, ssa.OpAMD64DIVWU, ssa.OpAMD64MODQ, ssa.OpAMD64MODL, ssa.OpAMD64MODW, ssa.OpAMD64MODQU, ssa.OpAMD64MODLU, ssa.OpAMD64MODWU: // Arg[0] is already in AX as it's the only register we allow // and AX is the only output x := gc.SSARegNum(v.Args[1]) // CPU faults upon signed overflow, which occurs when most // negative int is divided by -1. 
var j *obj.Prog if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL || v.Op == ssa.OpAMD64DIVW || v.Op == ssa.OpAMD64MODQ || v.Op == ssa.OpAMD64MODL || v.Op == ssa.OpAMD64MODW { var c *obj.Prog switch v.Op { case ssa.OpAMD64DIVQ, ssa.OpAMD64MODQ: c = gc.Prog(x86.ACMPQ) j = gc.Prog(x86.AJEQ) // go ahead and sign extend to save doing it later gc.Prog(x86.ACQO) case ssa.OpAMD64DIVL, ssa.OpAMD64MODL: c = gc.Prog(x86.ACMPL) j = gc.Prog(x86.AJEQ) gc.Prog(x86.ACDQ) case ssa.OpAMD64DIVW, ssa.OpAMD64MODW: c = gc.Prog(x86.ACMPW) j = gc.Prog(x86.AJEQ) gc.Prog(x86.ACWD) } c.From.Type = obj.TYPE_REG c.From.Reg = x c.To.Type = obj.TYPE_CONST c.To.Offset = -1 j.To.Type = obj.TYPE_BRANCH } // for unsigned ints, we sign extend by setting DX = 0 // signed ints were sign extended above if v.Op == ssa.OpAMD64DIVQU || v.Op == ssa.OpAMD64MODQU || v.Op == ssa.OpAMD64DIVLU || v.Op == ssa.OpAMD64MODLU || v.Op == ssa.OpAMD64DIVWU || v.Op == ssa.OpAMD64MODWU { c := gc.Prog(x86.AXORQ) c.From.Type = obj.TYPE_REG c.From.Reg = x86.REG_DX c.To.Type = obj.TYPE_REG c.To.Reg = x86.REG_DX } p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = x // signed division, rest of the check for -1 case if j != nil { j2 := gc.Prog(obj.AJMP) j2.To.Type = obj.TYPE_BRANCH var n *obj.Prog if v.Op == ssa.OpAMD64DIVQ || v.Op == ssa.OpAMD64DIVL || v.Op == ssa.OpAMD64DIVW { // n * -1 = -n n = gc.Prog(x86.ANEGQ) n.To.Type = obj.TYPE_REG n.To.Reg = x86.REG_AX } else { // n % -1 == 0 n = gc.Prog(x86.AXORQ) n.From.Type = obj.TYPE_REG n.From.Reg = x86.REG_DX n.To.Type = obj.TYPE_REG n.To.Reg = x86.REG_DX } j.To.Val = n j2.To.Val = s.Pc() } case ssa.OpAMD64HMULQ, ssa.OpAMD64HMULL, ssa.OpAMD64HMULW, ssa.OpAMD64HMULB, ssa.OpAMD64HMULQU, ssa.OpAMD64HMULLU, ssa.OpAMD64HMULWU, ssa.OpAMD64HMULBU: // the frontend rewrites constant division by 8/16/32 bit integers into // HMUL by a constant // SSA rewrites generate the 64 bit versions // Arg[0] is already in AX as it's the only register we allow // and DX is the only 
output we care about (the high bits) p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[1]) // IMULB puts the high portion in AH instead of DL, // so move it to DL for consistency if v.Type.Size() == 1 { m := gc.Prog(x86.AMOVB) m.From.Type = obj.TYPE_REG m.From.Reg = x86.REG_AH m.To.Type = obj.TYPE_REG m.To.Reg = x86.REG_DX } case ssa.OpAMD64AVGQU: // compute (x+y)/2 unsigned. // Do a 64-bit add, the overflow goes into the carry. // Shift right once and pull the carry back into the 63rd bit. r := gc.SSARegNum(v) if r != gc.SSARegNum(v.Args[0]) { v.Fatalf("input[0] and output not in same register %s", v.LongString()) } p := gc.Prog(x86.AADDQ) p.From.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG p.To.Reg = r p.From.Reg = gc.SSARegNum(v.Args[1]) p = gc.Prog(x86.ARCRQ) p.From.Type = obj.TYPE_CONST p.From.Offset = 1 p.To.Type = obj.TYPE_REG p.To.Reg = r case ssa.OpAMD64ADDQconst, ssa.OpAMD64ADDLconst: r := gc.SSARegNum(v) a := gc.SSARegNum(v.Args[0]) if r == a { if v.AuxInt == 1 { var asm obj.As // Software optimization manual recommends add $1,reg. // But inc/dec is 1 byte smaller. ICC always uses inc // Clang/GCC choose depending on flags, but prefer add. // Experiments show that inc/dec is both a little faster // and make a binary a little smaller. 
if v.Op == ssa.OpAMD64ADDQconst { asm = x86.AINCQ } else { asm = x86.AINCL } p := gc.Prog(asm) p.To.Type = obj.TYPE_REG p.To.Reg = r return } if v.AuxInt == -1 { var asm obj.As if v.Op == ssa.OpAMD64ADDQconst { asm = x86.ADECQ } else { asm = x86.ADECL } p := gc.Prog(asm) p.To.Type = obj.TYPE_REG p.To.Reg = r return } p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = r return } var asm obj.As if v.Op == ssa.OpAMD64ADDQconst { asm = x86.ALEAQ } else { asm = x86.ALEAL } p := gc.Prog(asm) p.From.Type = obj.TYPE_MEM p.From.Reg = a p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = r case ssa.OpAMD64CMOVQEQconst, ssa.OpAMD64CMOVLEQconst, ssa.OpAMD64CMOVWEQconst, ssa.OpAMD64CMOVQNEconst, ssa.OpAMD64CMOVLNEconst, ssa.OpAMD64CMOVWNEconst: r := gc.SSARegNum(v) if r != gc.SSARegNum(v.Args[0]) { v.Fatalf("input[0] and output not in same register %s", v.LongString()) } // Constant into AX p := gc.Prog(moveByType(v.Type)) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = x86.REG_AX p = gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = x86.REG_AX p.To.Type = obj.TYPE_REG p.To.Reg = r case ssa.OpAMD64MULQconst, ssa.OpAMD64MULLconst: r := gc.SSARegNum(v) if r != gc.SSARegNum(v.Args[0]) { v.Fatalf("input[0] and output not in same register %s", v.LongString()) } p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = r // TODO: Teach doasm to compile the three-address multiply imul $c, r1, r2 // then we don't need to use resultInArg0 for these ops. 
//p.From3 = new(obj.Addr) //p.From3.Type = obj.TYPE_REG //p.From3.Reg = gc.SSARegNum(v.Args[0]) case ssa.OpAMD64SUBQconst, ssa.OpAMD64SUBLconst, ssa.OpAMD64ANDQconst, ssa.OpAMD64ANDLconst, ssa.OpAMD64ORQconst, ssa.OpAMD64ORLconst, ssa.OpAMD64XORQconst, ssa.OpAMD64XORLconst, ssa.OpAMD64SHLQconst, ssa.OpAMD64SHLLconst, ssa.OpAMD64SHRQconst, ssa.OpAMD64SHRLconst, ssa.OpAMD64SHRWconst, ssa.OpAMD64SHRBconst, ssa.OpAMD64SARQconst, ssa.OpAMD64SARLconst, ssa.OpAMD64SARWconst, ssa.OpAMD64SARBconst, ssa.OpAMD64ROLQconst, ssa.OpAMD64ROLLconst, ssa.OpAMD64ROLWconst, ssa.OpAMD64ROLBconst: r := gc.SSARegNum(v) if r != gc.SSARegNum(v.Args[0]) { v.Fatalf("input[0] and output not in same register %s", v.LongString()) } p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = r case ssa.OpAMD64SBBQcarrymask, ssa.OpAMD64SBBLcarrymask: r := gc.SSARegNum(v) p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = r p.To.Type = obj.TYPE_REG p.To.Reg = r case ssa.OpAMD64LEAQ1, ssa.OpAMD64LEAQ2, ssa.OpAMD64LEAQ4, ssa.OpAMD64LEAQ8: r := gc.SSARegNum(v.Args[0]) i := gc.SSARegNum(v.Args[1]) p := gc.Prog(x86.ALEAQ) switch v.Op { case ssa.OpAMD64LEAQ1: p.From.Scale = 1 if i == x86.REG_SP { r, i = i, r } case ssa.OpAMD64LEAQ2: p.From.Scale = 2 case ssa.OpAMD64LEAQ4: p.From.Scale = 4 case ssa.OpAMD64LEAQ8: p.From.Scale = 8 } p.From.Type = obj.TYPE_MEM p.From.Reg = r p.From.Index = i gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64LEAQ: p := gc.Prog(x86.ALEAQ) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64CMPQ, ssa.OpAMD64CMPL, ssa.OpAMD64CMPW, ssa.OpAMD64CMPB, ssa.OpAMD64TESTQ, ssa.OpAMD64TESTL, ssa.OpAMD64TESTW, ssa.OpAMD64TESTB: opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[1]), gc.SSARegNum(v.Args[0])) case ssa.OpAMD64UCOMISS, ssa.OpAMD64UCOMISD: // Go assembler has swapped operands 
for UCOMISx relative to CMP, // must account for that right here. opregreg(v.Op.Asm(), gc.SSARegNum(v.Args[0]), gc.SSARegNum(v.Args[1])) case ssa.OpAMD64CMPQconst, ssa.OpAMD64CMPLconst, ssa.OpAMD64CMPWconst, ssa.OpAMD64CMPBconst: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[0]) p.To.Type = obj.TYPE_CONST p.To.Offset = v.AuxInt case ssa.OpAMD64TESTQconst, ssa.OpAMD64TESTLconst, ssa.OpAMD64TESTWconst, ssa.OpAMD64TESTBconst: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v.Args[0]) case ssa.OpAMD64MOVLconst, ssa.OpAMD64MOVQconst: x := gc.SSARegNum(v) p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = x // If flags are live at this instruction, suppress the // MOV $0,AX -> XOR AX,AX optimization. if v.Aux != nil { p.Mark |= x86.PRESERVEFLAGS } case ssa.OpAMD64MOVSSconst, ssa.OpAMD64MOVSDconst: x := gc.SSARegNum(v) p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_FCONST p.From.Val = math.Float64frombits(uint64(v.AuxInt)) p.To.Type = obj.TYPE_REG p.To.Reg = x case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVOload: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64MOVQloadidx8, ssa.OpAMD64MOVSDloadidx8: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.From, v) p.From.Scale = 8 p.From.Index = gc.SSARegNum(v.Args[1]) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64MOVLloadidx4, ssa.OpAMD64MOVSSloadidx4: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.From, v) p.From.Scale = 4 
p.From.Index = gc.SSARegNum(v.Args[1]) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64MOVWloadidx2: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.From, v) p.From.Scale = 2 p.From.Index = gc.SSARegNum(v.Args[1]) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64MOVBloadidx1, ssa.OpAMD64MOVWloadidx1, ssa.OpAMD64MOVLloadidx1, ssa.OpAMD64MOVQloadidx1, ssa.OpAMD64MOVSSloadidx1, ssa.OpAMD64MOVSDloadidx1: r := gc.SSARegNum(v.Args[0]) i := gc.SSARegNum(v.Args[1]) if i == x86.REG_SP { r, i = i, r } p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = r p.From.Scale = 1 p.From.Index = i gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVOstore: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[1]) p.To.Type = obj.TYPE_MEM p.To.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.To, v) case ssa.OpAMD64MOVQstoreidx8, ssa.OpAMD64MOVSDstoreidx8: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[2]) p.To.Type = obj.TYPE_MEM p.To.Reg = gc.SSARegNum(v.Args[0]) p.To.Scale = 8 p.To.Index = gc.SSARegNum(v.Args[1]) gc.AddAux(&p.To, v) case ssa.OpAMD64MOVSSstoreidx4, ssa.OpAMD64MOVLstoreidx4: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[2]) p.To.Type = obj.TYPE_MEM p.To.Reg = gc.SSARegNum(v.Args[0]) p.To.Scale = 4 p.To.Index = gc.SSARegNum(v.Args[1]) gc.AddAux(&p.To, v) case ssa.OpAMD64MOVWstoreidx2: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[2]) p.To.Type = obj.TYPE_MEM p.To.Reg = gc.SSARegNum(v.Args[0]) p.To.Scale = 2 p.To.Index = gc.SSARegNum(v.Args[1]) gc.AddAux(&p.To, v) case ssa.OpAMD64MOVBstoreidx1, ssa.OpAMD64MOVWstoreidx1, ssa.OpAMD64MOVLstoreidx1, 
ssa.OpAMD64MOVQstoreidx1, ssa.OpAMD64MOVSSstoreidx1, ssa.OpAMD64MOVSDstoreidx1: r := gc.SSARegNum(v.Args[0]) i := gc.SSARegNum(v.Args[1]) if i == x86.REG_SP { r, i = i, r } p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[2]) p.To.Type = obj.TYPE_MEM p.To.Reg = r p.To.Scale = 1 p.To.Index = i gc.AddAux(&p.To, v) case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST sc := v.AuxValAndOff() p.From.Offset = sc.Val() p.To.Type = obj.TYPE_MEM p.To.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux2(&p.To, v, sc.Off()) case ssa.OpAMD64MOVQstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx8, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx4, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx2, ssa.OpAMD64MOVBstoreconstidx1: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST sc := v.AuxValAndOff() p.From.Offset = sc.Val() r := gc.SSARegNum(v.Args[0]) i := gc.SSARegNum(v.Args[1]) switch v.Op { case ssa.OpAMD64MOVBstoreconstidx1, ssa.OpAMD64MOVWstoreconstidx1, ssa.OpAMD64MOVLstoreconstidx1, ssa.OpAMD64MOVQstoreconstidx1: p.To.Scale = 1 if i == x86.REG_SP { r, i = i, r } case ssa.OpAMD64MOVWstoreconstidx2: p.To.Scale = 2 case ssa.OpAMD64MOVLstoreconstidx4: p.To.Scale = 4 case ssa.OpAMD64MOVQstoreconstidx8: p.To.Scale = 8 } p.To.Type = obj.TYPE_MEM p.To.Reg = r p.To.Index = i gc.AddAux2(&p.To, v, sc.Off()) case ssa.OpAMD64MOVLQSX, ssa.OpAMD64MOVWQSX, ssa.OpAMD64MOVBQSX, ssa.OpAMD64MOVLQZX, ssa.OpAMD64MOVWQZX, ssa.OpAMD64MOVBQZX, ssa.OpAMD64CVTSL2SS, ssa.OpAMD64CVTSL2SD, ssa.OpAMD64CVTSQ2SS, ssa.OpAMD64CVTSQ2SD, ssa.OpAMD64CVTTSS2SL, ssa.OpAMD64CVTTSD2SL, ssa.OpAMD64CVTTSS2SQ, ssa.OpAMD64CVTTSD2SQ, ssa.OpAMD64CVTSS2SD, ssa.OpAMD64CVTSD2SS: opregreg(v.Op.Asm(), gc.SSARegNum(v), gc.SSARegNum(v.Args[0])) case ssa.OpAMD64DUFFZERO: off := duffStart(v.AuxInt) adj := duffAdj(v.AuxInt) var p *obj.Prog if adj != 0 { p = 
gc.Prog(x86.AADDQ) p.From.Type = obj.TYPE_CONST p.From.Offset = adj p.To.Type = obj.TYPE_REG p.To.Reg = x86.REG_DI } p = gc.Prog(obj.ADUFFZERO) p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) p.To.Offset = off case ssa.OpAMD64MOVOconst: if v.AuxInt != 0 { v.Unimplementedf("MOVOconst can only do constant=0") } r := gc.SSARegNum(v) opregreg(x86.AXORPS, r, r) case ssa.OpAMD64DUFFCOPY: p := gc.Prog(obj.ADUFFCOPY) p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg)) p.To.Offset = v.AuxInt case ssa.OpCopy, ssa.OpAMD64MOVQconvert: // TODO: use MOVQreg for reg->reg copies instead of OpCopy? if v.Type.IsMemory() { return } x := gc.SSARegNum(v.Args[0]) y := gc.SSARegNum(v) if x != y { opregreg(moveByType(v.Type), y, x) } case ssa.OpLoadReg: if v.Type.IsFlags() { v.Unimplementedf("load flags not implemented: %v", v.LongString()) return } p := gc.Prog(loadByType(v.Type)) n, off := gc.AutoVar(v.Args[0]) p.From.Type = obj.TYPE_MEM p.From.Node = n p.From.Sym = gc.Linksym(n.Sym) p.From.Offset = off if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT { p.From.Name = obj.NAME_PARAM p.From.Offset += n.Xoffset } else { p.From.Name = obj.NAME_AUTO } p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpStoreReg: if v.Type.IsFlags() { v.Unimplementedf("store flags not implemented: %v", v.LongString()) return } p := gc.Prog(storeByType(v.Type)) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[0]) n, off := gc.AutoVar(v) p.To.Type = obj.TYPE_MEM p.To.Node = n p.To.Sym = gc.Linksym(n.Sym) p.To.Offset = off if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT { p.To.Name = obj.NAME_PARAM p.To.Offset += n.Xoffset } else { p.To.Name = obj.NAME_AUTO } case ssa.OpPhi: // just check to make sure regalloc and stackalloc did it right if v.Type.IsMemory() { return } f := v.Block.Func loc := f.RegAlloc[v.ID] for _, a := range v.Args { if aloc := f.RegAlloc[a.ID]; aloc != loc { // TODO: .Equal() instead? 
v.Fatalf("phi arg at different location than phi: %v @ %v, but arg %v @ %v\n%s\n", v, loc, a, aloc, v.Block.Func) } } case ssa.OpInitMem: // memory arg needs no code case ssa.OpArg: // input args need no code case ssa.OpAMD64LoweredGetClosurePtr: // Output is hardwired to DX only, // and DX contains the closure pointer on // closure entry, and this "instruction" // is scheduled to the very beginning // of the entry block. case ssa.OpAMD64LoweredGetG: r := gc.SSARegNum(v) // See the comments in cmd/avail/obj/x86/obj6.go // near CanUse1InsnTLS for a detailed explanation of these instructions. if x86.CanUse1InsnTLS(gc.Ctxt) { // MOVQ (TLS), r p := gc.Prog(x86.AMOVQ) p.From.Type = obj.TYPE_MEM p.From.Reg = x86.REG_TLS p.To.Type = obj.TYPE_REG p.To.Reg = r } else { // MOVQ TLS, r // MOVQ (r)(TLS*1), r p := gc.Prog(x86.AMOVQ) p.From.Type = obj.TYPE_REG p.From.Reg = x86.REG_TLS p.To.Type = obj.TYPE_REG p.To.Reg = r q := gc.Prog(x86.AMOVQ) q.From.Type = obj.TYPE_MEM q.From.Reg = r q.From.Index = x86.REG_TLS q.From.Scale = 1 q.To.Type = obj.TYPE_REG q.To.Reg = r } case ssa.OpAMD64CALLstatic: if v.Aux.(*gc.Sym) == gc.Deferreturn.Sym { // Deferred calls will appear to be returning to // the CALL deferreturn(SB) that we are about to emit. // However, the stack trace code will show the line // of the instruction byte before the return PC. // To avoid that being an unrelated instruction, // insert an actual hardware NOP that will have the right line number. // This is different from obj.ANOP, which is a virtual no-op // that doesn't make it into the instruction stream. 
ginsnop() } p := gc.Prog(obj.ACALL) p.To.Type = obj.TYPE_MEM p.To.Name = obj.NAME_EXTERN p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym)) if gc.Maxarg < v.AuxInt { gc.Maxarg = v.AuxInt } case ssa.OpAMD64CALLclosure: p := gc.Prog(obj.ACALL) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v.Args[0]) if gc.Maxarg < v.AuxInt { gc.Maxarg = v.AuxInt } case ssa.OpAMD64CALLdefer: p := gc.Prog(obj.ACALL) p.To.Type = obj.TYPE_MEM p.To.Name = obj.NAME_EXTERN p.To.Sym = gc.Linksym(gc.Deferproc.Sym) if gc.Maxarg < v.AuxInt { gc.Maxarg = v.AuxInt } case ssa.OpAMD64CALLgo: p := gc.Prog(obj.ACALL) p.To.Type = obj.TYPE_MEM p.To.Name = obj.NAME_EXTERN p.To.Sym = gc.Linksym(gc.Newproc.Sym) if gc.Maxarg < v.AuxInt { gc.Maxarg = v.AuxInt } case ssa.OpAMD64CALLinter: p := gc.Prog(obj.ACALL) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v.Args[0]) if gc.Maxarg < v.AuxInt { gc.Maxarg = v.AuxInt } case ssa.OpAMD64NEGQ, ssa.OpAMD64NEGL, ssa.OpAMD64BSWAPQ, ssa.OpAMD64BSWAPL, ssa.OpAMD64NOTQ, ssa.OpAMD64NOTL: r := gc.SSARegNum(v) if r != gc.SSARegNum(v.Args[0]) { v.Fatalf("input[0] and output not in same register %s", v.LongString()) } p := gc.Prog(v.Op.Asm()) p.To.Type = obj.TYPE_REG p.To.Reg = r case ssa.OpAMD64BSFQ, ssa.OpAMD64BSFL, ssa.OpAMD64BSFW, ssa.OpAMD64BSRQ, ssa.OpAMD64BSRL, ssa.OpAMD64BSRW, ssa.OpAMD64SQRTSD: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[0]) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpSP, ssa.OpSB: // nothing to do case ssa.OpAMD64SETEQ, ssa.OpAMD64SETNE, ssa.OpAMD64SETL, ssa.OpAMD64SETLE, ssa.OpAMD64SETG, ssa.OpAMD64SETGE, ssa.OpAMD64SETGF, ssa.OpAMD64SETGEF, ssa.OpAMD64SETB, ssa.OpAMD64SETBE, ssa.OpAMD64SETORD, ssa.OpAMD64SETNAN, ssa.OpAMD64SETA, ssa.OpAMD64SETAE: p := gc.Prog(v.Op.Asm()) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpAMD64SETNEF: p := gc.Prog(v.Op.Asm()) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) q := gc.Prog(x86.ASETPS) q.To.Type = obj.TYPE_REG q.To.Reg = x86.REG_AX 
// ORL avoids partial register write and is smaller than ORQ, used by old compiler opregreg(x86.AORL, gc.SSARegNum(v), x86.REG_AX) case ssa.OpAMD64SETEQF: p := gc.Prog(v.Op.Asm()) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) q := gc.Prog(x86.ASETPC) q.To.Type = obj.TYPE_REG q.To.Reg = x86.REG_AX // ANDL avoids partial register write and is smaller than ANDQ, used by old compiler opregreg(x86.AANDL, gc.SSARegNum(v), x86.REG_AX) case ssa.OpAMD64InvertFlags: v.Fatalf("InvertFlags should never make it to codegen %v", v.LongString()) case ssa.OpAMD64FlagEQ, ssa.OpAMD64FlagLT_ULT, ssa.OpAMD64FlagLT_UGT, ssa.OpAMD64FlagGT_ULT, ssa.OpAMD64FlagGT_UGT: v.Fatalf("Flag* ops should never make it to codegen %v", v.LongString()) case ssa.OpAMD64REPSTOSQ: gc.Prog(x86.AREP) gc.Prog(x86.ASTOSQ) case ssa.OpAMD64REPMOVSQ: gc.Prog(x86.AREP) gc.Prog(x86.AMOVSQ) case ssa.OpVarDef: gc.Gvardef(v.Aux.(*gc.Node)) case ssa.OpVarKill: gc.Gvarkill(v.Aux.(*gc.Node)) case ssa.OpVarLive: gc.Gvarlive(v.Aux.(*gc.Node)) case ssa.OpKeepAlive: if !v.Args[0].Type.IsPtrShaped() { v.Fatalf("keeping non-pointer alive %v", v.Args[0]) } n, off := gc.AutoVar(v.Args[0]) if n == nil { v.Fatalf("KeepLive with non-spilled value %s %s", v, v.Args[0]) } if off != 0 { v.Fatalf("KeepLive with non-zero offset spill location %s:%d", n, off) } gc.Gvarlive(n) case ssa.OpAMD64LoweredNilCheck: // Optimization - if the subsequent block has a load or store // at the same address, we don't need to issue this instruction. mem := v.Args[1] for _, w := range v.Block.Succs[0].Block().Values { if w.Op == ssa.OpPhi { if w.Type.IsMemory() { mem = w } continue } if len(w.Args) == 0 || !w.Args[len(w.Args)-1].Type.IsMemory() { // w doesn't use a store - can't be a memory op. 
continue } if w.Args[len(w.Args)-1] != mem { v.Fatalf("wrong store after nilcheck v=%s w=%s", v, w) } switch w.Op { case ssa.OpAMD64MOVQload, ssa.OpAMD64MOVLload, ssa.OpAMD64MOVWload, ssa.OpAMD64MOVBload, ssa.OpAMD64MOVQstore, ssa.OpAMD64MOVLstore, ssa.OpAMD64MOVWstore, ssa.OpAMD64MOVBstore, ssa.OpAMD64MOVBQSXload, ssa.OpAMD64MOVWQSXload, ssa.OpAMD64MOVLQSXload, ssa.OpAMD64MOVSSload, ssa.OpAMD64MOVSDload, ssa.OpAMD64MOVOload, ssa.OpAMD64MOVSSstore, ssa.OpAMD64MOVSDstore, ssa.OpAMD64MOVOstore: if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage { if gc.Debug_checknil != 0 && int(v.Line) > 1 { gc.Warnl(v.Line, "removed nil check") } return } case ssa.OpAMD64MOVQstoreconst, ssa.OpAMD64MOVLstoreconst, ssa.OpAMD64MOVWstoreconst, ssa.OpAMD64MOVBstoreconst: off := ssa.ValAndOff(v.AuxInt).Off() if w.Args[0] == v.Args[0] && w.Aux == nil && off >= 0 && off < minZeroPage { if gc.Debug_checknil != 0 && int(v.Line) > 1 { gc.Warnl(v.Line, "removed nil check") } return } } if w.Type.IsMemory() { if w.Op == ssa.OpVarDef || w.Op == ssa.OpVarKill || w.Op == ssa.OpVarLive { // these ops are OK mem = w continue } // We can't delay the nil check past the next store. break } } // Issue a load which will fault if the input is nil. // TODO: We currently use the 2-byte instruction TESTB AX, (reg). // Should we use the 3-byte TESTB $0, (reg) instead? It is larger // but it doesn't have false dependency on AX. // Or maybe allocate an output register and use MOVL (reg),reg2 ? // That trades clobbering flags for clobbering a register. p := gc.Prog(x86.ATESTB) p.From.Type = obj.TYPE_REG p.From.Reg = x86.REG_AX p.To.Type = obj.TYPE_MEM p.To.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.To, v) if gc.Debug_checknil != 0 && v.Line > 1 { // v.Line==1 in generated wrappers gc.Warnl(v.Line, "generated nil check") } default: v.Unimplementedf("genValue not implemented: %s", v.LongString()) } }
func ssaGenBlock(s *gc.SSAGenState, b, next *ssa.Block) { s.SetLineno(b.Line) switch b.Kind { case ssa.BlockPlain, ssa.BlockCall, ssa.BlockCheck: if b.Succs[0].Block() != next { p := gc.Prog(obj.AJMP) p.To.Type = obj.TYPE_BRANCH s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()}) } case ssa.BlockDefer: // defer returns in rax: // 0 if we should continue executing // 1 if we should jump to deferreturn call p := gc.Prog(x86.ATESTL) p.From.Type = obj.TYPE_REG p.From.Reg = x86.REG_AX p.To.Type = obj.TYPE_REG p.To.Reg = x86.REG_AX p = gc.Prog(x86.AJNE) p.To.Type = obj.TYPE_BRANCH s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()}) if b.Succs[0].Block() != next { p := gc.Prog(obj.AJMP) p.To.Type = obj.TYPE_BRANCH s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()}) } case ssa.BlockExit: gc.Prog(obj.AUNDEF) // tell plive.go that we never reach here case ssa.BlockRet: gc.Prog(obj.ARET) case ssa.BlockRetJmp: p := gc.Prog(obj.AJMP) p.To.Type = obj.TYPE_MEM p.To.Name = obj.NAME_EXTERN p.To.Sym = gc.Linksym(b.Aux.(*gc.Sym)) case ssa.BlockAMD64EQF: gc.SSAGenFPJump(s, b, next, &eqfJumps) case ssa.BlockAMD64NEF: gc.SSAGenFPJump(s, b, next, &nefJumps) case ssa.BlockAMD64EQ, ssa.BlockAMD64NE, ssa.BlockAMD64LT, ssa.BlockAMD64GE, ssa.BlockAMD64LE, ssa.BlockAMD64GT, ssa.BlockAMD64ULT, ssa.BlockAMD64UGT, ssa.BlockAMD64ULE, ssa.BlockAMD64UGE: jmp := blockJump[b.Kind] likely := b.Likely var p *obj.Prog switch next { case b.Succs[0].Block(): p = gc.Prog(jmp.invasm) likely *= -1 p.To.Type = obj.TYPE_BRANCH s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[1].Block()}) case b.Succs[1].Block(): p = gc.Prog(jmp.asm) p.To.Type = obj.TYPE_BRANCH s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()}) default: p = gc.Prog(jmp.asm) p.To.Type = obj.TYPE_BRANCH s.Branches = append(s.Branches, gc.Branch{P: p, B: b.Succs[0].Block()}) q := gc.Prog(obj.AJMP) q.To.Type = obj.TYPE_BRANCH s.Branches = append(s.Branches, 
gc.Branch{P: q, B: b.Succs[1].Block()}) } // liblink reorders the instruction stream as it sees fit. // Pass along what we know so liblink can make use of it. // TODO: Once we've fully switched to SSA, // make liblink leave our output alone. switch likely { case ssa.BranchUnlikely: p.From.Type = obj.TYPE_CONST p.From.Offset = 0 case ssa.BranchLikely: p.From.Type = obj.TYPE_CONST p.From.Offset = 1 } default: b.Unimplementedf("branch not implemented: %s. Control: %s", b.LongString(), b.Control.LongString()) } }
// blockcopy emits code that copies w bytes from the object addressed by n to
// the object addressed by ns, using SI as the source pointer and DI as the
// destination pointer. osrc and odst are the offsets of source and
// destination; they are only used here to detect overlap and pick the copy
// direction.
func blockcopy(n, ns *gc.Node, osrc, odst, w int64) {
	var noddi gc.Node
	gc.Nodreg(&noddi, gc.Types[gc.Tptr], x86.REG_DI)
	var nodsi gc.Node
	gc.Nodreg(&nodsi, gc.Types[gc.Tptr], x86.REG_SI)

	// Generate both addresses, evaluating the operand with the larger
	// Ullman number first. Gvardef(ns) is emitted just before the
	// destination address is generated when ns is a named variable.
	var nodl gc.Node
	var nodr gc.Node
	if n.Ullman >= ns.Ullman {
		gc.Agenr(n, &nodr, &nodsi)
		if ns.Op == gc.ONAME {
			gc.Gvardef(ns)
		}
		gc.Agenr(ns, &nodl, &noddi)
	} else {
		if ns.Op == gc.ONAME {
			gc.Gvardef(ns)
		}
		gc.Agenr(ns, &nodl, &noddi)
		gc.Agenr(n, &nodr, &nodsi)
	}

	// Move the addresses into DI/SI if Agenr placed them elsewhere.
	if nodl.Reg != x86.REG_DI {
		gmove(&nodl, &noddi)
	}
	if nodr.Reg != x86.REG_SI {
		gmove(&nodr, &nodsi)
	}
	gc.Regfree(&nodl)
	gc.Regfree(&nodr)

	c := w % 8 // bytes
	q := w / 8 // quads

	// CX is used as the REP count and as a scratch register below;
	// save it here and restore it at the end.
	var oldcx gc.Node
	var cx gc.Node
	savex(x86.REG_CX, &cx, &oldcx, nil, gc.Types[gc.TINT64])

	// if we are copying forward on the stack and
	// the src and dst overlap, then reverse direction
	if osrc < odst && odst < osrc+w {
		// reverse direction
		gins(x86.ASTD, nil, nil) // set direction flag
		if c > 0 {
			// Point SI/DI at the last byte and copy the c trailing bytes.
			gconreg(addptr, w-1, x86.REG_SI)
			gconreg(addptr, w-1, x86.REG_DI)

			gconreg(movptr, c, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSB, nil, nil) // MOVB *(SI)-,*(DI)-
		}

		if q > 0 {
			if c > 0 {
				// Step back from byte granularity to the start of the
				// preceding quad.
				gconreg(addptr, -7, x86.REG_SI)
				gconreg(addptr, -7, x86.REG_DI)
			} else {
				// No byte tail was copied: point at the last quad.
				gconreg(addptr, w-8, x86.REG_SI)
				gconreg(addptr, w-8, x86.REG_DI)
			}

			gconreg(movptr, q, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)-,*(DI)-
		}

		// we leave with the flag clear
		gins(x86.ACLD, nil, nil)
	} else {
		// normal direction
		if q > 128 || (gc.Nacl && q >= 4) || (obj.Getgoos() == "plan9" && q >= 4) {
			// Large copies, and any copy of 4+ quads on NaCl or Plan 9,
			// use REP MOVSQ.
			gconreg(movptr, q, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+
		} else if q >= 4 {
			// Medium copies jump into runtime duffcopy; X0 is saved and
			// restored around the call.
			var oldx0 gc.Node
			var x0 gc.Node
			savex(x86.REG_X0, &x0, &oldx0, nil, gc.Types[gc.TFLOAT64])

			p := gins(obj.ADUFFCOPY, nil, nil)
			p.To.Type = obj.TYPE_ADDR
			p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))

			// 64 blocks taking 14 bytes each
			// see ../../../../runtime/mkduff.go
			p.To.Offset = 14 * (64 - q/2)
			restx(&x0, &oldx0)

			// The duffcopy entry offset is computed from q/2, so an odd
			// quad is copied separately.
			if q%2 != 0 {
				gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+
			}
		} else if !gc.Nacl && c == 0 {
			// We don't need the MOVSQ side-effect of updating SI and DI,
			// and issuing a sequence of MOVQs directly is faster.
			nodsi.Op = gc.OINDREG

			noddi.Op = gc.OINDREG
			for q > 0 {
				gmove(&nodsi, &cx) // MOVQ x+(SI),CX
				gmove(&cx, &noddi) // MOVQ CX,x+(DI)
				nodsi.Xoffset += 8
				noddi.Xoffset += 8
				q--
			}
		} else {
			for q > 0 {
				gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+
				q--
			}
		}

		// copy the remaining c bytes
		if w < 4 || c <= 1 || (odst < osrc && osrc < odst+w) {
			// Tiny or backward-overlapping copies go byte by byte.
			for c > 0 {
				gins(x86.AMOVSB, nil, nil) // MOVB *(SI)+,*(DI)+
				c--
			}
		} else if w < 8 || c <= 4 {
			// Use 32-bit moves through CX. The final move is placed at
			// offset c-4 from SI/DI; when c < 4 that offset is negative.
			// NOTE(review): presumably SI/DI point at the start of the
			// tail here, so the move re-copies bytes already written — confirm.
			nodsi.Op = gc.OINDREG
			noddi.Op = gc.OINDREG
			cx.Type = gc.Types[gc.TINT32]
			nodsi.Type = gc.Types[gc.TINT32]
			noddi.Type = gc.Types[gc.TINT32]
			if c > 4 {
				nodsi.Xoffset = 0
				noddi.Xoffset = 0
				gmove(&nodsi, &cx)
				gmove(&cx, &noddi)
			}

			nodsi.Xoffset = c - 4
			noddi.Xoffset = c - 4
			gmove(&nodsi, &cx)
			gmove(&cx, &noddi)
		} else {
			// One 64-bit move through CX at offset c-8 from SI/DI (always
			// negative, since c < 8).
			nodsi.Op = gc.OINDREG
			noddi.Op = gc.OINDREG
			cx.Type = gc.Types[gc.TINT64]
			nodsi.Type = gc.Types[gc.TINT64]
			noddi.Type = gc.Types[gc.TINT64]
			nodsi.Xoffset = c - 8
			noddi.Xoffset = c - 8
			gmove(&nodsi, &cx)
			gmove(&cx, &noddi)
		}
	}

	restx(&cx, &oldcx)
}
func clearfat(nl *gc.Node) { /* clear a fat object */ if gc.Debug['g'] != 0 { gc.Dump("\nclearfat", nl) } w := uint32(nl.Type.Width) // Avoid taking the address for simple enough types. if gc.Componentgen(nil, nl) { return } c := w % 4 // bytes q := w / 4 // quads if q < 4 { // Write sequence of MOV 0, off(base) instead of using STOSL. // The hope is that although the code will be slightly longer, // the MOVs will have no dependencies and pipeline better // than the unrolled STOSL loop. // NOTE: Must use agen, not igen, so that optimizer sees address // being taken. We are not writing on field boundaries. var n1 gc.Node gc.Regalloc(&n1, gc.Types[gc.Tptr], nil) gc.Agen(nl, &n1) n1.Op = gc.OINDREG var z gc.Node gc.Nodconst(&z, gc.Types[gc.TUINT64], 0) for ; q > 0; q-- { n1.Type = z.Type gins(x86.AMOVL, &z, &n1) n1.Xoffset += 4 } gc.Nodconst(&z, gc.Types[gc.TUINT8], 0) for ; c > 0; c-- { n1.Type = z.Type gins(x86.AMOVB, &z, &n1) n1.Xoffset++ } gc.Regfree(&n1) return } var n1 gc.Node gc.Nodreg(&n1, gc.Types[gc.Tptr], x86.REG_DI) gc.Agen(nl, &n1) gconreg(x86.AMOVL, 0, x86.REG_AX) if q > 128 || (q >= 4 && gc.Nacl) { gconreg(x86.AMOVL, int64(q), x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.ASTOSL, nil, nil) // STOL AL,*(DI)+ } else if q >= 4 { p := gins(obj.ADUFFZERO, nil, nil) p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) // 1 and 128 = magic constants: see ../../runtime/asm_386.s p.To.Offset = 1 * (128 - int64(q)) } else { for q > 0 { gins(x86.ASTOSL, nil, nil) // STOL AL,*(DI)+ q-- } } for c > 0 { gins(x86.ASTOSB, nil, nil) // STOB AL,*(DI)+ c-- } }
func blockcopy(n, res *gc.Node, osrc, odst, w int64) { var dst gc.Node gc.Nodreg(&dst, gc.Types[gc.Tptr], x86.REG_DI) var src gc.Node gc.Nodreg(&src, gc.Types[gc.Tptr], x86.REG_SI) var tsrc gc.Node gc.Tempname(&tsrc, gc.Types[gc.Tptr]) var tdst gc.Node gc.Tempname(&tdst, gc.Types[gc.Tptr]) if !n.Addable { gc.Agen(n, &tsrc) } if !res.Addable { gc.Agen(res, &tdst) } if n.Addable { gc.Agen(n, &src) } else { gmove(&tsrc, &src) } if res.Op == gc.ONAME { gc.Gvardef(res) } if res.Addable { gc.Agen(res, &dst) } else { gmove(&tdst, &dst) } c := int32(w % 4) // bytes q := int32(w / 4) // doublewords // if we are copying forward on the stack and // the src and dst overlap, then reverse direction if osrc < odst && odst < osrc+w { // reverse direction gins(x86.ASTD, nil, nil) // set direction flag if c > 0 { gconreg(x86.AADDL, w-1, x86.REG_SI) gconreg(x86.AADDL, w-1, x86.REG_DI) gconreg(x86.AMOVL, int64(c), x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSB, nil, nil) // MOVB *(SI)-,*(DI)- } if q > 0 { if c > 0 { gconreg(x86.AADDL, -3, x86.REG_SI) gconreg(x86.AADDL, -3, x86.REG_DI) } else { gconreg(x86.AADDL, w-4, x86.REG_SI) gconreg(x86.AADDL, w-4, x86.REG_DI) } gconreg(x86.AMOVL, int64(q), x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSL, nil, nil) // MOVL *(SI)-,*(DI)- } // we leave with the flag clear gins(x86.ACLD, nil, nil) } else { gins(x86.ACLD, nil, nil) // paranoia. TODO(rsc): remove? 
// normal direction if q > 128 || (q >= 4 && gc.Nacl) { gconreg(x86.AMOVL, int64(q), x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.AMOVSL, nil, nil) // MOVL *(SI)+,*(DI)+ } else if q >= 4 { p := gins(obj.ADUFFCOPY, nil, nil) p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg)) // 10 and 128 = magic constants: see ../../runtime/asm_386.s p.To.Offset = 10 * (128 - int64(q)) } else if !gc.Nacl && c == 0 { var cx gc.Node gc.Nodreg(&cx, gc.Types[gc.TINT32], x86.REG_CX) // We don't need the MOVSL side-effect of updating SI and DI, // and issuing a sequence of MOVLs directly is faster. src.Op = gc.OINDREG dst.Op = gc.OINDREG for q > 0 { gmove(&src, &cx) // MOVL x+(SI),CX gmove(&cx, &dst) // MOVL CX,x+(DI) src.Xoffset += 4 dst.Xoffset += 4 q-- } } else { for q > 0 { gins(x86.AMOVSL, nil, nil) // MOVL *(SI)+,*(DI)+ q-- } } for c > 0 { gins(x86.AMOVSB, nil, nil) // MOVB *(SI)+,*(DI)+ c-- } } }
func clearfat(nl *gc.Node) { /* clear a fat object */ if gc.Debug['g'] != 0 { gc.Dump("\nclearfat", nl) } // Avoid taking the address for simple enough types. if gc.Componentgen(nil, nl) { return } w := nl.Type.Width if w > 1024 || (w >= 64 && (gc.Nacl || isPlan9)) { var oldn1 gc.Node var n1 gc.Node savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr]) gc.Agen(nl, &n1) var ax gc.Node var oldax gc.Node savex(x86.REG_AX, &ax, &oldax, nil, gc.Types[gc.Tptr]) gconreg(x86.AMOVL, 0, x86.REG_AX) gconreg(movptr, w/8, x86.REG_CX) gins(x86.AREP, nil, nil) // repeat gins(x86.ASTOSQ, nil, nil) // STOQ AL,*(DI)+ if w%8 != 0 { n1.Op = gc.OINDREG clearfat_tail(&n1, w%8) } restx(&n1, &oldn1) restx(&ax, &oldax) return } if w >= 64 { var oldn1 gc.Node var n1 gc.Node savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr]) gc.Agen(nl, &n1) var vec_zero gc.Node var old_x0 gc.Node savex(x86.REG_X0, &vec_zero, &old_x0, nil, gc.Types[gc.TFLOAT64]) gins(x86.AXORPS, &vec_zero, &vec_zero) if di := dzDI(w); di != 0 { gconreg(addptr, di, x86.REG_DI) } p := gins(obj.ADUFFZERO, nil, nil) p.To.Type = obj.TYPE_ADDR p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) p.To.Offset = dzOff(w) if w%16 != 0 { n1.Op = gc.OINDREG n1.Xoffset -= 16 - w%16 gins(x86.AMOVUPS, &vec_zero, &n1) } restx(&vec_zero, &old_x0) restx(&n1, &oldn1) return } // NOTE: Must use agen, not igen, so that optimizer sees address // being taken. We are not writing on field boundaries. var n1 gc.Node gc.Agenr(nl, &n1, nil) n1.Op = gc.OINDREG clearfat_tail(&n1, w) gc.Regfree(&n1) }
func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32, x0 *uint32) *obj.Prog { cnt := hi - lo if cnt == 0 { return p } if cnt%int64(gc.Widthreg) != 0 { // should only happen with nacl if cnt%int64(gc.Widthptr) != 0 { gc.Fatalf("zerorange count not a multiple of widthptr %d", cnt) } if *ax == 0 { p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) *ax = 1 } p = appendpp(p, x86.AMOVL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo) lo += int64(gc.Widthptr) cnt -= int64(gc.Widthptr) } if cnt == 8 { if *ax == 0 { p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) *ax = 1 } p = appendpp(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo) } else if !isPlan9 && cnt <= int64(8*gc.Widthreg) { if *x0 == 0 { p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0) *x0 = 1 } for i := int64(0); i < cnt/16; i++ { p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i*16) } if cnt%16 != 0 { p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+cnt-int64(16)) } } else if !gc.Nacl && !isPlan9 && (cnt <= int64(128*gc.Widthreg)) { if *x0 == 0 { p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0) *x0 = 1 } p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0) p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt)) p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) if cnt%16 != 0 { p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8)) } } else { if *ax == 0 { p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0) *ax = 1 } p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0) p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo, 
obj.TYPE_REG, x86.REG_DI, 0) p = appendpp(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) p = appendpp(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0) } return p }
func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { s.SetLineno(v.Line) switch v.Op { case ssa.OpInitMem: // memory arg needs no code case ssa.OpArg: // input args need no code case ssa.OpSP, ssa.OpSB: // nothing to do case ssa.OpCopy: case ssa.OpLoadReg: // TODO: by type p := gc.Prog(arm.AMOVW) n, off := gc.AutoVar(v.Args[0]) p.From.Type = obj.TYPE_MEM p.From.Node = n p.From.Sym = gc.Linksym(n.Sym) p.From.Offset = off if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT { p.From.Name = obj.NAME_PARAM p.From.Offset += n.Xoffset } else { p.From.Name = obj.NAME_AUTO } p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpStoreReg: // TODO: by type p := gc.Prog(arm.AMOVW) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[0]) n, off := gc.AutoVar(v) p.To.Type = obj.TYPE_MEM p.To.Node = n p.To.Sym = gc.Linksym(n.Sym) p.To.Offset = off if n.Class == gc.PPARAM || n.Class == gc.PPARAMOUT { p.To.Name = obj.NAME_PARAM p.To.Offset += n.Xoffset } else { p.To.Name = obj.NAME_AUTO } case ssa.OpARMADD: r := gc.SSARegNum(v) r1 := gc.SSARegNum(v.Args[0]) r2 := gc.SSARegNum(v.Args[1]) p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = r1 p.Reg = r2 p.To.Type = obj.TYPE_REG p.To.Reg = r case ssa.OpARMADDconst: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt if v.Aux != nil { panic("can't handle symbolic constant yet") } p.Reg = gc.SSARegNum(v.Args[0]) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpARMMOVWconst: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_CONST p.From.Offset = v.AuxInt p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpARMCMP: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG // Special layout in ARM assembly // Comparing to x86, the operands of ARM's CMP are reversed. 
p.From.Reg = gc.SSARegNum(v.Args[1]) p.Reg = gc.SSARegNum(v.Args[0]) case ssa.OpARMMOVWload: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_MEM p.From.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.From, v) p.To.Type = obj.TYPE_REG p.To.Reg = gc.SSARegNum(v) case ssa.OpARMMOVWstore: p := gc.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG p.From.Reg = gc.SSARegNum(v.Args[1]) p.To.Type = obj.TYPE_MEM p.To.Reg = gc.SSARegNum(v.Args[0]) gc.AddAux(&p.To, v) case ssa.OpARMCALLstatic: // TODO: deferreturn p := gc.Prog(obj.ACALL) p.To.Type = obj.TYPE_MEM p.To.Name = obj.NAME_EXTERN p.To.Sym = gc.Linksym(v.Aux.(*gc.Sym)) if gc.Maxarg < v.AuxInt { gc.Maxarg = v.AuxInt } case ssa.OpVarDef: gc.Gvardef(v.Aux.(*gc.Node)) case ssa.OpVarKill: gc.Gvarkill(v.Aux.(*gc.Node)) case ssa.OpVarLive: gc.Gvarlive(v.Aux.(*gc.Node)) case ssa.OpARMLessThan: v.Fatalf("pseudo-op made it to output: %s", v.LongString()) default: v.Unimplementedf("genValue not implemented: %s", v.LongString()) } }