// zerorange zeroes the stack range [frame+lo, frame+hi) in the function
// prologue (32-bit variant). *ax records whether AX already holds zero, so it
// is loaded at most once across calls. Small ranges are cleared with inline
// MOVL stores, medium ranges jump into runtime.duffzero, and large ranges
// fall back to REP STOSL.
func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32) *obj.Prog {
	cnt := hi - lo
	if cnt == 0 {
		return p
	}
	if *ax == 0 {
		p = appendpp(p, x86.AMOVL, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
		*ax = 1
	}

	if cnt <= int64(4*gc.Widthreg) {
		for i := int64(0); i < cnt; i += int64(gc.Widthreg) {
			p = appendpp(p, x86.AMOVL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i)
		}
	} else if !gc.Nacl && cnt <= int64(128*gc.Widthreg) {
		p = appendpp(p, x86.ALEAL, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0)
		p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, 1*(128-cnt/int64(gc.Widthreg)))
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
	} else {
		p = appendpp(p, x86.AMOVL, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0)
		p = appendpp(p, x86.ALEAL, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0)
		p = appendpp(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
		p = appendpp(p, x86.ASTOSL, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
	}

	return p
}
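// clearfat zeroes a fat (multiword) object that Componentgen cannot handle
// (64-bit variant). Very large or NaCl-restricted objects use REP STOSQ,
// objects of 64 bytes or more jump into runtime.duffzero with SSE stores
// through X0, and anything smaller is handled by clearfat_tail.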
func clearfat(nl *gc.Node) {
	/* clear a fat object */
	if gc.Debug['g'] != 0 {
		gc.Dump("\nclearfat", nl)
	}

	// Avoid taking the address for simple enough types.
	if gc.Componentgen(nil, nl) {
		return
	}

	w := nl.Type.Width

	if w > 1024 || (gc.Nacl && w >= 64) {
		var oldn1 gc.Node
		var n1 gc.Node
		savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr])
		gc.Agen(nl, &n1)

		var ax gc.Node
		var oldax gc.Node
		savex(x86.REG_AX, &ax, &oldax, nil, gc.Types[gc.Tptr])
		gconreg(x86.AMOVL, 0, x86.REG_AX)
		gconreg(movptr, w/8, x86.REG_CX)

		gins(x86.AREP, nil, nil)   // repeat
		gins(x86.ASTOSQ, nil, nil) // STOQ AL,*(DI)+

		if w%8 != 0 {
			n1.Op = gc.OINDREG
			clearfat_tail(&n1, w%8)
		}

		restx(&n1, &oldn1)
		restx(&ax, &oldax)
		return
	}

	if w >= 64 {
		var oldn1 gc.Node
		var n1 gc.Node
		savex(x86.REG_DI, &n1, &oldn1, nil, gc.Types[gc.Tptr])
		gc.Agen(nl, &n1)

		var vec_zero gc.Node
		var old_x0 gc.Node
		savex(x86.REG_X0, &vec_zero, &old_x0, nil, gc.Types[gc.TFLOAT64])
		gins(x86.AXORPS, &vec_zero, &vec_zero)

		if di := dzDI(w); di != 0 {
			gconreg(addptr, di, x86.REG_DI)
		}
		p := gins(obj.ADUFFZERO, nil, nil)
		p.To.Type = obj.TYPE_ADDR
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
		p.To.Offset = dzOff(w)

		if w%16 != 0 {
			n1.Op = gc.OINDREG
			n1.Xoffset -= 16 - w%16
			gins(x86.AMOVUPS, &vec_zero, &n1)
		}

		restx(&vec_zero, &old_x0)
		restx(&n1, &oldn1)
		return
	}

	// NOTE: Must use agen, not igen, so that optimizer sees address
	// being taken. We are not writing on field boundaries.
	var n1 gc.Node
	gc.Agenr(nl, &n1, nil)

	n1.Op = gc.OINDREG
	clearfat_tail(&n1, w)
	gc.Regfree(&n1)
}
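// zerorange zeroes the stack range [frame+lo, frame+hi) in the function
// prologue (64-bit variant). *ax and *x0 record whether AX and X0 already
// hold zero, so each is initialized at most once across calls. Small ranges
// use MOVQ/MOVUPS stores, medium ranges jump into runtime.duffzero (dzOff and
// dzDI supply the entry offset and the DI pre-adjustment), and large ranges
// fall back to REP STOSQ.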
func zerorange(p *obj.Prog, frame int64, lo int64, hi int64, ax *uint32, x0 *uint32) *obj.Prog {
	cnt := hi - lo
	if cnt == 0 {
		return p
	}

	if cnt%int64(gc.Widthreg) != 0 {
		// should only happen with nacl
		if cnt%int64(gc.Widthptr) != 0 {
			gc.Fatalf("zerorange count not a multiple of widthptr %d", cnt)
		}
		if *ax == 0 {
			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
			*ax = 1
		}
		p = appendpp(p, x86.AMOVL, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo)
		lo += int64(gc.Widthptr)
		cnt -= int64(gc.Widthptr)
	}

	if cnt == 8 {
		if *ax == 0 {
			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
			*ax = 1
		}
		p = appendpp(p, x86.AMOVQ, obj.TYPE_REG, x86.REG_AX, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo)
	} else if cnt <= int64(8*gc.Widthreg) {
		if *x0 == 0 {
			p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0)
			*x0 = 1
		}
		for i := int64(0); i < cnt/16; i++ {
			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+i*16)
		}
		if cnt%16 != 0 {
			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_SP, frame+lo+cnt-int64(16))
		}
	} else if !gc.Nacl && (cnt <= int64(128*gc.Widthreg)) {
		if *x0 == 0 {
			p = appendpp(p, x86.AXORPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_REG, x86.REG_X0, 0)
			*x0 = 1
		}
		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo+dzDI(cnt), obj.TYPE_REG, x86.REG_DI, 0)
		p = appendpp(p, obj.ADUFFZERO, obj.TYPE_NONE, 0, 0, obj.TYPE_ADDR, 0, dzOff(cnt))
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
		if cnt%16 != 0 {
			p = appendpp(p, x86.AMOVUPS, obj.TYPE_REG, x86.REG_X0, 0, obj.TYPE_MEM, x86.REG_DI, -int64(8))
		}
	} else {
		if *ax == 0 {
			p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, 0, obj.TYPE_REG, x86.REG_AX, 0)
			*ax = 1
		}
		p = appendpp(p, x86.AMOVQ, obj.TYPE_CONST, 0, cnt/int64(gc.Widthreg), obj.TYPE_REG, x86.REG_CX, 0)
		p = appendpp(p, leaptr, obj.TYPE_MEM, x86.REG_SP, frame+lo, obj.TYPE_REG, x86.REG_DI, 0)
		p = appendpp(p, x86.AREP, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
		p = appendpp(p, x86.ASTOSQ, obj.TYPE_NONE, 0, 0, obj.TYPE_NONE, 0, 0)
	}

	return p
}
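// blockcopy copies w bytes from n to res using the SI/DI string instructions
// (32-bit variant). If the source and destination overlap on the stack and the
// copy moves forward, the direction flag is set and the copy runs backwards;
// otherwise it copies forward, choosing among REP MOVSL, runtime.duffcopy,
// and unrolled MOVL/MOVSL sequences depending on the size.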
func blockcopy(n, res *gc.Node, osrc, odst, w int64) {
	var dst gc.Node
	gc.Nodreg(&dst, gc.Types[gc.Tptr], x86.REG_DI)
	var src gc.Node
	gc.Nodreg(&src, gc.Types[gc.Tptr], x86.REG_SI)

	var tsrc gc.Node
	gc.Tempname(&tsrc, gc.Types[gc.Tptr])
	var tdst gc.Node
	gc.Tempname(&tdst, gc.Types[gc.Tptr])
	if !n.Addable {
		gc.Agen(n, &tsrc)
	}
	if !res.Addable {
		gc.Agen(res, &tdst)
	}
	if n.Addable {
		gc.Agen(n, &src)
	} else {
		gmove(&tsrc, &src)
	}

	if res.Op == gc.ONAME {
		gc.Gvardef(res)
	}

	if res.Addable {
		gc.Agen(res, &dst)
	} else {
		gmove(&tdst, &dst)
	}

	c := int32(w % 4) // bytes
	q := int32(w / 4) // doublewords

	// if we are copying forward on the stack and
	// the src and dst overlap, then reverse direction
	if osrc < odst && odst < osrc+w {
		// reverse direction
		gins(x86.ASTD, nil, nil) // set direction flag
		if c > 0 {
			gconreg(x86.AADDL, w-1, x86.REG_SI)
			gconreg(x86.AADDL, w-1, x86.REG_DI)

			gconreg(x86.AMOVL, int64(c), x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSB, nil, nil) // MOVB *(SI)-,*(DI)-
		}

		if q > 0 {
			if c > 0 {
				gconreg(x86.AADDL, -3, x86.REG_SI)
				gconreg(x86.AADDL, -3, x86.REG_DI)
			} else {
				gconreg(x86.AADDL, w-4, x86.REG_SI)
				gconreg(x86.AADDL, w-4, x86.REG_DI)
			}

			gconreg(x86.AMOVL, int64(q), x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSL, nil, nil) // MOVL *(SI)-,*(DI)-
		}

		// we leave with the flag clear
		gins(x86.ACLD, nil, nil)
	} else {
		gins(x86.ACLD, nil, nil) // paranoia. TODO(rsc): remove?

		// normal direction
		if q > 128 || (q >= 4 && gc.Nacl) {
			gconreg(x86.AMOVL, int64(q), x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSL, nil, nil) // MOVL *(SI)+,*(DI)+
		} else if q >= 4 {
			p := gins(obj.ADUFFCOPY, nil, nil)
			p.To.Type = obj.TYPE_ADDR
			p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))

			// 10 and 128 = magic constants: see ../../runtime/asm_386.s
			p.To.Offset = 10 * (128 - int64(q))
		} else if !gc.Nacl && c == 0 {
			var cx gc.Node
			gc.Nodreg(&cx, gc.Types[gc.TINT32], x86.REG_CX)

			// We don't need the MOVSL side-effect of updating SI and DI,
			// and issuing a sequence of MOVLs directly is faster.
			src.Op = gc.OINDREG
			dst.Op = gc.OINDREG
			for q > 0 {
				gmove(&src, &cx) // MOVL x+(SI),CX
				gmove(&cx, &dst) // MOVL CX,x+(DI)
				src.Xoffset += 4
				dst.Xoffset += 4
				q--
			}
		} else {
			for q > 0 {
				gins(x86.AMOVSL, nil, nil) // MOVL *(SI)+,*(DI)+
				q--
			}
		}

		for c > 0 {
			gins(x86.AMOVSB, nil, nil) // MOVB *(SI)+,*(DI)+
			c--
		}
	}
}
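// blockcopy copies w bytes from n to ns (64-bit variant). Overlapping forward
// copies on the stack are done backwards with the direction flag set.
// Otherwise it copies forward with REP MOVSQ, runtime.duffcopy, or unrolled
// MOVQ sequences, then moves the remaining c bytes either one at a time or
// with a pair of overlapping 4- or 8-byte loads and stores.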
func blockcopy(n, ns *gc.Node, osrc, odst, w int64) {
	var noddi gc.Node
	gc.Nodreg(&noddi, gc.Types[gc.Tptr], x86.REG_DI)
	var nodsi gc.Node
	gc.Nodreg(&nodsi, gc.Types[gc.Tptr], x86.REG_SI)

	var nodl gc.Node
	var nodr gc.Node
	if n.Ullman >= ns.Ullman {
		gc.Agenr(n, &nodr, &nodsi)
		if ns.Op == gc.ONAME {
			gc.Gvardef(ns)
		}
		gc.Agenr(ns, &nodl, &noddi)
	} else {
		if ns.Op == gc.ONAME {
			gc.Gvardef(ns)
		}
		gc.Agenr(ns, &nodl, &noddi)
		gc.Agenr(n, &nodr, &nodsi)
	}

	if nodl.Reg != x86.REG_DI {
		gmove(&nodl, &noddi)
	}
	if nodr.Reg != x86.REG_SI {
		gmove(&nodr, &nodsi)
	}
	gc.Regfree(&nodl)
	gc.Regfree(&nodr)

	c := w % 8 // bytes
	q := w / 8 // quads

	var oldcx gc.Node
	var cx gc.Node
	savex(x86.REG_CX, &cx, &oldcx, nil, gc.Types[gc.TINT64])

	// if we are copying forward on the stack and
	// the src and dst overlap, then reverse direction
	if osrc < odst && odst < osrc+w {
		// reverse direction
		gins(x86.ASTD, nil, nil) // set direction flag
		if c > 0 {
			gconreg(addptr, w-1, x86.REG_SI)
			gconreg(addptr, w-1, x86.REG_DI)

			gconreg(movptr, c, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSB, nil, nil) // MOVB *(SI)-,*(DI)-
		}

		if q > 0 {
			if c > 0 {
				gconreg(addptr, -7, x86.REG_SI)
				gconreg(addptr, -7, x86.REG_DI)
			} else {
				gconreg(addptr, w-8, x86.REG_SI)
				gconreg(addptr, w-8, x86.REG_DI)
			}

			gconreg(movptr, q, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)-,*(DI)-
		}

		// we leave with the flag clear
		gins(x86.ACLD, nil, nil)
	} else {
		// normal direction
		if q > 128 || (gc.Nacl && q >= 4) || (obj.Getgoos() == "plan9" && q >= 4) {
			gconreg(movptr, q, x86.REG_CX)
			gins(x86.AREP, nil, nil)   // repeat
			gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+
		} else if q >= 4 {
			var oldx0 gc.Node
			var x0 gc.Node
			savex(x86.REG_X0, &x0, &oldx0, nil, gc.Types[gc.TFLOAT64])

			p := gins(obj.ADUFFCOPY, nil, nil)
			p.To.Type = obj.TYPE_ADDR
			p.To.Sym = gc.Linksym(gc.Pkglookup("duffcopy", gc.Runtimepkg))

			// 64 blocks taking 14 bytes each
			// see ../../../../runtime/mkduff.go
			p.To.Offset = 14 * (64 - q/2)
			restx(&x0, &oldx0)

			if q%2 != 0 {
				gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+
			}
		} else if !gc.Nacl && c == 0 {
			// We don't need the MOVSQ side-effect of updating SI and DI,
			// and issuing a sequence of MOVQs directly is faster.
			nodsi.Op = gc.OINDREG
			noddi.Op = gc.OINDREG
			for q > 0 {
				gmove(&nodsi, &cx) // MOVQ x+(SI),CX
				gmove(&cx, &noddi) // MOVQ CX,x+(DI)
				nodsi.Xoffset += 8
				noddi.Xoffset += 8
				q--
			}
		} else {
			for q > 0 {
				gins(x86.AMOVSQ, nil, nil) // MOVQ *(SI)+,*(DI)+
				q--
			}
		}

		// copy the remaining c bytes
		if w < 4 || c <= 1 || (odst < osrc && osrc < odst+w) {
			for c > 0 {
				gins(x86.AMOVSB, nil, nil) // MOVB *(SI)+,*(DI)+
				c--
			}
		} else if w < 8 || c <= 4 {
			nodsi.Op = gc.OINDREG
			noddi.Op = gc.OINDREG
			cx.Type = gc.Types[gc.TINT32]
			nodsi.Type = gc.Types[gc.TINT32]
			noddi.Type = gc.Types[gc.TINT32]
			if c > 4 {
				nodsi.Xoffset = 0
				noddi.Xoffset = 0
				gmove(&nodsi, &cx)
				gmove(&cx, &noddi)
			}

			nodsi.Xoffset = c - 4
			noddi.Xoffset = c - 4
			gmove(&nodsi, &cx)
			gmove(&cx, &noddi)
		} else {
			nodsi.Op = gc.OINDREG
			noddi.Op = gc.OINDREG
			cx.Type = gc.Types[gc.TINT64]
			nodsi.Type = gc.Types[gc.TINT64]
			noddi.Type = gc.Types[gc.TINT64]
			nodsi.Xoffset = c - 8
			noddi.Xoffset = c - 8
			gmove(&nodsi, &cx)
			gmove(&cx, &noddi)
		}
	}

	restx(&cx, &oldcx)
}
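// clearfat zeroes a fat (multiword) object that Componentgen cannot handle
// (32-bit variant). Small objects get an unrolled sequence of MOVL/MOVB
// stores, very large or NaCl-restricted objects use REP STOSL, medium objects
// jump into runtime.duffzero, and any trailing bytes are cleared with STOSB.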
func clearfat(nl *gc.Node) {
	/* clear a fat object */
	if gc.Debug['g'] != 0 {
		gc.Dump("\nclearfat", nl)
	}

	w := uint32(nl.Type.Width)

	// Avoid taking the address for simple enough types.
	if gc.Componentgen(nil, nl) {
		return
	}

	c := w % 4 // bytes
	q := w / 4 // doublewords

	if q < 4 {
		// Write sequence of MOV 0, off(base) instead of using STOSL.
		// The hope is that although the code will be slightly longer,
		// the MOVs will have no dependencies and pipeline better
		// than the unrolled STOSL loop.
		// NOTE: Must use agen, not igen, so that optimizer sees address
		// being taken. We are not writing on field boundaries.
		var n1 gc.Node
		gc.Regalloc(&n1, gc.Types[gc.Tptr], nil)

		gc.Agen(nl, &n1)
		n1.Op = gc.OINDREG
		var z gc.Node
		gc.Nodconst(&z, gc.Types[gc.TUINT64], 0)
		for ; q > 0; q-- {
			n1.Type = z.Type
			gins(x86.AMOVL, &z, &n1)
			n1.Xoffset += 4
		}

		gc.Nodconst(&z, gc.Types[gc.TUINT8], 0)
		for ; c > 0; c-- {
			n1.Type = z.Type
			gins(x86.AMOVB, &z, &n1)
			n1.Xoffset++
		}

		gc.Regfree(&n1)
		return
	}

	var n1 gc.Node
	gc.Nodreg(&n1, gc.Types[gc.Tptr], x86.REG_DI)
	gc.Agen(nl, &n1)
	gconreg(x86.AMOVL, 0, x86.REG_AX)

	if q > 128 || (q >= 4 && gc.Nacl) {
		gconreg(x86.AMOVL, int64(q), x86.REG_CX)
		gins(x86.AREP, nil, nil)   // repeat
		gins(x86.ASTOSL, nil, nil) // STOL AL,*(DI)+
	} else if q >= 4 {
		p := gins(obj.ADUFFZERO, nil, nil)
		p.To.Type = obj.TYPE_ADDR
		p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))

		// 1 and 128 = magic constants: see ../../runtime/asm_386.s
		p.To.Offset = 1 * (128 - int64(q))
	} else {
		for q > 0 {
			gins(x86.ASTOSL, nil, nil) // STOL AL,*(DI)+
			q--
		}
	}

	for c > 0 {
		gins(x86.ASTOSB, nil, nil) // STOB AL,*(DI)+
		c--
	}
}