func RunSchedLocalQueueEmptyTest(iters int) {
	// Test that runq is not spuriously reported as empty.
	// Runq emptiness affects scheduling decisions and spurious emptiness
	// can lead to underutilization (both runnable Gs and idle Ps coexist
	// for arbitrary long time).
	done := make(chan bool, 1)
	p := new(p)
	gs := make([]g, 2)
	ready := new(uint32)
	for i := 0; i < iters; i++ {
		*ready = 0
		next0 := (i & 1) == 0
		next1 := (i & 2) == 0
		runqput(p, &gs[0], next0)
		go func() {
			for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
			}
			if runqempty(p) {
				println("next:", next0, next1)
				throw("queue is empty")
			}
			done <- true
		}()
		for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
		}
		runqput(p, &gs[1], next1)
		runqget(p)
		<-done
		runqget(p)
	}
}
func semrelease(addr *uint32) {
	root := semroot(addr)
	atomic.Xadd(addr, 1)

	// Easy case: no waiters?
	// This check must happen after the xadd, to avoid a missed wakeup
	// (see loop in semacquire).
	if atomic.Load(&root.nwait) == 0 {
		return
	}

	// Harder case: search for a waiter and wake it.
	lock(&root.lock)
	if atomic.Load(&root.nwait) == 0 {
		// The count is already consumed by another goroutine,
		// so no need to wake up another goroutine.
		unlock(&root.lock)
		return
	}
	s := root.head
	for ; s != nil; s = s.next {
		if s.elem == unsafe.Pointer(addr) {
			atomic.Xadd(&root.nwait, -1)
			root.dequeue(s)
			break
		}
	}
	unlock(&root.lock)
	if s != nil {
		if s.releasetime != 0 {
			s.releasetime = cputicks()
		}
		goready(s.g, 4)
	}
}
// Called from runtime.
func semacquire(addr *uint32, profile semaProfileFlags) {
	gp := getg()
	if gp != gp.m.curg {
		throw("semacquire not on the G stack")
	}

	// Easy case.
	if cansemacquire(addr) {
		return
	}

	// Harder case:
	//	increment waiter count
	//	try cansemacquire one more time, return if succeeded
	//	enqueue itself as a waiter
	//	sleep
	//	(waiter descriptor is dequeued by signaler)
	s := acquireSudog()
	root := semroot(addr)
	t0 := int64(0)
	s.releasetime = 0
	s.acquiretime = 0
	if profile&semaBlockProfile != 0 && blockprofilerate > 0 {
		t0 = cputicks()
		s.releasetime = -1
	}
	if profile&semaMutexProfile != 0 && mutexprofilerate > 0 {
		if t0 == 0 {
			t0 = cputicks()
		}
		s.acquiretime = t0
	}
	for {
		lock(&root.lock)
		// Add ourselves to nwait to disable "easy case" in semrelease.
		atomic.Xadd(&root.nwait, 1)
		// Check cansemacquire to avoid missed wakeup.
		if cansemacquire(addr) {
			atomic.Xadd(&root.nwait, -1)
			unlock(&root.lock)
			break
		}
		// Any semrelease after the cansemacquire knows we're waiting
		// (we set nwait above), so go to sleep.
		root.queue(addr, s)
		goparkunlock(&root.lock, "semacquire", traceEvGoBlockSync, 4)
		if cansemacquire(addr) {
			break
		}
	}
	if s.releasetime > 0 {
		blockevent(s.releasetime-t0, 3)
	}
	releaseSudog(s)
}
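The correctness of this acquire/release pair rests on ordering: semrelease bumps the count before checking nwait, and semacquire registers itself in nwait before re-checking the count, so one side always observes the other and no wakeup is lost. Below is a minimal, self-contained sketch of that handshake built on sync/atomic; the miniSema type, its channel-based parking, and all helper names are illustrative assumptions, not the runtime's sudog/semaroot machinery.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// miniSema is a hypothetical counting semaphore mirroring the ordering above:
//   release: add to the count, THEN check for waiters;
//   acquire: register as a waiter, THEN re-check the count before sleeping.
type miniSema struct {
	count   uint32        // available units (runtime: *addr)
	nwait   uint32        // registered waiters (runtime: root.nwait)
	wakeups chan struct{} // stand-in for parking/unparking a goroutine
}

func newMiniSema() *miniSema {
	// Large buffer so release never blocks; a simplification of goready.
	return &miniSema{wakeups: make(chan struct{}, 1<<16)}
}

// tryAcquire is the analogue of cansemacquire: decrement count if positive.
func (s *miniSema) tryAcquire() bool {
	for {
		v := atomic.LoadUint32(&s.count)
		if v == 0 {
			return false
		}
		if atomic.CompareAndSwapUint32(&s.count, v, v-1) {
			return true
		}
	}
}

func (s *miniSema) acquire() {
	if s.tryAcquire() {
		return // easy case
	}
	for {
		// Register as a waiter first so a concurrent release sees us...
		atomic.AddUint32(&s.nwait, 1)
		// ...then re-check: a release may have slipped in between.
		if s.tryAcquire() {
			atomic.AddUint32(&s.nwait, ^uint32(0)) // decrement nwait
			return
		}
		<-s.wakeups // "park" until some release wakes a waiter
		atomic.AddUint32(&s.nwait, ^uint32(0))
		if s.tryAcquire() {
			return
		}
	}
}

func (s *miniSema) release() {
	atomic.AddUint32(&s.count, 1)
	// Check for waiters only after the count is visible, mirroring
	// "this check must happen after the xadd" in semrelease.
	if atomic.LoadUint32(&s.nwait) == 0 {
		return
	}
	s.wakeups <- struct{}{}
}

func main() {
	s := newMiniSema()
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			s.acquire()
			fmt.Println("worker", id, "acquired")
		}(i)
	}
	for i := 0; i < 4; i++ {
		s.release()
	}
	wg.Wait()
}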
// Get a full work buffer off the work.full list.
// If nothing is available wait until all the other gc helpers have
// finished and then return nil.
// getfull acts as a barrier for work.nproc helpers. As long as one
// gchelper is actively marking objects it
// may create a workbuffer that the other helpers can work on.
// The for loop either exits when a work buffer is found
// or when _all_ of the work.nproc GC helpers are in the loop
// looking for work and thus not capable of creating new work.
// This is in fact the termination condition for the STW mark
// phase.
//go:nowritebarrier
func getfull(entry int) *workbuf {
	b := (*workbuf)(lfstackpop(&work.full))
	if b != nil {
		b.logget(entry)
		b.checknonempty()
		return b
	}

	incnwait := atomic.Xadd(&work.nwait, +1)
	if incnwait > work.nproc {
		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
		throw("work.nwait > work.nproc")
	}
	for i := 0; ; i++ {
		if work.full != 0 {
			decnwait := atomic.Xadd(&work.nwait, -1)
			if decnwait == work.nproc {
				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
				throw("work.nwait > work.nproc")
			}
			b = (*workbuf)(lfstackpop(&work.full))
			if b != nil {
				b.logget(entry)
				b.checknonempty()
				return b
			}
			incnwait := atomic.Xadd(&work.nwait, +1)
			if incnwait > work.nproc {
				println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
				throw("work.nwait > work.nproc")
			}
		}
		if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
			return nil
		}
		_g_ := getg()
		if i < 10 {
			_g_.m.gcstats.nprocyield++
			procyield(20)
		} else if i < 20 {
			_g_.m.gcstats.nosyield++
			osyield()
		} else {
			_g_.m.gcstats.nsleep++
			usleep(100)
		}
	}
}
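getfull's doc comment describes a termination barrier: a helper that cannot find work adds itself to work.nwait, leaves that count before taking any work it later finds, and everyone quits once nwait reaches nproc with no work left. Here is a rough standalone sketch of the same discipline; the pool type, its mutex-protected slice, and the runtime.Gosched backoff are assumptions standing in for the runtime's lock-free work.full list and its procyield/osyield/usleep backoff.

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

// pool is a hypothetical work pool mirroring getfull's barrier: a worker
// that finds nothing increments nwait; it must decrement BEFORE taking
// work, so "nwait == nproc" means nobody holds work and nobody can
// create more.
type pool struct {
	mu    sync.Mutex
	work  []int
	nwait int32
	nproc int32
}

func (p *pool) put(w int) {
	p.mu.Lock()
	p.work = append(p.work, w)
	p.mu.Unlock()
}

func (p *pool) get() (int, bool) {
	p.mu.Lock()
	defer p.mu.Unlock()
	if len(p.work) == 0 {
		return 0, false
	}
	w := p.work[len(p.work)-1]
	p.work = p.work[:len(p.work)-1]
	return w, true
}

func (p *pool) pending() bool {
	p.mu.Lock()
	defer p.mu.Unlock()
	return len(p.work) > 0
}

// getOrWait mirrors getfull: return work if any, otherwise wait until
// either work appears or every worker is idle (the termination condition).
func (p *pool) getOrWait() (int, bool) {
	if w, ok := p.get(); ok {
		return w, true
	}
	atomic.AddInt32(&p.nwait, 1) // declare ourselves idle
	for {
		if p.pending() {
			// Leave the idle count before trying to take the work, so no
			// other worker can observe "everyone idle" while we hold work.
			atomic.AddInt32(&p.nwait, -1)
			if w, ok := p.get(); ok {
				return w, true
			}
			atomic.AddInt32(&p.nwait, 1) // lost the race; idle again
		}
		if atomic.LoadInt32(&p.nwait) == p.nproc {
			return 0, false // all workers idle and no work left: terminate
		}
		runtime.Gosched() // the runtime backs off with procyield/osyield/usleep
	}
}

func main() {
	p := &pool{nproc: 4}
	p.put(8) // seed: item n spawns two copies of n-1, down to 0
	var total int64
	var wg sync.WaitGroup
	for i := int32(0); i < p.nproc; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				n, ok := p.getOrWait()
				if !ok {
					return // barrier tripped: every worker is idle
				}
				atomic.AddInt64(&total, 1)
				if n > 0 {
					p.put(n - 1)
					p.put(n - 1)
				}
			}
		}()
	}
	wg.Wait()
	fmt.Println("processed", atomic.LoadInt64(&total), "items") // 2^9 - 1 = 511
}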
// sweeps one span
// returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
//go:nowritebarrier
func sweepone() uintptr {
	_g_ := getg()

	// increment locks to ensure that the goroutine is not preempted
	// in the middle of sweep thus leaving the span in an inconsistent state for next GC
	_g_.m.locks++
	sg := mheap_.sweepgen
	for {
		idx := atomic.Xadd(&sweep.spanidx, 1) - 1
		if idx >= uint32(len(work.spans)) {
			mheap_.sweepdone = 1
			_g_.m.locks--
			return ^uintptr(0)
		}
		s := work.spans[idx]
		if s.state != mSpanInUse {
			s.sweepgen = sg
			continue
		}
		if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
			continue
		}
		npages := s.npages
		if !s.sweep(false) {
			npages = 0
		}
		_g_.m.locks--
		return npages
	}
}
// sweeps one span
// returns number of pages returned to heap, or ^uintptr(0) if there is nothing to sweep
//go:nowritebarrier
func sweepone() uintptr {
	_g_ := getg()

	// increment locks to ensure that the goroutine is not preempted
	// in the middle of sweep thus leaving the span in an inconsistent state for next GC
	_g_.m.locks++
	sg := mheap_.sweepgen
	for {
		idx := atomic.Xadd(&sweep.spanidx, 1) - 1
		if idx >= uint32(len(work.spans)) {
			mheap_.sweepdone = 1
			_g_.m.locks--
			if debug.gcpacertrace > 0 && idx == uint32(len(work.spans)) {
				print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", mheap_.spanBytesAlloc>>20, "MB of spans; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
			}
			return ^uintptr(0)
		}
		s := work.spans[idx]
		if s.state != mSpanInUse {
			s.sweepgen = sg
			continue
		}
		if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
			continue
		}
		npages := s.npages
		if !s.sweep(false) {
			npages = 0
		}
		_g_.m.locks--
		return npages
	}
}
// lockextra locks the extra list and returns the list head.
// The caller must unlock the list by storing a new list head
// to extram. If nilokay is true, then lockextra will
// return a nil list head if that's what it finds. If nilokay is false,
// lockextra will keep waiting until the list head is no longer nil.
//go:nosplit
func lockextra(nilokay bool) *m {
	const locked = 1

	incr := false
	for {
		old := atomic.Loaduintptr(&extram)
		if old == locked {
			yield := osyield
			yield()
			continue
		}
		if old == 0 && !nilokay {
			if !incr {
				// Add 1 to the number of threads
				// waiting for an M.
				// This is cleared by newextram.
				atomic.Xadd(&extraMWaiters, 1)
				incr = true
			}
			usleep(1)
			continue
		}
		if atomic.Casuintptr(&extram, old, locked) {
			return (*m)(unsafe.Pointer(old))
		}
		yield := osyield
		yield()
		continue
	}
}
// pop removes and returns a span from buffer b, or nil if b is empty.
// pop is safe to call concurrently with other pop operations, but NOT
// to call concurrently with push.
func (b *gcSweepBuf) pop() *mspan {
	cursor := atomic.Xadd(&b.index, -1)
	if int32(cursor) < 0 {
		atomic.Xadd(&b.index, +1)
		return nil
	}

	// There are no concurrent spine or block modifications during
	// pop, so we can omit the atomics.
	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries
	blockp := (**gcSweepBlock)(add(b.spine, sys.PtrSize*uintptr(top)))
	block := *blockp
	s := block.spans[bottom]
	// Clear the pointer for block(i).
	block.spans[bottom] = nil
	return s
}
// gcDrainN blackens grey objects until it has performed roughly
// scanWork units of scan work or the G is preempted. This is
// best-effort, so it may perform less work if it fails to get a work
// buffer. Otherwise, it will perform at least n units of work, but
// may perform more because scanning is always done in whole object
// increments. It returns the amount of scan work performed.
//
// The caller goroutine must be in a preemptible state (e.g.,
// _Gwaiting) to prevent deadlocks during stack scanning. As a
// consequence, this must be called on the system stack.
//
//go:nowritebarrier
//go:systemstack
func gcDrainN(gcw *gcWork, scanWork int64) int64 {
	if !writeBarrier.needed {
		throw("gcDrainN phase incorrect")
	}

	// There may already be scan work on the gcw, which we don't
	// want to claim was done by this call.
	workFlushed := -gcw.scanWork

	gp := getg().m.curg
	for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
		// See gcDrain comment.
		if work.full == 0 {
			gcw.balance()
		}

		// This might be a good place to add prefetch code...
		// if(wbuf.nobj > 4) {
		//         PREFETCH(wbuf->obj[wbuf.nobj - 3];
		// }
		//
		b := gcw.tryGetFast()
		if b == 0 {
			b = gcw.tryGet()
		}

		if b == 0 {
			// Try to do a root job.
			//
			// TODO: Assists should get credit for this
			// work.
			if work.markrootNext < work.markrootJobs {
				job := atomic.Xadd(&work.markrootNext, +1) - 1
				if job < work.markrootJobs {
					markroot(gcw, job)
					continue
				}
			}
			// No heap or root jobs.
			break
		}
		scanobject(b, gcw)

		// Flush background scan work credit.
		if gcw.scanWork >= gcCreditSlack {
			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
			workFlushed += gcw.scanWork
			gcw.scanWork = 0
		}
	}

	// Unlike gcDrain, there's no need to flush remaining work
	// here because this never flushes to bgScanCredit and
	// gcw.dispose will flush any remaining work to scanWork.

	return workFlushed + gcw.scanWork
}
func semrelease(addr *uint32) {
	root := semroot(addr)
	atomic.Xadd(addr, 1)

	// Easy case: no waiters?
	// This check must happen after the xadd, to avoid a missed wakeup
	// (see loop in semacquire).
	if atomic.Load(&root.nwait) == 0 {
		return
	}

	// Harder case: search for a waiter and wake it.
	lock(&root.lock)
	if atomic.Load(&root.nwait) == 0 {
		// The count is already consumed by another goroutine,
		// so no need to wake up another goroutine.
		unlock(&root.lock)
		return
	}
	s := root.head
	for ; s != nil; s = s.next {
		if s.elem == unsafe.Pointer(addr) {
			atomic.Xadd(&root.nwait, -1)
			root.dequeue(s)
			break
		}
	}
	if s != nil {
		if s.acquiretime != 0 {
			t0 := cputicks()
			for x := root.head; x != nil; x = x.next {
				if x.elem == unsafe.Pointer(addr) {
					x.acquiretime = t0
				}
			}
			mutexevent(t0-s.acquiretime, 3)
		}
	}
	unlock(&root.lock)
	if s != nil { // May be slow, so unlock first
		readyWithTime(s, 5)
	}
}
//go:nosplit
func semawakeup(mp *m) {
	atomic.Xadd(&mp.waitsemacount, 1)
	ret := thrwakeup(uintptr(unsafe.Pointer(&mp.waitsemacount)), 1)
	if ret != 0 && ret != _ESRCH {
		// semawakeup can be called on signal stack.
		systemstack(func() {
			print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
		})
	}
}
//go:nosplit
func semawakeup(mp *m) {
	atomic.Xadd(&mp.waitsemacount, 1)
	// From NetBSD's _lwp_unpark(2) manual:
	// "If the target LWP is not currently waiting, it will return
	// immediately upon the next call to _lwp_park()."
	ret := lwp_unpark(int32(mp.procid), unsafe.Pointer(&mp.waitsemacount))
	if ret != 0 && ret != _ESRCH {
		// semawakeup can be called on signal stack.
		systemstack(func() {
			print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
		})
	}
}
func dopanic(unused int) {
	gp := getg()
	if gp.sig != 0 {
		signame := signame(gp.sig)
		if signame != "" {
			print("[signal ", signame)
		} else {
			print("[signal ", hex(gp.sig))
		}
		print(" code=", hex(gp.sigcode0), " addr=", hex(gp.sigcode1), " pc=", hex(gp.sigpc), "]\n")
	}

	level, all, docrash := gotraceback()
	_g_ := getg()
	if level > 0 {
		if gp != gp.m.curg {
			all = true
		}
		if gp != gp.m.g0 {
			print("\n")
			goroutineheader(gp)
			traceback(0)
		} else if level >= 2 || _g_.m.throwing > 0 {
			print("\nruntime stack:\n")
			traceback(0)
		}
		if !didothers && all {
			didothers = true
			tracebackothers(gp)
		}
	}
	unlock(&paniclk)

	if atomic.Xadd(&panicking, -1) != 0 {
		// Some other m is panicking too.
		// Let it print what it needs to print.
		// Wait forever without chewing up cpu.
		// It will exit when it's done.
		lock(&deadlock)
		lock(&deadlock)
	}

	if docrash {
		crash()
	}

	exit(2)
}
func startpanic() {
	_g_ := getg()
	// Uncomment when mheap_ is in Go.
	// if mheap_.cachealloc.size == 0 { // very early
	//	print("runtime: panic before malloc heap initialized\n")
	//	_g_.m.mallocing = 1 // tell rest of panic not to try to malloc
	// } else
	if _g_.m.mcache == nil { // can happen if called from signal handler or throw
		_g_.m.mcache = allocmcache()
	}

	switch _g_.m.dying {
	case 0:
		_g_.m.dying = 1
		_g_.writebuf = nil
		atomic.Xadd(&panicking, 1)
		lock(&paniclk)
		if debug.schedtrace > 0 || debug.scheddetail > 0 {
			schedtrace(true)
		}
		freezetheworld()
		return
	case 1:
		// Something failed while panicking, probably the print of the
		// argument to panic(). Just print a stack trace and exit.
		_g_.m.dying = 2
		print("panic during panic\n")
		dopanic(0)
		exit(3)
		fallthrough
	case 2:
		// This is a genuine bug in the runtime, we couldn't even
		// print the stack trace successfully.
		_g_.m.dying = 3
		print("stack trace unavailable\n")
		exit(4)
		fallthrough
	default:
		// Can't even print! Just exit.
		exit(5)
	}
}
func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot *ptrtype) {
	lock(&finlock)
	if finq == nil || finq.cnt == uint32(len(finq.fin)) {
		if finc == nil {
			finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys))
			finc.alllink = allfin
			allfin = finc
			if finptrmask[0] == 0 {
				// Build pointer mask for Finalizer array in block.
				// Check assumptions made in finalizer1 array above.
				if (unsafe.Sizeof(finalizer{}) != 5*sys.PtrSize ||
					unsafe.Offsetof(finalizer{}.fn) != 0 ||
					unsafe.Offsetof(finalizer{}.arg) != sys.PtrSize ||
					unsafe.Offsetof(finalizer{}.nret) != 2*sys.PtrSize ||
					unsafe.Offsetof(finalizer{}.fint) != 3*sys.PtrSize ||
					unsafe.Offsetof(finalizer{}.ot) != 4*sys.PtrSize) {
					throw("finalizer out of sync")
				}
				for i := range finptrmask {
					finptrmask[i] = finalizer1[i%len(finalizer1)]
				}
			}
		}
		block := finc
		finc = block.next
		block.next = finq
		finq = block
	}
	f := &finq.fin[finq.cnt]
	atomic.Xadd(&finq.cnt, +1) // Sync with markroots
	f.fn = fn
	f.nret = nret
	f.fint = fint
	f.ot = ot
	f.arg = p
	fingwake = true
	unlock(&finlock)
}
// gcAssistAlloc1 is the part of gcAssistAlloc that runs on the system
// stack. This is a separate function to make it easier to see that
// we're not capturing anything from the user stack, since the user
// stack may move while we're in this function.
//
// gcAssistAlloc1 indicates whether this assist completed the mark
// phase by setting gp.param to non-nil. This can't be communicated on
// the stack since it may move.
//
//go:systemstack
func gcAssistAlloc1(gp *g, scanWork int64) {
	// Clear the flag indicating that this assist completed the
	// mark phase.
	gp.param = nil

	if atomic.Load(&gcBlackenEnabled) == 0 {
		// The gcBlackenEnabled check in malloc races with the
		// store that clears it but an atomic check in every malloc
		// would be a performance hit.
		// Instead we recheck it here on the non-preemptible system
		// stack to determine if we should perform an assist.

		// GC is done, so ignore any remaining debt.
		gp.gcAssistBytes = 0
		return
	}
	// Track time spent in this assist. Since we're on the
	// system stack, this is non-preemptible, so we can
	// just measure start and end time.
	startTime := nanotime()

	decnwait := atomic.Xadd(&work.nwait, -1)
	if decnwait == work.nproc {
		println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
		throw("nwait > work.nprocs")
	}

	// gcDrainN requires the caller to be preemptible.
	casgstatus(gp, _Grunning, _Gwaiting)
	gp.waitreason = "GC assist marking"

	// drain own cached work first in the hopes that it
	// will be more cache friendly.
	gcw := &getg().m.p.ptr().gcw
	workDone := gcDrainN(gcw, scanWork)
	// If we are near the end of the mark phase
	// dispose of the gcw.
	if gcBlackenPromptly {
		gcw.dispose()
	}

	casgstatus(gp, _Gwaiting, _Grunning)

	// Record that we did this much scan work.
	//
	// Back out the number of bytes of assist credit that
	// this scan work counts for. The "1+" is a poor man's
	// round-up, to ensure this adds credit even if
	// assistBytesPerWork is very low.
	gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))

	// If this is the last worker and we ran out of work,
	// signal a completion point.
	incnwait := atomic.Xadd(&work.nwait, +1)
	if incnwait > work.nproc {
		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc, "gcBlackenPromptly=", gcBlackenPromptly)
		throw("work.nwait > work.nproc")
	}

	if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
		// This has reached a background completion point. Set
		// gp.param to a non-nil value to indicate this. It
		// doesn't matter what we set it to (it just has to be
		// a valid pointer).
		gp.param = unsafe.Pointer(gp)
	}
	duration := nanotime() - startTime
	_p_ := gp.m.p.ptr()
	_p_.gcAssistTime += duration
	if _p_.gcAssistTime > gcAssistTimeSlack {
		atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
		_p_.gcAssistTime = 0
	}
}
// gcDrain scans roots and objects in work buffers, blackening grey
// objects until all roots and work buffers have been drained.
//
// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt
// is set. This implies gcDrainNoBlock.
//
// If flags&gcDrainIdle != 0, gcDrain returns when there is other work
// to do. This implies gcDrainNoBlock.
//
// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is
// unable to get more work. Otherwise, it will block until all
// blocking calls are blocked in gcDrain.
//
// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work
// credit to gcController.bgScanCredit every gcCreditSlack units of
// scan work.
//
//go:nowritebarrier
func gcDrain(gcw *gcWork, flags gcDrainFlags) {
	if !writeBarrier.needed {
		throw("gcDrain phase incorrect")
	}

	gp := getg().m.curg
	preemptible := flags&gcDrainUntilPreempt != 0
	blocking := flags&(gcDrainUntilPreempt|gcDrainIdle|gcDrainNoBlock) == 0
	flushBgCredit := flags&gcDrainFlushBgCredit != 0
	idle := flags&gcDrainIdle != 0

	initScanWork := gcw.scanWork
	// idleCheck is the scan work at which to perform the next
	// idle check with the scheduler.
	idleCheck := initScanWork + idleCheckThreshold

	// Drain root marking jobs.
	if work.markrootNext < work.markrootJobs {
		for !(preemptible && gp.preempt) {
			job := atomic.Xadd(&work.markrootNext, +1) - 1
			if job >= work.markrootJobs {
				break
			}
			markroot(gcw, job)
			if idle && pollWork() {
				goto done
			}
		}
	}

	// Drain heap marking jobs.
	for !(preemptible && gp.preempt) {
		// Try to keep work available on the global queue. We used to
		// check if there were waiting workers, but it's better to
		// just keep work available than to make workers wait. In the
		// worst case, we'll do O(log(_WorkbufSize)) unnecessary
		// balances.
		if work.full == 0 {
			gcw.balance()
		}

		var b uintptr
		if blocking {
			b = gcw.get()
		} else {
			b = gcw.tryGetFast()
			if b == 0 {
				b = gcw.tryGet()
			}
		}
		if b == 0 {
			// work barrier reached or tryGet failed.
			break
		}
		scanobject(b, gcw)

		// Flush background scan work credit to the global
		// account if we've accumulated enough locally so
		// mutator assists can draw on it.
		if gcw.scanWork >= gcCreditSlack {
			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
			if flushBgCredit {
				gcFlushBgCredit(gcw.scanWork - initScanWork)
				initScanWork = 0
			}
			idleCheck -= gcw.scanWork
			gcw.scanWork = 0

			if idle && idleCheck <= 0 {
				idleCheck += idleCheckThreshold
				if pollWork() {
					break
				}
			}
		}
	}

	// In blocking mode, write barriers are not allowed after this
	// point because we must preserve the condition that the work
	// buffers are empty.

done:
	// Flush remaining scan work credit.
	if gcw.scanWork > 0 {
		atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
		if flushBgCredit {
			gcFlushBgCredit(gcw.scanWork - initScanWork)
		}
		gcw.scanWork = 0
	}
}
// push adds span s to buffer b. push is safe to call concurrently
// with other push operations, but NOT to call concurrently with pop.
func (b *gcSweepBuf) push(s *mspan) {
	// Obtain our slot.
	cursor := uintptr(atomic.Xadd(&b.index, +1) - 1)
	top, bottom := cursor/gcSweepBlockEntries, cursor%gcSweepBlockEntries

	// Do we need to add a block?
	spineLen := atomic.Loaduintptr(&b.spineLen)
	var block *gcSweepBlock
retry:
	if top < spineLen {
		spine := atomic.Loadp(unsafe.Pointer(&b.spine))
		blockp := add(spine, sys.PtrSize*top)
		block = (*gcSweepBlock)(atomic.Loadp(blockp))
	} else {
		// Add a new block to the spine, potentially growing
		// the spine.
		lock(&b.spineLock)
		// spineLen cannot change until we release the lock,
		// but may have changed while we were waiting.
		spineLen = atomic.Loaduintptr(&b.spineLen)
		if top < spineLen {
			unlock(&b.spineLock)
			goto retry
		}

		if spineLen == b.spineCap {
			// Grow the spine.
			newCap := b.spineCap * 2
			if newCap == 0 {
				newCap = gcSweepBufInitSpineCap
			}
			newSpine := persistentalloc(newCap*sys.PtrSize, sys.CacheLineSize, &memstats.gc_sys)
			if b.spineCap != 0 {
				// Blocks are allocated off-heap, so
				// no write barriers.
				memmove(newSpine, b.spine, b.spineCap*sys.PtrSize)
			}
			// Spine is allocated off-heap, so no write barrier.
			atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
			b.spineCap = newCap
			// We can't immediately free the old spine
			// since a concurrent push with a lower index
			// could still be reading from it. We let it
			// leak because even a 1TB heap would waste
			// less than 2MB of memory on old spines. If
			// this is a problem, we could free old spines
			// during STW.
		}

		// Allocate a new block and add it to the spine.
		block = (*gcSweepBlock)(persistentalloc(unsafe.Sizeof(gcSweepBlock{}), sys.CacheLineSize, &memstats.gc_sys))
		blockp := add(b.spine, sys.PtrSize*top)
		// Blocks are allocated off-heap, so no write barrier.
		atomic.StorepNoWB(blockp, unsafe.Pointer(block))
		atomic.Storeuintptr(&b.spineLen, spineLen+1)
		unlock(&b.spineLock)
	}

	// We have a block. Insert the span.
	block.spans[bottom] = s
}
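The push path separates two concerns: claiming a slot (one atomic add on b.index, then cursor/entries and cursor%entries to pick the block and the slot within it) and growing the spine of block pointers under spineLock. Below is a rough standalone sketch of the same two-level layout; the twoLevelBuf type, the atomic.Value spine, and ordinary Go slices are assumptions used in place of the runtime's off-heap blocks, StorepNoWB stores, and the real gcSweepBlockEntries constant (the value below is illustrative).

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

const blockEntries = 512 // stand-in for gcSweepBlockEntries; value assumed for illustration

// twoLevelBuf is a hypothetical push-only buffer mirroring gcSweepBuf's
// layout: an atomically claimed cursor picks a (block, slot) pair, and the
// spine of block pointers grows under a lock.
type twoLevelBuf struct {
	index     uint32       // next free slot, claimed with an atomic add
	spineLock sync.Mutex   // serializes spine growth only
	spine     atomic.Value // holds a []*[blockEntries]int, replaced wholesale when grown
}

func (b *twoLevelBuf) push(v int) {
	cursor := uintptr(atomic.AddUint32(&b.index, 1) - 1)
	top, bottom := cursor/blockEntries, cursor%blockEntries

	for {
		spine, _ := b.spine.Load().([]*[blockEntries]int)
		if top < uintptr(len(spine)) {
			// Cursors are unique, so no other push writes this slot, and
			// block pointers are shared by old and new spines alike.
			spine[top][bottom] = v
			return
		}
		// Need a new block: grow the spine under the lock, then retry
		// the fast path above.
		b.spineLock.Lock()
		grown, _ := b.spine.Load().([]*[blockEntries]int)
		for top >= uintptr(len(grown)) {
			grown = append(append([]*[blockEntries]int(nil), grown...), new([blockEntries]int))
		}
		b.spine.Store(grown)
		b.spineLock.Unlock()
	}
}

func main() {
	var b twoLevelBuf
	var wg sync.WaitGroup
	for g := 0; g < 8; g++ {
		wg.Add(1)
		go func(base int) {
			defer wg.Done()
			for i := 0; i < 1000; i++ {
				b.push(base*1000 + i)
			}
		}(g)
	}
	wg.Wait()
	fmt.Println("pushed", atomic.LoadUint32(&b.index), "items")
}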
// notifyListAdd adds the caller to a notify list such that it can receive
// notifications. The caller must eventually call notifyListWait to wait for
// such a notification, passing the returned ticket number.
//go:linkname notifyListAdd sync.runtime_notifyListAdd
func notifyListAdd(l *notifyList) uint32 {
	// This may be called concurrently, for example, when called from
	// sync.Cond.Wait while holding a RWMutex in read mode.
	return atomic.Xadd(&l.wait, 1) - 1
}
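notifyListAdd is a pure fetch-and-increment: the pre-add value of l.wait becomes the caller's ticket, and concurrent callers are guaranteed distinct, dense tickets. A quick standalone illustration of the idiom, with sync/atomic's AddUint32 standing in for the runtime-internal atomic.Xadd:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	// AddUint32(&wait, 1) - 1 plays the role of atomic.Xadd(&l.wait, 1) - 1:
	// each concurrent caller gets a unique ticket 0, 1, 2, ...
	var wait uint32
	var mu sync.Mutex
	seen := make(map[uint32]bool)
	var wg sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			ticket := atomic.AddUint32(&wait, 1) - 1
			mu.Lock()
			seen[ticket] = true
			mu.Unlock()
		}()
	}
	wg.Wait()
	fmt.Println("distinct tickets issued:", len(seen)) // 100
}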
// gcDrain scans roots and objects in work buffers, blackening grey
// objects until all roots and work buffers have been drained.
//
// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt
// is set. This implies gcDrainNoBlock.
//
// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is
// unable to get more work. Otherwise, it will block until all
// blocking calls are blocked in gcDrain.
//
// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work
// credit to gcController.bgScanCredit every gcCreditSlack units of
// scan work.
//
//go:nowritebarrier
func gcDrain(gcw *gcWork, flags gcDrainFlags) {
	if !writeBarrier.needed {
		throw("gcDrain phase incorrect")
	}

	gp := getg()
	preemptible := flags&gcDrainUntilPreempt != 0
	blocking := flags&(gcDrainUntilPreempt|gcDrainNoBlock) == 0
	flushBgCredit := flags&gcDrainFlushBgCredit != 0

	// Drain root marking jobs.
	if work.markrootNext < work.markrootJobs {
		for blocking || !gp.preempt {
			job := atomic.Xadd(&work.markrootNext, +1) - 1
			if job >= work.markrootJobs {
				break
			}
			// TODO: Pass in gcw.
			markroot(job)
		}
	}

	initScanWork := gcw.scanWork

	// Drain heap marking jobs.
	for !(preemptible && gp.preempt) {
		// If another proc wants a pointer, give it some.
		if work.nwait > 0 && work.full == 0 {
			gcw.balance()
		}

		var b uintptr
		if blocking {
			b = gcw.get()
		} else {
			b = gcw.tryGet()
		}
		if b == 0 {
			// work barrier reached or tryGet failed.
			break
		}

		// If the current wbuf is filled by the scan a new wbuf might be
		// returned that could possibly hold only a single object. This
		// could result in each iteration draining only a single object
		// out of the wbuf passed in + a single object placed
		// into an empty wbuf in scanobject so there could be
		// a performance hit as we keep fetching fresh wbufs.
		scanobject(b, gcw)

		// Flush background scan work credit to the global
		// account if we've accumulated enough locally so
		// mutator assists can draw on it.
		if gcw.scanWork >= gcCreditSlack {
			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
			if flushBgCredit {
				gcFlushBgCredit(gcw.scanWork - initScanWork)
				initScanWork = 0
			}
			gcw.scanWork = 0
		}
	}

	// In blocking mode, write barriers are not allowed after this
	// point because we must preserve the condition that the work
	// buffers are empty.

	// Flush remaining scan work credit.
	if gcw.scanWork > 0 {
		atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
		if flushBgCredit {
			gcFlushBgCredit(gcw.scanWork - initScanWork)
		}
		gcw.scanWork = 0
	}
}
func parfordo(desc *parfor) {
	// Obtain 0-based thread index.
	tid := atomic.Xadd(&desc.thrseq, 1) - 1
	if tid >= desc.nthr {
		print("tid=", tid, " nthr=", desc.nthr, "\n")
		throw("parfor: invalid tid")
	}

	// If single-threaded, just execute the for serially.
	body := desc.body
	if desc.nthr == 1 {
		for i := uint32(0); i < desc.cnt; i++ {
			body(desc, i)
		}
		return
	}

	me := &desc.thr[tid]
	mypos := &me.pos
	for {
		for {
			// While there is local work,
			// bump low index and execute the iteration.
			pos := atomic.Xadd64(mypos, 1)
			begin := uint32(pos) - 1
			end := uint32(pos >> 32)
			if begin < end {
				body(desc, begin)
				continue
			}
			break
		}

		// Out of work, need to steal something.
		idle := false
		for try := uint32(0); ; try++ {
			// If we don't see any work for long enough,
			// increment the done counter...
			if try > desc.nthr*4 && !idle {
				idle = true
				atomic.Xadd(&desc.done, 1)
			}

			// ...if all threads have incremented the counter,
			// we are done.
			extra := uint32(0)
			if !idle {
				extra = 1
			}
			if desc.done+extra == desc.nthr {
				if !idle {
					atomic.Xadd(&desc.done, 1)
				}
				goto exit
			}

			// Choose a random victim for stealing.
			var begin, end uint32
			victim := fastrand1() % (desc.nthr - 1)
			if victim >= tid {
				victim++
			}
			victimpos := &desc.thr[victim].pos
			for {
				// See if it has any work.
				pos := atomic.Load64(victimpos)
				begin = uint32(pos)
				end = uint32(pos >> 32)
				if begin+1 >= end {
					end = 0
					begin = end
					break
				}
				if idle {
					atomic.Xadd(&desc.done, -1)
					idle = false
				}
				begin2 := begin + (end-begin)/2
				newpos := uint64(begin) | uint64(begin2)<<32
				if atomic.Cas64(victimpos, pos, newpos) {
					begin = begin2
					break
				}
			}
			if begin < end {
				// Has successfully stolen some work.
				if idle {
					throw("parfor: should not be idle")
				}
				atomic.Store64(mypos, uint64(begin)|uint64(end)<<32)
				me.nsteal++
				me.nstealcnt += uint64(end) - uint64(begin)
				break
			}

			// Backoff.
			if try < desc.nthr {
				// nothing
			} else if try < 4*desc.nthr {
				me.nprocyield++
				procyield(20)
			} else if !desc.wait {
				// If a caller asked not to wait for the others, exit now
				// (assume that most work is already done at this point).
				if !idle {
					atomic.Xadd(&desc.done, 1)
				}
				goto exit
			} else if try < 6*desc.nthr {
				me.nosyield++
				osyield()
			} else {
				me.nsleep++
				usleep(1)
			}
		}
	}
exit:
	atomic.Xadd64(&desc.nsteal, int64(me.nsteal))
	atomic.Xadd64(&desc.nstealcnt, int64(me.nstealcnt))
	atomic.Xadd64(&desc.nprocyield, int64(me.nprocyield))
	atomic.Xadd64(&desc.nosyield, int64(me.nosyield))
	atomic.Xadd64(&desc.nsleep, int64(me.nsleep))
	me.nsteal = 0
	me.nstealcnt = 0
	me.nprocyield = 0
	me.nosyield = 0
	me.nsleep = 0
}
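A detail worth isolating from parfordo is how each thread's work range lives in a single uint64 (low 32 bits = next index, high 32 bits = end), so the owner claims an iteration with one Xadd64 while a thief splits off the upper half of the remainder with a Cas64. A minimal standalone sketch of that packing, using sync/atomic in place of the runtime's internal atomic package; all names here are illustrative.

package main

import (
	"fmt"
	"sync/atomic"
)

// pack stores a half-open range [begin, end) in one word:
// low 32 bits = begin (next index to run), high 32 bits = end.
func pack(begin, end uint32) uint64 { return uint64(begin) | uint64(end)<<32 }

func unpack(pos uint64) (begin, end uint32) { return uint32(pos), uint32(pos >> 32) }

// claim is the owner's fast path: one atomic add bumps begin and
// reports which iteration (if any) the caller now owns.
func claim(pos *uint64) (i uint32, ok bool) {
	p := atomic.AddUint64(pos, 1)
	begin, end := uint32(p)-1, uint32(p>>32)
	return begin, begin < end
}

// steal is the thief's path: split off the upper half of the victim's
// remaining range with a compare-and-swap, as parfordo does.
func steal(victim *uint64) (begin, end uint32, ok bool) {
	for {
		old := atomic.LoadUint64(victim)
		b, e := unpack(old)
		if b+1 >= e {
			return 0, 0, false // nothing worth stealing
		}
		mid := b + (e-b)/2
		if atomic.CompareAndSwapUint64(victim, old, pack(b, mid)) {
			return mid, e, true
		}
	}
}

func main() {
	pos := pack(0, 10)
	if i, ok := claim(&pos); ok {
		fmt.Println("owner runs iteration", i) // 0
	}
	if b, e, ok := steal(&pos); ok {
		fmt.Println("thief takes range", b, "to", e) // roughly the upper half of what remains
	}
	fmt.Println("owner keeps", unpack(pos)) // begin advanced, end lowered to the split point
}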
// gcAssistAlloc performs GC work to make gp's assist debt positive.
// gp must be the calling user goroutine.
//
// This must be called with preemption enabled.
//go:nowritebarrier
func gcAssistAlloc(gp *g) {
	// Don't assist in non-preemptible contexts. These are
	// generally fragile and won't allow the assist to block.
	if getg() == gp.m.g0 {
		return
	}
	if mp := getg().m; mp.locks > 0 || mp.preemptoff != "" {
		return
	}

	// Compute the amount of scan work we need to do to make the
	// balance positive. We over-assist to build up credit for
	// future allocations and amortize the cost of assisting.
	debtBytes := -gp.gcAssistBytes + gcOverAssistBytes
	scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes))

retry:
	// Steal as much credit as we can from the background GC's
	// scan credit. This is racy and may drop the background
	// credit below 0 if two mutators steal at the same time. This
	// will just cause steals to fail until credit is accumulated
	// again, so in the long run it doesn't really matter, but we
	// do have to handle the negative credit case.
	bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
	stolen := int64(0)
	if bgScanCredit > 0 {
		if bgScanCredit < scanWork {
			stolen = bgScanCredit
			gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen))
		} else {
			stolen = scanWork
			gp.gcAssistBytes += debtBytes
		}
		atomic.Xaddint64(&gcController.bgScanCredit, -stolen)

		scanWork -= stolen

		if scanWork == 0 {
			// We were able to steal all of the credit we
			// needed.
			return
		}
	}

	// Perform assist work
	completed := false
	systemstack(func() {
		if atomic.Load(&gcBlackenEnabled) == 0 {
			// The gcBlackenEnabled check in malloc races with the
			// store that clears it but an atomic check in every malloc
			// would be a performance hit.
			// Instead we recheck it here on the non-preemptible system
			// stack to determine if we should perform an assist.

			// GC is done, so ignore any remaining debt.
			gp.gcAssistBytes = 0
			return
		}
		// Track time spent in this assist. Since we're on the
		// system stack, this is non-preemptible, so we can
		// just measure start and end time.
		startTime := nanotime()

		decnwait := atomic.Xadd(&work.nwait, -1)
		if decnwait == work.nproc {
			println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
			throw("nwait > work.nprocs")
		}

		// drain own cached work first in the hopes that it
		// will be more cache friendly.
		gcw := &getg().m.p.ptr().gcw
		workDone := gcDrainN(gcw, scanWork)
		// If we are near the end of the mark phase
		// dispose of the gcw.
		if gcBlackenPromptly {
			gcw.dispose()
		}

		// Record that we did this much scan work.
		//
		// Back out the number of bytes of assist credit that
		// this scan work counts for. The "1+" is a poor man's
		// round-up, to ensure this adds credit even if
		// assistBytesPerWork is very low.
		gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))

		// If this is the last worker and we ran out of work,
		// signal a completion point.
		incnwait := atomic.Xadd(&work.nwait, +1)
		if incnwait > work.nproc {
			println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc, "gcBlackenPromptly=", gcBlackenPromptly)
			throw("work.nwait > work.nproc")
		}

		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
			// This has reached a background completion
			// point.
			completed = true
		}
		duration := nanotime() - startTime
		_p_ := gp.m.p.ptr()
		_p_.gcAssistTime += duration
		if _p_.gcAssistTime > gcAssistTimeSlack {
			atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
			_p_.gcAssistTime = 0
		}
	})

	if completed {
		gcMarkDone()
	}

	if gp.gcAssistBytes < 0 {
		// We were unable to steal enough credit or perform
		// enough work to pay off the assist debt. We need to
		// do one of these before letting the mutator allocate
		// more to prevent over-allocation.
		//
		// If this is because we were preempted, reschedule
		// and try some more.
		if gp.preempt {
			Gosched()
			goto retry
		}

		// Add this G to an assist queue and park. When the GC
		// has more background credit, it will satisfy queued
		// assists before flushing to the global credit pool.
		//
		// Note that this does *not* get woken up when more
		// work is added to the work list. The theory is that
		// there wasn't enough work to do anyway, so we might
		// as well let background marking take care of the
		// work that is available.
		lock(&work.assistQueue.lock)

		// If the GC cycle is over, just return. This is the
		// likely path if we completed above. We do this
		// under the lock to prevent a GC cycle from ending
		// between this check and queuing the assist.
		if atomic.Load(&gcBlackenEnabled) == 0 {
			unlock(&work.assistQueue.lock)
			return
		}

		oldHead, oldTail := work.assistQueue.head, work.assistQueue.tail
		if oldHead == 0 {
			work.assistQueue.head.set(gp)
		} else {
			oldTail.ptr().schedlink.set(gp)
		}
		work.assistQueue.tail.set(gp)
		gp.schedlink.set(nil)

		// Recheck for background credit now that this G is in
		// the queue, but can still back out. This avoids a
		// race in case background marking has flushed more
		// credit since we checked above.
		if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
			work.assistQueue.head = oldHead
			work.assistQueue.tail = oldTail
			if oldTail != 0 {
				oldTail.ptr().schedlink.set(nil)
			}
			unlock(&work.assistQueue.lock)
			goto retry
		}

		// Park for real.
		goparkunlock(&work.assistQueue.lock, "GC assist wait", traceEvGoBlock, 2)

		// At this point either background GC has satisfied
		// this G's assist debt, or the GC cycle is over.
	}
}