func readAllFiles(numReaders int, textFileCh chan string, fileCh chan *dm.File, ctrl GoRtnCntrl) { defer ctrl.wg.Done() nestedCtrl := ctrl nestedCtrl.wg = &sync.WaitGroup{} numReaders = dm.MaxInt(1, numReaders) for i := 0; i < numReaders; i++ { nestedCtrl.wg.Add(1) go readFiles(textFileCh, fileCh, nestedCtrl) } nestedCtrl.wg.Wait() close(fileCh) }
func expandArgsToTextFiles( args []string, textFileCh chan string, numFilters int, ftiMgr *FileTypeInfoManager, ctrl GoRtnCntrl) { defer ctrl.wg.Done() nestedCtrl := ctrl nestedCtrl.wg = &sync.WaitGroup{} candidateCh := make(chan string, runtime.NumCPU()) nestedCtrl.wg.Add(1) go expandArgs(args, candidateCh, nestedCtrl) numFilters = dm.MaxInt(1, numFilters) for i := 0; i < numFilters; i++ { nestedCtrl.wg.Add(1) go filterCandidates(candidateCh, textFileCh, ftiMgr, nestedCtrl) } nestedCtrl.wg.Wait() close(textFileCh) }
func MakeFileTypeInfoManager(ctrl GoRtnCntrl) *FileTypeInfoManager { p := &FileTypeInfoManager{ requestCh: make(chan *FileTypeInfoRequest, runtime.NumCPU()), ctrl: ctrl, } stopCh := ctrl.stopCh sendResponse := func(req *FileTypeInfoRequest, isText bool, updateCh chan *FileTypeInfoUpdate) { resp := &FileTypeInfoResponse{ fp: req.fp, isText: isText, } var update *FileTypeInfoUpdate if updateCh != nil { update = &FileTypeInfoUpdate{ ext: req.ext, isText: isText, } } respCh := req.respCh for !(respCh == nil && updateCh == nil) { // Do whichever is first possible. select { case <-stopCh: return case respCh <- resp: respCh = nil case updateCh <- update: updateCh = nil } } } // Create some go routines that will handle actually reading files // to determine if they are text or not. checkFileContentCh := make(chan *FileTypeInfoRequest, runtime.NumCPU()) updateCh := make(chan *FileTypeInfoUpdate, runtime.NumCPU()) checkFileContentGR := func() { defer ctrl.wg.Done() for { select { case <-stopCh: return case req, ok := <-checkFileContentCh: if !ok { // No more requests. return } isText := fileContainsText(req.fp) sendResponse(req, isText, updateCh) } } } numGoRoutines := dm.MaxInt(1, runtime.NumCPU()) for ; numGoRoutines > 0; numGoRoutines-- { ctrl.wg.Add(1) go checkFileContentGR() } // And create a go routine that will handle requests for file type. ctrl.wg.Add(1) go func() { defer ctrl.wg.Done() textTypes := make(map[string]int) nonTextTypes := make(map[string]int) defer func() { p.textTypes = textTypes p.nonTextTypes = nonTextTypes }() guesstimateType := func(ext string) (isText, isUnknown bool) { if ext == "" { isUnknown = true return } if v, ok := knownExtensions[ext]; ok { isText = v return } // Take a guess based on past files. ntt, tt := nonTextTypes[ext], textTypes[ext] if (ntt == 0 && tt > 10) || (tt-ntt > 20) { // Probably text. glog.V(1).Infof("Past evidence (%d, %d) indicates a text extension: %s", ntt, tt, ext) isText = true return } else if (tt == 0 && ntt > 10) || (ntt-tt > 20) { // Probably binary. glog.V(1).Infof("Past evidence (%d, %d) indicates a binary extension: %s", ntt, tt, ext) return } isText, isUnknown = fileExtImpliesText(ext) return } for { select { case <-stopCh: return case update := <-updateCh: if update.isText { textTypes[update.ext]++ } else { nonTextTypes[update.ext]++ } case req, ok := <-p.requestCh: if !ok { return } isText, isUnknown := guesstimateType(req.ext) if isUnknown { // Ask another go routine to read the file in order to figure out // the answer. That routine will take care of providing the answer // to select { case <-stopCh: return case checkFileContentCh <- req: } } else { sendResponse(req, isText, nil) } } } }() return p }
func makeLineInstance(file File, start FileOffset, lineNumber LineNo, lineBytes []byte) *lineInstance { length := len(lineBytes) // There are around 20 space characters in Unicode, but we're only handling // ASCII tab and space characters here. n := 0 for ; n < length; n++ { if lineBytes[n] != '\t' { break } } tabCount := n for ; n < length; n++ { if lineBytes[n] != ' ' { break } } spaceCount := n - tabCount // Content starts after leading whitespace, any mixture of spaces and tabs. for ; n < length; n++ { b := lineBytes[n] if b != ' ' && b != '\t' { break } } contentIndex := n if contentIndex > tabCount+spaceCount { // Line doesn't start with "well-formed" indentation, so we can't as readily // compare indentations. So, mark this line such that we can detect this. tabCount = 255 spaceCount = 255 } // Search from the end for the first non-whitespace character. n = length - 1 TrailingLoop: for ; n >= contentIndex; n-- { switch lineBytes[n] { case ' ', '\n', '\r', '\t', '\f', '\v': // Whitespace. continue default: break TrailingLoop } } contendEndIndex := dm.MaxInt(contentIndex, n) + 1 contentBytes := lineBytes[contentIndex:contendEndIndex] contentLength := len(contentBytes) lineHash, contentHash := theLineHasher.Compute2(lineBytes, contentBytes) p := &lineInstance{ file: file, lineNumber: lineNumber, start: start, end: start + FileOffset(length), contentStart: start + FileOffset(contentIndex), contentLength: FileOffset(contentLength), hash: HashType(lineHash), contentHash: HashType(contentHash), leadingTabs: uint8(dm.MinInt(255, tabCount)), leadingSpaces: uint8(dm.MinInt(255, spaceCount)), lineType: normalLine, } if contentLength == 0 || dm.ComputeIsProbablyCommon(contentBytes) { p.lineType = wellKnownLine } return p }