// WriteByteOffs generates a kv file with key-value pairs represented as a
// byte buffer and int slices of key offsets, key ends, value offsets, and
// value ends.
func WriteByteOffs(fp sophie.FsPath, buffer []byte, keyOffs, keyEnds, valOffs, valEnds []int) error {
	writer, err := fp.Create()
	if err != nil {
		return err
	}
	defer writer.Close()

	for i, keyOff := range keyOffs {
		keyEnd, valOff, valEnd := keyEnds[i], valOffs[i], valEnds[i]
		// Each record is written as: varint key length, key bytes,
		// varint value length, value bytes.
		if err := sophie.VInt(keyEnd - keyOff).WriteTo(writer); err != nil {
			return err
		}
		if _, err := writer.Write(buffer[keyOff:keyEnd]); err != nil {
			return err
		}
		if err := sophie.VInt(valEnd - valOff).WriteTo(writer); err != nil {
			return err
		}
		if _, err := writer.Write(buffer[valOff:valEnd]); err != nil {
			return err
		}
	}
	return nil
}
// ReadAsByteOffs reads a kv file as a byte buffer and int slices of key
// offsets, key ends, value offsets, and value ends.
func ReadAsByteOffs(fp sophie.FsPath) (buffer villa.ByteSlice, keyOffs, keyEnds, valOffs, valEnds villa.IntSlice, err error) {
	fi, err := fp.Stat()
	if err != nil {
		return nil, nil, nil, nil, nil, err
	}
	reader, err := fp.Open()
	if err != nil {
		return nil, nil, nil, nil, nil, err
	}
	defer reader.Close()

	buffer = make([]byte, fi.Size())
	// io.ReadFull keeps reading until the whole buffer is filled; a single
	// Read call may legitimately return fewer bytes without an error.
	if n, err := io.ReadFull(reader, buffer); err != nil {
		return nil, nil, nil, nil, nil, fmt.Errorf(
			"expected %d bytes, but only read %d bytes: %v", len(buffer), n, err)
	}
	buf := countReadCloser(villa.NewPByteSlice(buffer))
	for buf.Pos < int64(len(buffer)) {
		var l sophie.VInt
		if err := (&l).ReadFrom(buf, -1); err != nil {
			log.Printf("Failed to read key-length: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		keyOffs = append(keyOffs, int(buf.Pos))
		if _, err := buf.Skip(int64(l)); err != nil {
			log.Printf("Failed to skip key: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		keyEnds = append(keyEnds, int(buf.Pos))
		if err := (&l).ReadFrom(buf, -1); err != nil {
			log.Printf("Failed to read value-length: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		valOffs = append(valOffs, int(buf.Pos))
		if _, err := buf.Skip(int64(l)); err != nil {
			log.Printf("Failed to skip value: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		valEnds = append(valEnds, int(buf.Pos))
	}
	return
}
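// demoRoundTrip is a minimal sketch (not part of the original file) showing
// how WriteByteOffs and ReadAsByteOffs fit together. The function name and
// the path "/tmp/demo.kv" are hypothetical; it assumes the same sophie
// import used above plus "fmt".
func demoRoundTrip() error {
	fp := sophie.FsPath{Fs: sophie.LocalFS, Path: "/tmp/demo.kv"}
	// One record laid out contiguously in a single buffer: key "k0", value "v0".
	buffer := []byte("k0v0")
	keyOffs, keyEnds := []int{0}, []int{2}
	valOffs, valEnds := []int{2}, []int{4}
	if err := WriteByteOffs(fp, buffer, keyOffs, keyEnds, valOffs, valEnds); err != nil {
		return err
	}
	// ReadAsByteOffs returns offsets into the freshly read buffer (which also
	// contains the varint length prefixes), so records are accessed through
	// the returned slices rather than the offsets used for writing.
	buf, kOffs, kEnds, vOffs, vEnds, err := ReadAsByteOffs(fp)
	if err != nil {
		return err
	}
	for i := range kOffs {
		fmt.Printf("key=%q value=%q\n", buf[kOffs[i]:kEnds[i]], buf[vOffs[i]:vEnds[i]])
	}
	return nil
}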
func main() {
	defer func() {
		tmpFn := villa.Path("/tmp/gddo")
		if err := tmpFn.RemoveAll(); err != nil {
			log.Printf("Delete %v failed: %v", tmpFn, err)
		}
	}()

	singlePackage := ""
	singleETag := ""
	flag.StringVar(&singlePackage, "pkg", singlePackage, "Crawling single package")
	flag.StringVar(&singleETag, "etag", singleETag, "ETag for single package crawling")
	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if singlePackage != "" {
		log.Printf("Crawling single package %s ...", singlePackage)
		p, err := gcse.CrawlPackage(httpClient, singlePackage, singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v", singlePackage, err)
		} else {
			fmtp.Printfln("Package %s: %+v", singlePackage, p)
		}
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: gcse.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(gcse.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(gcse.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(gcse.FnToCrawl)

	fpNewDocs := fpCrawler.Join(gcse.FnNewDocs)
	fpNewDocs.Remove()

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(gcse.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(gcse.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	syncDatabases()
	log.Println("crawler stopped...")
}
func main() {
	runtime.GOMAXPROCS(2)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	if db, err := bh.Open(configs.DataRoot.Join("filecache.bolt").S(), 0644, nil); err == nil {
		log.Print("Using file cache!")
		gcse.GithubSpider.FileCache = spider.BoltFileCache{
			DB:         db,
			IncCounter: bi.Inc,
		}
	} else {
		log.Printf("Open file cache failed: %v", err)
	}

	cleanTempDir()
	defer cleanTempDir()

	singlePackage := flag.String("pkg", "", "Crawling a single package")
	singleETag := flag.String("etag", "", "ETag for the single package crawling")
	singlePerson := flag.String("person", "", "Crawling a single person")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if *singlePerson != "" {
		log.Printf("Crawling single person %s ...", *singlePerson)
		p, err := gcse.CrawlPerson(httpClient, *singlePerson)
		if err != nil {
			fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err)
		} else {
			fmtp.Printfln("Person %s: %+v", *singlePerson, p)
		}
	}
	if *singlePackage != "" {
		log.Printf("Crawling single package %s ...", *singlePackage)
		p, flds, err := gcse.CrawlPackage(httpClient, *singlePackage, *singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v, folders: %v", *singlePackage, err, flds)
		} else {
			fmtp.Printfln("Package %s: %+v, folders: %v", *singlePackage, p, flds)
		}
	}
	if *singlePackage != "" || *singlePerson != "" {
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: configs.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(configs.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)

	fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
	fpNewDocs.Remove()

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd

	bi.Flush()
	bi.Process()
	syncDatabases()

	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may have failed, package: %v, person: %v", errPkg, errPsn)
	}

	log.Println("crawler stopped...")
}