Example #1
// WriteByteOffs writes a kv file from key-value pairs represented as a byte
// buffer plus int slices of key offsets, key ends, value offsets, and
// value ends.
func WriteByteOffs(fp sophie.FsPath, buffer []byte,
	keyOffs, keyEnds, valOffs, valEnds []int) error {
	writer, err := fp.Create()
	if err != nil {
		return err
	}
	defer writer.Close()

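	// Each record is a VInt-encoded key length, the key bytes, a
	// VInt-encoded value length, and then the value bytes.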
	for i, keyOff := range keyOffs {
		keyEnd, valOff, valEnd := keyEnds[i], valOffs[i], valEnds[i]
		if err := sophie.VInt(keyEnd - keyOff).WriteTo(writer); err != nil {
			return err
		}
		if _, err := writer.Write(buffer[keyOff:keyEnd]); err != nil {
			return err
		}
		if err := sophie.VInt(valEnd - valOff).WriteTo(writer); err != nil {
			return err
		}
		if _, err := writer.Write(buffer[valOff:valEnd]); err != nil {
			return err
		}
	}
	return nil
}
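A minimal usage sketch, assuming the sophie package from the example above; the path and the packed sample data below are hypothetical, not from the original example.

// demoWrite shows one possible call of WriteByteOffs (hypothetical data).
func demoWrite() {
	// Two packed pairs: "key1"->"val1" at [0,4)/[4,8),
	// "key2"->"value2" at [8,12)/[12,18).
	buffer := []byte("key1val1key2value2")
	fp := sophie.FsPath{Fs: sophie.LocalFS, Path: "/tmp/example.kv"}
	if err := WriteByteOffs(fp, buffer,
		[]int{0, 8},  // key offsets
		[]int{4, 12}, // key ends
		[]int{4, 12}, // value offsets
		[]int{8, 18}, // value ends
	); err != nil {
		log.Fatalf("WriteByteOffs failed: %v", err)
	}
}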
Example #2
// ReadAsByteOffs reads a kv file into a byte buffer and int slices of key
// offsets, key ends, value offsets, and value ends.
func ReadAsByteOffs(fp sophie.FsPath) (buffer villa.ByteSlice,
	keyOffs, keyEnds, valOffs, valEnds villa.IntSlice, err error) {
	fi, err := fp.Stat()
	if err != nil {
		return nil, nil, nil, nil, nil, err
	}

	reader, err := fp.Open()
	if err != nil {
		return nil, nil, nil, nil, nil, err
	}
	defer reader.Close()

	buffer = make([]byte, fi.Size())
	if n, err := reader.Read(buffer); n != len(buffer) || err != nil {
		if err != nil {
			return nil, nil, nil, nil, nil, err
		}
		return nil, nil, nil, nil, nil, errors.New(fmt.Sprintf(
			"Expected %d bytes, but only read %d bytes", len(buffer), n))
	}
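	// countReadCloser appears to wrap the reader so that buf.Pos tracks the
	// current byte position, which is what the offset slices record.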
	buf := countReadCloser(villa.NewPByteSlice(buffer))
	for buf.Pos < int64(len(buffer)) {
		var l sophie.VInt
		if err := (&l).ReadFrom(buf, -1); err != nil {
			log.Printf("Failed to read key-lenth: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		keyOffs = append(keyOffs, int(buf.Pos))
		if _, err := buf.Skip(int64(l)); err != nil {
			log.Printf("Failed to skip key: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		keyEnds = append(keyEnds, int(buf.Pos))
		if err := (&l).ReadFrom(buf, -1); err != nil {
			log.Printf("Failed to read value-lenth: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		valOffs = append(valOffs, int(buf.Pos))
		if _, err := buf.Skip(int64(l)); err != nil {
			log.Printf("Failed to skip value: %v", err)
			return nil, nil, nil, nil, nil, sophie.ErrBadFormat
		}
		valEnds = append(valEnds, int(buf.Pos))
	}
	return
}
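A matching read-side sketch: each pair is recovered by slicing the returned buffer with the corresponding offsets (the fp from demoWrite above is assumed).

// demoRead prints the stored pairs back out.
func demoRead(fp sophie.FsPath) {
	buffer, keyOffs, keyEnds, valOffs, valEnds, err := ReadAsByteOffs(fp)
	if err != nil {
		log.Fatalf("ReadAsByteOffs failed: %v", err)
	}
	for i := range keyOffs {
		key := buffer[keyOffs[i]:keyEnds[i]]
		value := buffer[valOffs[i]:valEnds[i]]
		fmt.Printf("%s -> %s\n", key, value)
	}
}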
Example #3
func main() {
	defer func() {
		tmpFn := villa.Path("/tmp/gddo")
		if err := tmpFn.RemoveAll(); err != nil {
			log.Printf("Delete %v failed: %v", tmpFn, err)
		}
	}()

	singlePackge := ""
	singleETag := ""
	flag.StringVar(&singlePackge, "pkg", singlePackge, "Crawling single package")
	flag.StringVar(&singleETag, "etag", singleETag, "ETag for single package crawling")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if singlePackge != "" {
		log.Printf("Crawling single package %s ...", singlePackge)
		p, err := gcse.CrawlPackage(httpClient, singlePackge, singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failured: %v", singlePackge, err)
		} else {
			fmtp.Printfln("Package %s: %+v", singlePackge, p)
		}
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: gcse.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(gcse.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(gcse.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(gcse.FnToCrawl)

	fpNewDocs := fpCrawler.Join(gcse.FnNewDocs)
	fpNewDocs.Remove()

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(gcse.FnPackage), fpNewDocs,
		pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(gcse.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may failed, package: %v, person: %v",
			errPkg, errPsn)
	}

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	syncDatabases()
	log.Println("crawler stopped...")
}
Example #4
func main() {
	runtime.GOMAXPROCS(2)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

	if db, err := bh.Open(configs.DataRoot.Join("filecache.bolt").S(), 0644, nil); err == nil {
		log.Print("Using file cache!")
		gcse.GithubSpider.FileCache = spider.BoltFileCache{
			DB:         db,
			IncCounter: bi.Inc,
		}
	} else {
		log.Printf("Open file cache failed: %v", err)
	}

	cleanTempDir()
	defer cleanTempDir()

	singlePackage := flag.String("pkg", "", "Crawling a single package")
	singleETag := flag.String("etag", "", "ETag for the single package crawling")
	singlePerson := flag.String("person", "", "Crawling a single person")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if *singlePerson != "" {
		log.Printf("Crawling single person %s ...", *singlePerson)
		p, err := gcse.CrawlPerson(httpClient, *singlePerson)
		if err != nil {
			fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err)
		} else {
			fmtp.Printfln("Person %s: %+v", *singlePerson, p)
		}
	}
	if *singlePackage != "" {
		log.Printf("Crawling single package %s ...", *singlePackage)
		p, flds, err := gcse.CrawlPackage(httpClient, *singlePackage, *singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v, folders: %v", *singlePackage, err, flds)
		} else {
			fmtp.Printfln("Package %s: %+v, folders: %v", *singlePackage, p, flds)
		}
	}
	if *singlePackage != "" || *singlePerson != "" {
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: configs.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(configs.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)

	fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
	fpNewDocs.Remove()

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	bi.Flush()
	bi.Process()
	syncDatabases()
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may failed, package: %v, person: %v", errPkg, errPsn)
	}
	log.Println("crawler stopped...")
}