Example #1
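// main runs one crawler pass. With -pkg (and optionally -etag) it crawls a
// single package and exits; otherwise it loads the crawler DB, crawls the
// scheduled packages and persons concurrently, processes the collected
// imports and syncs the databases before exiting.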
func main() {
	defer func() {
		tmpFn := villa.Path("/tmp/gddo")
		if err := tmpFn.RemoveAll(); err != nil {
			log.Printf("Delete %v failed: %v", tmpFn, err)
		}
	}()

	singlePackge := ""
	singleETag := ""
	flag.StringVar(&singlePackge, "pkg", singlePackge, "Crawling single package")
	flag.StringVar(&singleETag, "etag", singleETag, "ETag for single package crawling")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if singlePackge != "" {
		log.Printf("Crawling single package %s ...", singlePackge)
		p, err := gcse.CrawlPackage(httpClient, singlePackge, singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failured: %v", singlePackge, err)
		} else {
			fmtp.Printfln("Package %s: %+v", singlePackge, p)
		}
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: gcse.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(gcse.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(gcse.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(gcse.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(gcse.FnToCrawl)

	fpNewDocs := fpCrawler.Join(gcse.FnNewDocs)
	fpNewDocs.Remove()

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(gcse.FnPackage), fpNewDocs,
		pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(gcse.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may failed, package: %v, person: %v",
			errPkg, errPsn)
	}

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	syncDatabases()
	log.Println("crawler stopped...")
}
Example #2
// OnlyMapper.Map
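// Map crawls one package per input key/value pair: it decodes the package
// path and crawling entry, fetches the package, and emits a NewDocAction to
// the collector. Invalid packages are removed from the docs and the crawler
// DB, failures are rescheduled with a back-off, and unchanged packages are
// simply scheduled for their next crawl.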
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter, c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
			pc.part, key)
		return mr.EOM
	}
	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// The entry was written by an older crawler version, so its ETag is ignored.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, flds, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	for _, fld := range flds {
		if spider.LikeGoSubFolder(fld.Name) {
			appendNewPackage(pkg+"/"+fld.Path, "parent")
		}
	}
	site, path := utils.SplitPackage(pkg)
	if err != nil && errorsp.Cause(err) != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Invalid), "AppendPackageEvent %v %v failed", site, path)
			bi.AddValue(bi.Sum, "crawler.package.wrong-package", 1)
			// A bad package path: remove it from the docs and the crawler DB.
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Failed), "AppendPackageEvent %v %v failed", site, path)
			bi.Inc("crawler.package.failed")
			if strings.HasPrefix(pkg, "github.com/") {
				bi.Inc("crawler.package.failed.github")
			}
			pc.failCount++

			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)

			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
						pc.part, key)
					return mr.EOM
				}

				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}
	utils.LogError(store.AppendPackageEvent(site, path, "", time.Now(), sppb.HistoryEvent_Action_Success), "AppendPackageEvent %v %v failed", site, path)
	pc.failCount = 0
	if errorsp.Cause(err) == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		bi.AddValue(bi.Sum, "crawler.package.not-modified", 1)
		return nil
	}
	bi.AddValue(bi.Sum, "crawler.package.success", 1)
	if strings.HasPrefix(pkg, "github.com/") {
		bi.AddValue(bi.Sum, "crawler.package.success.github", 1)
	}
	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

	var pkgInfo *stpb.PackageInfo
	if err := store.UpdatePackage(site, path, func(pi *stpb.PackageInfo) error {
		fillPackageInfo(p, pi)
		pkgInfo = pi
		return nil
	}); err != nil {
		log.Printf("UpdatePackage %v %v failed: %v", site, path, err)
	}
	saveRelatedInfo(pkgInfo)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	if !strings.HasPrefix(pkg, "github.com/") {
		// github.com throttling is done within the GithubSpider.
		time.Sleep(10 * time.Second)
	}
	return nil
}
Example #3
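// crawlEntriesLoop repeatedly crawls the scheduled packages and persons,
// grouped by host, until AppStopTime is reached. Each host group is crawled
// in its own goroutine with a simple fail-count back-off; after every round
// the databases are synced, and the loop sleeps when there was nothing to
// crawl.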
func crawlEntriesLoop() {
	httpClient := gcse.GenHttpClient("")

	for time.Now().Before(AppStopTime) {
		checkImports()

		if gcse.CrawlByGodocApi {
			processGodoc(httpClient)
		}

		didSomething := false
		var wg sync.WaitGroup

		pkgGroups := listPackagesByHost(5, 50)
		if len(pkgGroups) > 0 {
			didSomething = true

			log.Printf("Crawling packages of %d groups", len(pkgGroups))

			wg.Add(len(pkgGroups))

			for host, ents := range pkgGroups {
				go func(host string, ents []EntryInfo) {
					failCount := 0
					for _, ent := range ents {
						if time.Now().After(AppStopTime) {
							break
						}
						runtime.GC()
						p, err := gcse.CrawlPackage(httpClient, ent.ID, ent.Etag)
						if err != nil && err != gcse.ErrPackageNotModifed {
							log.Printf("Crawling pkg %s failed: %v", ent.ID, err)

							if gcse.IsBadPackage(err) {
								// a wrong path
								deletePackage(ent.ID)
								log.Printf("Remove wrong package %s", ent.ID)
							} else {
								failCount++

								schedulePackage(ent.ID, time.Now().Add(
									12*time.Hour), ent.Etag)

								if failCount >= 10 {
									durToSleep := 10 * time.Minute
									if time.Now().Add(durToSleep).After(AppStopTime) {
										break
									}

									log.Printf("Last ten crawling %s packages failed, sleep for a while...",
										host)
									time.Sleep(durToSleep)
									failCount = 0
								}
							}
							continue
						}

						failCount = 0
						if err == gcse.ErrPackageNotModifed {
							log.Printf("Package %s unchanged!", ent.ID)
							schedulePackageNextCrawl(ent.ID, ent.Etag)
							continue
						}

						log.Printf("Crawled package %s success!", ent.ID)

						pushPackage(p)
						log.Printf("Package %s saved!", ent.ID)
					}

					wg.Done()
				}(host, ents)
			}
		}

		personGroups := listPersonsByHost(5, 100)
		if len(personGroups) > 0 {
			didSomething = true

			log.Printf("Crawling persons of %d groups", len(personGroups))

			wg.Add(len(personGroups))

			for host, ents := range personGroups {
				go func(host string, ents []EntryInfo) {
					failCount := 0
					for _, ent := range ents {
						if time.Now().After(AppStopTime) {
							break
						}

						p, err := gcse.CrawlPerson(httpClient, ent.ID)
						if err != nil {
							failCount++
							log.Printf("Crawling person %s failed: %v", ent.ID, err)

							schedulePerson(ent.ID, time.Now().Add(12*time.Hour))

							if failCount >= 10 {
								durToSleep := 10 * time.Minute
								if time.Now().Add(durToSleep).After(AppStopTime) {
									break
								}

								log.Printf("Last ten crawling %s persons failed, sleep for a while...",
									host)
								time.Sleep(durToSleep)
								failCount = 0
							}
							continue
						}

						log.Printf("Crawled person %s success!", ent.ID)
						pushPerson(p)
						log.Printf("Push person %s success", ent.ID)
						failCount = 0
					}

					wg.Done()
				}(host, ents)
			}
		}
		wg.Wait()

		syncDatabases()

		if gcse.CrawlGithubUpdate {
			if touchByGithubUpdates() {
				didSomething = true
			}
		}

		if !didSomething {
			log.Printf("Nothing to crawl sleep for a while...")
			time.Sleep(2 * time.Minute)
		}
	}
}
Example #4
// OnlyMapper.Map
func (pc *PackageCrawler) Map(key, val sophie.SophieWriter,
	c []sophie.Collector) error {
	if time.Now().After(AppStopTime) {
		log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
			pc.part, key)
		return mr.EOM
	}

	pkg := string(*key.(*sophie.RawString))
	ent := val.(*gcse.CrawlingEntry)
	if ent.Version < gcse.CrawlerVersion {
		// The entry was written by an older crawler version, so its ETag is ignored.
		ent.Etag = ""
	}
	log.Printf("[Part %d] Crawling package %v with etag %s\n", pc.part, pkg, ent.Etag)

	p, err := gcse.CrawlPackage(pc.httpClient, pkg, ent.Etag)
	if err != nil && err != gcse.ErrPackageNotModifed {
		log.Printf("[Part %d] Crawling pkg %s failed: %v", pc.part, pkg, err)
		if gcse.IsBadPackage(err) {
			// A bad package path: remove it from the docs and the crawler DB.
			nda := gcse.NewDocAction{
				Action: gcse.NDA_DEL,
			}
			c[0].Collect(sophie.RawString(pkg), &nda)
			cDB.PackageDB.Delete(pkg)
			log.Printf("[Part %d] Remove wrong package %s", pc.part, pkg)
		} else {
			pc.failCount++

			cDB.SchedulePackage(pkg, time.Now().Add(12*time.Hour), ent.Etag)

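			// After ten consecutive failures, or an HTTP 403 response, pause
			// crawling for a while unless that would run past AppStopTime.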
			if pc.failCount >= 10 || strings.Contains(err.Error(), "403") {
				durToSleep := 10 * time.Minute
				if time.Now().Add(durToSleep).After(AppStopTime) {
					log.Printf("[Part %d] Timeout(key = %v), PackageCrawler returns EOM",
						pc.part, key)
					return mr.EOM
				}

				log.Printf("[Part %d] Last ten crawling packages failed, sleep for a while...(current: %s)",
					pc.part, pkg)
				time.Sleep(durToSleep)
				pc.failCount = 0
			}
		}
		return nil
	}

	pc.failCount = 0
	if err == gcse.ErrPackageNotModifed {
		// TODO crawling stars for unchanged project
		log.Printf("[Part %d] Package %s unchanged!", pc.part, pkg)
		schedulePackageNextCrawl(pkg, ent.Etag)
		return nil
	}

	log.Printf("[Part %d] Crawled package %s success!", pc.part, pkg)

	nda := gcse.NewDocAction{
		Action:  gcse.NDA_UPDATE,
		DocInfo: packageToDoc(p),
	}
	c[0].Collect(sophie.RawString(pkg), &nda)
	log.Printf("[Part %d] Package %s saved!", pc.part, pkg)

	time.Sleep(10 * time.Second)

	return nil
}
Example #5
func main() {
	runtime.GOMAXPROCS(2)

	log.Printf("Using personal: %v", configs.CrawlerGithubPersonal)
	gcse.GithubSpider = github.NewSpiderWithToken(configs.CrawlerGithubPersonal)

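	// Back the GithubSpider with a Bolt-based file cache when the cache
	// database can be opened; otherwise continue without a cache.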
	if db, err := bh.Open(configs.DataRoot.Join("filecache.bolt").S(), 0644, nil); err == nil {
		log.Print("Using file cache!")
		gcse.GithubSpider.FileCache = spider.BoltFileCache{
			DB:         db,
			IncCounter: bi.Inc,
		}
	} else {
		log.Printf("Open file cache failed: %v", err)
	}

	cleanTempDir()
	defer cleanTempDir()

	singlePackage := flag.String("pkg", "", "Crawling a single package")
	singleETag := flag.String("etag", "", "ETag for the single package crawl")
	singlePerson := flag.String("person", "", "Crawling a single person")

	flag.Parse()

	httpClient := gcse.GenHttpClient("")

	if *singlePerson != "" {
		log.Printf("Crawling single person %s ...", *singlePerson)
		p, err := gcse.CrawlPerson(httpClient, *singlePerson)
		if err != nil {
			fmtp.Printfln("Crawling person %s failed: %v", *singlePerson, err)
		} else {
			fmtp.Printfln("Person %s: %+v", *singlePerson, p)
		}
	}
	if *singlePackage != "" {
		log.Printf("Crawling single package %s ...", *singlePackage)
		p, flds, err := gcse.CrawlPackage(httpClient, *singlePackage, *singleETag)
		if err != nil {
			fmtp.Printfln("Crawling package %s failed: %v, folders: %v", *singlePackage, err, flds)
		} else {
			fmtp.Printfln("Package %s: %+v, folders: %v", *singlePackage, p, flds)
		}
	}
	if *singlePackage != "" || *singlePerson != "" {
		return
	}

	log.Println("crawler started...")

	// Load CrawlerDB
	cDB = gcse.LoadCrawlerDB()

	fpDataRoot := sophie.FsPath{
		Fs:   sophie.LocalFS,
		Path: configs.DataRoot.S(),
	}

	fpDocs := fpDataRoot.Join(configs.FnDocs)
	if err := loadAllDocsPkgs(kv.DirInput(fpDocs)); err != nil {
		log.Fatalf("loadAllDocsPkgs: %v", err)
	}
	log.Printf("%d docs loaded!", len(allDocsPkgs))

	AppStopTime = time.Now().Add(configs.CrawlerDuePerRun)

	//pathToCrawl := gcse.DataRoot.Join(gcse.FnToCrawl)
	fpCrawler := fpDataRoot.Join(configs.FnCrawlerDB)
	fpToCrawl := fpDataRoot.Join(configs.FnToCrawl)

	fpNewDocs := fpCrawler.Join(configs.FnNewDocs)
	fpNewDocs.Remove()

	if err := processImports(); err != nil {
		log.Printf("processImports failed: %v", err)
	}

	pkgEnd := make(chan error, 1)
	go crawlPackages(httpClient, fpToCrawl.Join(configs.FnPackage), fpNewDocs, pkgEnd)

	psnEnd := make(chan error, 1)
	go crawlPersons(httpClient, fpToCrawl.Join(configs.FnPerson), psnEnd)

	errPkg, errPsn := <-pkgEnd, <-psnEnd
	bi.Flush()
	bi.Process()
	syncDatabases()
	if errPkg != nil || errPsn != nil {
		log.Fatalf("Some job may failed, package: %v, person: %v", errPkg, errPsn)
	}
	log.Println("crawler stopped...")
}