Code example #1
File: Frontier.go  Project: prachibhansali/WebCrawler
func FetchAllOutgoingLinks(uname string) {
	// Ask the JSoup REST service to parse the page and return its outgoing links.
	getResp, err := http.Get("http://localhost:8080/JSoupRestAPIService/jsoup-api/jsoupapi/" + uname)
	if err != nil {
		fmt.Printf("Error in jsoup parsing: %s\n", err.Error())
		return
	}
	defer getResp.Body.Close() // close the response body to avoid leaking the connection

	rb, err := ioutil.ReadAll(getResp.Body)
	if err != nil {
		fmt.Printf("Error reading jsoup response: %s\n", err.Error())
		return
	}

	var outlinks []LinkStructure
	if err := json.Unmarshal(rb, &outlinks); err != nil {
		fmt.Printf("Error unmarshalling outlinks: %s\n", err.Error())
		return
	}

	fetchedurls := make(map[string]bool)
	for i := range outlinks {
		// Canonicalize every outgoing link before de-duplicating it.
		str := strings.TrimSpace(outlinks[i].Link)
		obj := priorityQueue.NewURL(str, 0, false)
		obj.Canonicalize()
		strname := obj.Geturlname()
		if count, ok := seenURLS[strname]; ok {
			// Already crawled: just bump its in-link count.
			seenURLS[strname] = count + 1
		} else if count, ok := queuedURLS[strname]; ok {
			// Already queued: bump its in-link count instead of re-queuing.
			queuedURLS[strname] = count + 1
		} else if !fetchedurls[strname] && Isurlok(strname) && IsHTMLDoc(strname) {
			// New, crawlable HTML URL: push it onto the frontier.
			heap.Push(pq, obj)
			fetchedurls[strname] = true
			queuedURLS[strname] = 1
		}
	}

	// Append this page and its outgoing links as one line of the link graph.
	f, err := os.OpenFile("linkGraph.txt", os.O_RDWR|os.O_APPEND|os.O_CREATE, 0660)
	if err != nil {
		fmt.Printf("Error opening linkGraph.txt: %s\n", err)
		return
	}
	defer f.Close() // close the file handle once the line is flushed

	w := bufio.NewWriter(f)
	w.WriteString(uname + "\t")
	for key := range fetchedurls {
		w.WriteString(key + "\t")
	}
	w.WriteString("\n")
	w.Flush()
}
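
The function leans on package-level state and helper predicates that this listing does not include. Below is a minimal sketch of what those definitions might look like; the JSON field tag on LinkStructure and the filtering rules inside Isurlok and IsHTMLDoc are assumptions inferred from the call sites, not the project's actual code.

// Hypothetical reconstruction: none of these definitions appear in the
// listing; the names match the calls above, the bodies are guesses.
package main

import (
	"net/url"
	"strings"
)

// LinkStructure mirrors one element of the JSON array returned by the
// JSoup REST service; the "link" field tag is an assumption.
type LinkStructure struct {
	Link string `json:"link"`
}

// Package-level crawl state assumed by FetchAllOutgoingLinks:
// seenURLS maps a canonical URL to its in-link count for pages already
// crawled; queuedURLS does the same for pages still waiting on the frontier.
var (
	seenURLS   = make(map[string]int)
	queuedURLS = make(map[string]int)
)

// Isurlok might reject anything the crawler cannot fetch over HTTP (a guess).
func Isurlok(s string) bool {
	u, err := url.Parse(s)
	return err == nil && (u.Scheme == "http" || u.Scheme == "https")
}

// IsHTMLDoc might filter out obvious non-HTML resources by extension (a guess).
func IsHTMLDoc(s string) bool {
	for _, ext := range []string{".jpg", ".png", ".gif", ".pdf", ".css", ".js"} {
		if strings.HasSuffix(strings.ToLower(s), ext) {
			return false
		}
	}
	return true
}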
Code example #2
File: Frontier.go  Project: prachibhansali/WebCrawler
func main() {
	fmt.Println("Hello World!")
	heap.Init(pq)

	// Seed the frontier from the "seedurls" file, one URL per line.
	file, err := os.Open("seedurls")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		obj := priorityQueue.NewURL(strings.TrimSpace(scanner.Text()), 0, true)
		obj.Canonicalize()
		queuedURLS[obj.Geturlname()] = 0
		heap.Push(pq, obj)
	}

	for pq.Len() > 0 && len(seenURLS) <= 10 {
		temp := make(map[*priorityQueue.URL]bool)
		currUrl := heap.Pop(pq).(*priorityQueue.URL)
		name := currUrl.Geturlname()
		delete(queuedURLS, name)
		hostname, _ := url.Parse(name)
		_, ok := domainTimes[hostname.Host]
		// Politeness check: while the popped URL's host was visited recently,
		// set it aside in temp and try the next URL on the queue. Plain
		// assignment (not ":=") keeps currUrl, name and hostname updated
		// outside this inner loop.
		for ok && pq.Len() > 0 {
			temp[currUrl] = true
			currUrl = heap.Pop(pq).(*priorityQueue.URL)
			name = currUrl.Geturlname()
			delete(queuedURLS, name)
			hostname, _ = url.Parse(name)
			_, ok = domainTimes[hostname.Host]
		}
		if !ok {
			// Fresh host: crawl the URL and record when its domain was touched.
			ProcessURL(currUrl)
			seenURLS[name] = 0
			domainTimes[hostname.Host] = time.Now().Unix()
		} else {
			// Every candidate was on a recently visited host: back off briefly.
			temp[currUrl] = true
			time.Sleep(time.Second * 2)
		}
		// Put the URLs we set aside back on the frontier.
		for key := range temp {
			queuedURLS[key.Geturlname()] = 0
			heap.Push(pq, key)
		}
		removeUnallowedDomains()
	}
}
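
Both examples also depend on an external priorityQueue package that the listing does not show. Here is a plausible sketch inferred from the call sites: the constructor's second argument is treated as an in-link count and the third as a seed flag, and the ordering rule (seeds first, then by in-link count) is an assumption rather than the project's actual code.

// Hypothetical sketch of the priorityQueue package both examples depend on.
package priorityQueue

import (
	"net/url"
	"strings"
)

// URL is one frontier entry; field names are guesses matching the methods used.
type URL struct {
	name    string
	inlinks int
	isSeed  bool
}

func NewURL(name string, inlinks int, isSeed bool) *URL {
	return &URL{name: name, inlinks: inlinks, isSeed: isSeed}
}

// Canonicalize might lower-case the host and strip the fragment (a guess).
func (u *URL) Canonicalize() {
	if parsed, err := url.Parse(strings.TrimSpace(u.name)); err == nil {
		parsed.Fragment = ""
		parsed.Host = strings.ToLower(parsed.Host)
		u.name = parsed.String()
	}
}

func (u *URL) Geturlname() string { return u.name }

// Queue implements container/heap's heap.Interface: seeds come out first,
// then URLs with more in-links.
type Queue []*URL

func (q Queue) Len() int      { return len(q) }
func (q Queue) Swap(i, j int) { q[i], q[j] = q[j], q[i] }
func (q Queue) Less(i, j int) bool {
	if q[i].isSeed != q[j].isSeed {
		return q[i].isSeed
	}
	return q[i].inlinks > q[j].inlinks
}
func (q *Queue) Push(x interface{}) { *q = append(*q, x.(*URL)) }
func (q *Queue) Pop() interface{} {
	old := *q
	n := len(old)
	item := old[n-1]
	*q = old[:n-1]
	return item
}

With a heap like this, the pq used in main would be declared as something like var pq = &priorityQueue.Queue{} before heap.Init(pq) is called.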