package main

import (
	"bufio"
	"container/heap"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"os"
	"strings"
	"time"
	// The project-local priorityQueue package (NewURL, URL, and the
	// heap-backed queue) is also required; its import path is
	// project-specific and omitted here.
)

// FetchAllOutgoingLinks asks the JSoup REST service for the outgoing links of
// the page identified by uname, canonicalizes each link, pushes unseen and
// unqueued HTML URLs onto the priority queue, and appends the resulting
// adjacency list to linkGraph.txt.
func FetchAllOutgoingLinks(uname string) {
	getResp, err := http.Get("http://localhost:8080/JSoupRestAPIService/jsoup-api/jsoupapi/" + uname)
	if err != nil {
		fmt.Printf("Error in jsoup parsing: %s\n", err.Error())
		return
	}
	defer getResp.Body.Close()

	rb, err := ioutil.ReadAll(getResp.Body)
	if err != nil {
		fmt.Printf("Error reading response body: %s\n", err.Error())
		return
	}

	var outlinks []LinkStructure
	if err := json.Unmarshal(rb, &outlinks); err != nil {
		fmt.Printf("Error unmarshaling link list: %s\n", err.Error())
		return
	}

	fetchedurls := make(map[string]bool)
	for i := range outlinks {
		str := strings.TrimSpace(outlinks[i].Link)
		obj := priorityQueue.NewURL(str, 0, false)
		obj.Canonicalize()
		strname := obj.Geturlname()

		if count, ok := seenURLS[strname]; ok {
			// Already crawled: just bump its in-link count.
			seenURLS[strname] = count + 1
		} else if count, ok := queuedURLS[strname]; ok {
			// Already queued: bump its in-link count.
			queuedURLS[strname] = count + 1
		} else if !fetchedurls[strname] && Isurlok(strname) && IsHTMLDoc(strname) {
			// New, allowed HTML URL: enqueue it.
			heap.Push(pq, obj)
			fetchedurls[strname] = true
			queuedURLS[strname] = 1
		}
	}

	// Append this page's outgoing links to the link-graph file as one
	// tab-separated record: source page first, then its out-links.
	f, err := os.OpenFile("linkGraph.txt", os.O_RDWR|os.O_APPEND|os.O_CREATE, 0660)
	if err != nil {
		fmt.Printf("Error opening link-graph file: %s\n", err)
		return
	}
	defer f.Close()

	w := bufio.NewWriter(f)
	w.WriteString(uname + "\t")
	for key := range fetchedurls {
		w.WriteString(key + "\t")
	}
	w.WriteString("\n")
	w.Flush()
}
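// The JSoup service's JSON response is decoded into []LinkStructure above.
// The type is defined elsewhere in this project; a minimal sketch of what
// json.Unmarshal needs is shown here, assuming the service returns objects
// with a "link" field (the field name and JSON tag are assumptions, not the
// project's actual definition):
//
//	type LinkStructure struct {
//		Link string `json:"link"`
//	}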
func main() {
	fmt.Println("Hello World!")
	heap.Init(pq)

	// Seed the frontier from the seedurls file, one URL per line.
	file, err := os.Open("seedurls")
	if err != nil {
		log.Fatal(err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		obj := priorityQueue.NewURL(strings.TrimSpace(scanner.Text()), 0, true)
		obj.Canonicalize()
		queuedURLS[obj.Geturlname()] = 0
		heap.Push(pq, obj)
	}

	// Crawl until the frontier empties or enough pages have been seen.
	for pq.Len() > 0 && len(seenURLS) <= 10 {
		// temp holds URLs skipped this round for politeness; they are
		// re-queued at the end of the iteration.
		temp := make(map[*priorityQueue.URL]bool)

		currUrl := heap.Pop(pq).(*priorityQueue.URL)
		name := currUrl.Geturlname()
		delete(queuedURLS, name)
		hostname, _ := url.Parse(name)
		_, ok := domainTimes[hostname.Host]

		// Pop past URLs whose host was crawled too recently, setting
		// the outer currUrl and hostname with plain assignment so they
		// track the URL that finally ends the loop.
		for ok && pq.Len() > 0 {
			temp[currUrl] = true
			currUrl = heap.Pop(pq).(*priorityQueue.URL)
			name = currUrl.Geturlname()
			delete(queuedURLS, name)
			hostname, _ = url.Parse(name)
			_, ok = domainTimes[hostname.Host]
		}

		if !ok {
			// Fresh host: crawl the URL and record the visit time.
			ProcessURL(currUrl)
			seenURLS[currUrl.Geturlname()] = 0
			domainTimes[hostname.Host] = time.Now().Unix()
		} else {
			// Every queued URL hits a recently-crawled host; back off.
			temp[currUrl] = true
			time.Sleep(time.Second * 2)
		}

		// Put the skipped URLs back on the queue.
		for key := range temp {
			queuedURLS[key.Geturlname()] = 0
			heap.Push(pq, key)
		}
		removeUnallowedDomains()
	}
}
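// The crawler state used above lives at package level elsewhere in this
// project. A minimal sketch consistent with how these variables are used in
// the two functions here (the variable names match the code; the
// PriorityQueue type and its initialization are assumptions):
//
//	var (
//		pq          = &priorityQueue.PriorityQueue{} // crawl frontier
//		seenURLS    = make(map[string]int)           // crawled URL -> in-link count
//		queuedURLS  = make(map[string]int)           // queued URL -> in-link count
//		domainTimes = make(map[string]int64)         // host -> last crawl time (Unix seconds)
//	)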