// check is an non-exported function for better error handling. func (p *Page) check() (err error) { if settings.Verbose { fmt.Println("[/] Downloading:", p.ReqUrl.String()) } // Retrieve result from download or return timeout error. var r struct { *html.Node error } select { case r = <-errWrapDownload(p): if r.error != nil { return errutil.Err(r.error) } case <-time.After(settings.TimeoutDuration): return errutil.NewNoPosf("timeout: %s", p.ReqUrl.String()) } // Extract selection from downloaded source. selection, err := p.makeSelection(r.Node) if err != nil { return errutil.Err(err) } // Filename is the URL encoded and the protocol is stripped. linuxPath, err := filename.Encode(p.UrlAsFilename()) if err != nil { return errutil.Err(err) } // Debug - no selection. debug, err := htmlutil.RenderClean(r.Node) if err != nil { return errutil.Err(err) } // Update the debug comparison file. debugCachePathName := settings.DebugCacheRoot + linuxPath + ".htm" err = ioutil.WriteFile(debugCachePathName, []byte(debug), settings.Global.FilePerms) if err != nil { return errutil.Err(err) } // If the selection is empty, the CSS selection is probably wrong so we will // alert the user about this problem. if len(selection) == 0 { return errutil.NewNoPosf("Update was empty. URL: %s", p.ReqUrl) } cachePathName := settings.CacheRoot + linuxPath + ".htm" // Read in comparison. buf, err := ioutil.ReadFile(cachePathName) if err != nil { if !os.IsNotExist(err) { return errutil.Err(err) } // If the page hasn't been checked before, create a new comparison file. err = ioutil.WriteFile( cachePathName, []byte(selection), settings.Global.FilePerms, ) if err != nil { return errutil.Err(err) } readPathName := settings.ReadRoot + linuxPath + ".htm" // If the page hasn't been checked before, create a new comparison file. err = ioutil.WriteFile( readPathName, []byte(selection), settings.Global.FilePerms, ) if err != nil { return errutil.Err(err) } debugReadPathName := settings.DebugReadRoot + linuxPath + ".htm" // Update the debug prev file. err = ioutil.WriteFile(debugReadPathName, []byte(debug), settings.Global.FilePerms) if err != nil { return errutil.Err(err) } if settings.Verbose { fmt.Println("[+] New site added:", p.ReqUrl.String()) } return nil } // The distance between to strings in percentage. dist := distance.Approx(string(buf), selection) // If the distance is within the threshold level, i.e if the check was a // match. if dist > p.Settings.Threshold { u := p.ReqUrl.String() settings.Updates[u] = true if settings.Verbose { fmt.Println("[!] Updated:", p.ReqUrl.String()) } // If the page has a mail and all compulsory global mail settings are // set, send a mail to notify the user about an update. if p.Settings.RecvMail != "" && settings.Global.SenderMail.AuthServer != "" && settings.Global.SenderMail.OutServer != "" && settings.Global.SenderMail.Address != "" { // Mail the selection without the stripping functions, since their // only purpose is to remove false-positives. It will make the // output look better. mailPage := Page{p.ReqUrl, p.Settings} mailPage.Settings.StripFuncs = nil mailPage.Settings.Regexp = "" sel, err := mailPage.makeSelection(r.Node) if err != nil { return errutil.Err(err) } err = mail.Send(p.ReqUrl, p.Settings.RecvMail, sel) if err != nil { return errutil.Err(err) } delete(settings.Updates, u) } // Save updates to file. err = settings.SaveUpdates() if err != nil { return errutil.Err(err) } // Update the comparison file. err = ioutil.WriteFile(cachePathName, []byte(selection), settings.Global.FilePerms) if err != nil { return errutil.Err(err) } } else { if settings.Verbose { fmt.Println("[-] No update:", p.ReqUrl.String()) } } return nil }
func main() { app := cli.NewApp() app.Name = "stardust" app.Usage = "String similarity measures for tab separated values." app.Author = "Martin Czygan" app.Email = "*****@*****.**" app.Version = "0.1.1" app.Flags = []cli.Flag{ cli.StringFlag{ Name: "fields, f", Value: "1,2", Usage: "c1,c2 the two columns to use for the comparison", }, cli.StringFlag{ Name: "delimiter, d", Value: "\t", Usage: "column delimiter (defaults to tab)", }, } app.Commands = []cli.Command{ { Name: "adhoc", Usage: "Adhoc distance", Description: "Ad-hoc percentage difference found on the web (https://godoc.org/github.com/karlek/nyfiken/distance).", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure := distance.Approx(r.Left(), r.Right()) fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, }, { Name: "cosine", Usage: "Cosine word-wise", Description: "A a measure of similarity between two vectors. The bigger the return value is, the more similar the two texts are.", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure := similar.Cosine(r.Left(), r.Right()) fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, }, { Name: "coslev", Usage: "Cosine word-wise and levenshtein combined", Description: "Experimenal.", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure := similar.Get(r.Left(), r.Right()) fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, }, { Name: "dice", Usage: "Sørensen–Dice coefficient", Description: "Semimetric version of the Jaccard index.", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure, _ := stardust.SorensenDiceDistance(r.Left(), r.Right()) fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, }, { Name: "hamming", Usage: "Hamming distance", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure, err := stardust.HammingDistance(r.Left(), r.Right()) if err != nil { log.Fatal(err) } fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, }, { Name: "jaro", Usage: "Jaro distance", Description: "Similar to Ngram, but faster.", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure, err := stardust.JaroDistance(r.Left(), r.Right()) if err != nil { log.Fatal(err) } fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, }, { Name: "jaro-winkler", Usage: "Jaro-Winkler distance", Description: "It is a variant of the Jaro distance metric.", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure, err := stardust.JaroWinklerDistance(r.Left(), r.Right(), c.Float64("boost"), c.Int("size")) if err != nil { log.Fatal(err) } fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, Flags: []cli.Flag{ cli.Float64Flag{ Name: "boost, b", Value: 0.5, Usage: "boost factor", }, cli.IntFlag{ Name: "size, p", Value: 3, Usage: "prefix size", }, }, }, { Name: "levenshtein", Usage: "Levenshtein distance", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure, err := stardust.LevenshteinDistance(r.Left(), r.Right()) if err != nil { log.Fatal(err) } fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, }, { Name: "ngram", Usage: "Ngram distance", Description: "Compute Ngram distance, which lies between 0 and 1 (equal).", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { measure, err := stardust.NgramDistanceSize(r.Left(), r.Right(), c.Int("size")) if err != nil { log.Fatal(err) } fmt.Printf("%s\t%v\n", strings.Join(r.Fields, "\t"), measure) } }, Flags: []cli.Flag{ cli.IntFlag{ Name: "size, s", Value: 3, Usage: "value of n", }, }, }, { Name: "plain", Usage: "Plain passthrough (for IO benchmarks)", Action: func(c *cli.Context) { records := stardust.RecordGenerator(c) for r := range records { fmt.Printf("%s\n", strings.Join(r.Fields, "\t")) } }, }, } app.Run(os.Args) }