Example #1
0
// check downloads the page, refreshes its debug copy on disk and compares the
// selected content with the copy cached by the previous check. A page that has
// no cached copy yet gets its cache, read and debug-read files created. When
// the content differs by more than the page's threshold, the update is
// recorded, mailed if mail is configured, and the cache is refreshed. The
// method is unexported; every failure is returned to the caller.
func (p *Page) check() (err error) {
	if settings.Verbose {
		fmt.Println("[/] Downloading:", p.ReqUrl.String())
	}

	// Wait for the download result, giving up once the timeout has elapsed.
	var dl struct {
		*html.Node
		error
	}
	select {
	case dl = <-errWrapDownload(p):
		if dl.error != nil {
			return errutil.Err(dl.error)
		}
	case <-time.After(settings.TimeoutDuration):
		return errutil.NewNoPosf("timeout: %s", p.ReqUrl.String())
	}

	// Apply the page's selection to the downloaded document.
	selection, err := p.makeSelection(dl.Node)
	if err != nil {
		return errutil.Err(err)
	}

	// Every file written below shares one name: the encoded form of the
	// page's URL-derived file name, followed by ".htm".
	encoded, err := filename.Encode(p.UrlAsFilename())
	if err != nil {
		return errutil.Err(err)
	}
	fileName := encoded + ".htm"

	// save writes content to path with the globally configured permissions.
	save := func(path, content string) error {
		return ioutil.WriteFile(path, []byte(content), settings.Global.FilePerms)
	}

	// Render the whole document (no selection applied) for debugging and
	// refresh the debug comparison file with it.
	debug, err := htmlutil.RenderClean(dl.Node)
	if err != nil {
		return errutil.Err(err)
	}
	debugCachePath := settings.DebugCacheRoot + fileName
	if err = save(debugCachePath, debug); err != nil {
		return errutil.Err(err)
	}

	// An empty selection most likely means the CSS selector is wrong, so
	// report it to the user instead of going on.
	if len(selection) == 0 {
		return errutil.NewNoPosf("Update was empty. URL: %s", p.ReqUrl)
	}

	cachePath := settings.CacheRoot + fileName

	// Load the selection stored by the previous check.
	previous, err := ioutil.ReadFile(cachePath)
	if err != nil && !os.IsNotExist(err) {
		return errutil.Err(err)
	}
	if err != nil {
		// No comparison file exists: this is the first check of the page.
		// Seed the comparison file, the read file and the debug read file.
		if err = save(cachePath, selection); err != nil {
			return errutil.Err(err)
		}

		readPath := settings.ReadRoot + fileName
		if err = save(readPath, selection); err != nil {
			return errutil.Err(err)
		}

		debugReadPath := settings.DebugReadRoot + fileName
		if err = save(debugReadPath, debug); err != nil {
			return errutil.Err(err)
		}

		if settings.Verbose {
			fmt.Println("[+] New site added:", p.ReqUrl.String())
		}
		return nil
	}

	// Approximate distance between the previous and the current selection.
	dist := distance.Approx(string(previous), selection)

	// Only a distance strictly above the threshold counts as an update;
	// anything else is treated as unchanged.
	if changed := dist > p.Settings.Threshold; !changed {
		if settings.Verbose {
			fmt.Println("[-] No update:", p.ReqUrl.String())
		}
		return nil
	}

	u := p.ReqUrl.String()
	settings.Updates[u] = true

	if settings.Verbose {
		fmt.Println("[!] Updated:", p.ReqUrl.String())
	}

	// A notification mail is sent only when the page has a recipient and all
	// compulsory global sender settings are filled in.
	mailConfigured := p.Settings.RecvMail != "" &&
		settings.Global.SenderMail.AuthServer != "" &&
		settings.Global.SenderMail.OutServer != "" &&
		settings.Global.SenderMail.Address != ""
	if mailConfigured {
		// The strip functions and the regexp only exist to suppress
		// false positives, so the mailed selection is built without them to
		// make the output look better.
		mailPage := Page{p.ReqUrl, p.Settings}
		mailPage.Settings.StripFuncs = nil
		mailPage.Settings.Regexp = ""
		mailBody, err := mailPage.makeSelection(dl.Node)
		if err != nil {
			return errutil.Err(err)
		}

		if err = mail.Send(p.ReqUrl, p.Settings.RecvMail, mailBody); err != nil {
			return errutil.Err(err)
		}
		// The mail went out, so the update is no longer kept as pending.
		delete(settings.Updates, u)
	}

	// Persist the set of updates.
	if err = settings.SaveUpdates(); err != nil {
		return errutil.Err(err)
	}

	// Make the new selection the comparison for the next check.
	if err = save(cachePath, selection); err != nil {
		return errutil.Err(err)
	}
	return nil
}
Example #2
0
// main builds the stardust command line application and runs it. Each
// subcommand reads records via stardust.RecordGenerator, computes one
// similarity or distance measure between the record's left and right values
// and prints the record's fields followed by the measure, tab separated.
// Any error from a measure, or from running the application itself, is fatal.
func main() {
	app := cli.NewApp()
	app.Name = "stardust"
	app.Usage = "String similarity measures for tab separated values."
	app.Author = "Martin Czygan"
	app.Email = "*****@*****.**"
	app.Version = "0.1.1"

	app.Flags = []cli.Flag{
		cli.StringFlag{
			Name:  "fields, f",
			Value: "1,2",
			Usage: "c1,c2 the two columns to use for the comparison",
		},
		cli.StringFlag{
			Name:  "delimiter, d",
			Value: "\t",
			Usage: "column delimiter (defaults to tab)",
		},
	}

	// printMeasure writes one output row: the record's fields joined by tabs,
	// followed by the computed measure.
	printMeasure := func(fields []string, measure interface{}) {
		fmt.Printf("%s\t%v\n", strings.Join(fields, "\t"), measure)
	}

	app.Commands = []cli.Command{
		{
			Name:        "adhoc",
			Usage:       "Adhoc distance",
			Description: "Ad-hoc percentage difference found on the web (https://godoc.org/github.com/karlek/nyfiken/distance).",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					printMeasure(r.Fields, distance.Approx(r.Left(), r.Right()))
				}
			},
		},
		{
			Name:        "cosine",
			Usage:       "Cosine word-wise",
			Description: "A a measure of similarity between two vectors. The bigger the return value is, the more similar the two texts are.",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					printMeasure(r.Fields, similar.Cosine(r.Left(), r.Right()))
				}
			},
		},
		{
			Name:        "coslev",
			Usage:       "Cosine word-wise and levenshtein combined",
			Description: "Experimenal.",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					printMeasure(r.Fields, similar.Get(r.Left(), r.Right()))
				}
			},
		},
		{
			Name:        "dice",
			Usage:       "Sørensen–Dice coefficient",
			Description: "Semimetric version of the Jaccard index.",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					// Handle the error like every other command instead of
					// silently printing a measure that may be meaningless.
					measure, err := stardust.SorensenDiceDistance(r.Left(), r.Right())
					if err != nil {
						log.Fatal(err)
					}
					printMeasure(r.Fields, measure)
				}
			},
		},

		{
			Name:  "hamming",
			Usage: "Hamming distance",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					measure, err := stardust.HammingDistance(r.Left(), r.Right())
					if err != nil {
						log.Fatal(err)
					}
					printMeasure(r.Fields, measure)
				}
			},
		},
		{
			Name:        "jaro",
			Usage:       "Jaro distance",
			Description: "Similar to Ngram, but faster.",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					measure, err := stardust.JaroDistance(r.Left(), r.Right())
					if err != nil {
						log.Fatal(err)
					}
					printMeasure(r.Fields, measure)
				}
			},
		},
		{
			Name:        "jaro-winkler",
			Usage:       "Jaro-Winkler distance",
			Description: "It is a variant of the Jaro distance metric.",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					measure, err := stardust.JaroWinklerDistance(r.Left(), r.Right(), c.Float64("boost"), c.Int("size"))
					if err != nil {
						log.Fatal(err)
					}
					printMeasure(r.Fields, measure)
				}
			},
			Flags: []cli.Flag{
				cli.Float64Flag{
					Name:  "boost, b",
					Value: 0.5,
					Usage: "boost factor",
				},
				cli.IntFlag{
					Name:  "size, p",
					Value: 3,
					Usage: "prefix size",
				},
			},
		},
		{
			Name:  "levenshtein",
			Usage: "Levenshtein distance",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					measure, err := stardust.LevenshteinDistance(r.Left(), r.Right())
					if err != nil {
						log.Fatal(err)
					}
					printMeasure(r.Fields, measure)
				}
			},
		},
		{
			Name:        "ngram",
			Usage:       "Ngram distance",
			Description: "Compute Ngram distance, which lies between 0 and 1 (equal).",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					measure, err := stardust.NgramDistanceSize(r.Left(), r.Right(), c.Int("size"))
					if err != nil {
						log.Fatal(err)
					}
					printMeasure(r.Fields, measure)
				}
			},
			Flags: []cli.Flag{
				cli.IntFlag{
					Name:  "size, s",
					Value: 3,
					Usage: "value of n",
				},
			},
		},
		{
			Name:  "plain",
			Usage: "Plain passthrough (for IO benchmarks)",
			Action: func(c *cli.Context) {
				for r := range stardust.RecordGenerator(c) {
					fmt.Printf("%s\n", strings.Join(r.Fields, "\t"))
				}
			},
		},
	}

	// Run reports failures through its return value; surface them and exit
	// with a non-zero status instead of dropping the error.
	if err := app.Run(os.Args); err != nil {
		log.Fatal(err)
	}
}