// NewTransformer creates a new transform.Transformer that performs the PRECIS
// preparation and enforcement steps on the given UTF-8 encoded bytes.
func (p Profile) NewTransformer() *Transformer {
	var ts []transform.Transformer

	if p.options.allowwidechars {
		ts = append(ts, width.Fold)
	}

	ts = append(ts, checker{p: p})

	if p.options.width != nil {
		ts = append(ts, width.Fold)
	}

	for _, f := range p.options.additional {
		ts = append(ts, f())
	}

	if p.options.cases {
		ts = append(ts, transform.Chain(
			cases.Upper(language.Und), cases.Lower(language.Und),
		))
	}

	ts = append(ts, p.options.norm)

	// TODO: Apply directionality rule (blocking on the Bidi package)
	// TODO: Add the disallow empty rule with a dummy transformer?

	return &Transformer{transform.Chain(ts...)}
}
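The chain above is what the profile's enforcement entry points run under the hood. A minimal usage sketch, assuming the golang.org/x/text/secure/precis package and its UsernameCaseMapped profile (neither appears in the snippet itself):

package main

import (
	"fmt"

	"golang.org/x/text/secure/precis"
)

func main() {
	// The profile's transformer folds width, maps case, normalizes, and
	// checks the result against the profile's allowed rune set.
	s, err := precis.UsernameCaseMapped.String("Ｕｓｅｒ")
	fmt.Println(s, err) // "user" <nil>
}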
Example #2
func NormalizeTitle(title string) string {
	normalizedTitle := RomanizeHepburn(title)
	normalizedTitle = strings.ToLower(normalizedTitle)
	normalizedTitle = RemoveTrailingApostrophe(normalizedTitle)
	normalizedTitle, _, _ = transform.String(transform.Chain(
		norm.NFD,
		transform.RemoveFunc(func(r rune) bool {
			return unicode.Is(unicode.Mn, r)
		}),
		norm.NFC), normalizedTitle)
	normalizedTitle = strings.ToLower(normalizedTitle)
	normalizedTitle = regexp.MustCompile(`\(\d+\)`).ReplaceAllString(normalizedTitle, " ")
	normalizedTitle = strings.Map(func(r rune) rune {
		if !unicode.IsLetter(r) && !unicode.IsDigit(r) && r != '.' {
			return ' '
		}
		return r
	}, normalizedTitle)
	normalizedTitle = regexp.MustCompile(`\s+`).ReplaceAllString(normalizedTitle, " ")
	normalizedTitle = strings.TrimSpace(normalizedTitle)

	return normalizedTitle
}
Example #3
// NewReader returns a reader which decodes from the given encoding to UTF-8.
//
// If enc is nil, only a UTF-8-enforcing replacement reader
// (see http://godoc.org/code.google.com/p/go.text/encoding#pkg-variables)
// is used.
func NewReader(r io.Reader, enc encoding.Encoding) io.Reader {
	if enc == nil || enc == encoding.Replacement {
		return transform.NewReader(r, encoding.Replacement.NewEncoder())
	}
	return transform.NewReader(r,
		transform.Chain(enc.NewDecoder(), encoding.Replacement.NewEncoder()))
}
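A usage sketch of the function above; charmap.Windows1252 (from golang.org/x/text/encoding/charmap) is only an assumed example, and any encoding.Encoding would work:

func decodeWindows1252(r io.Reader) io.Reader {
	// The returned reader yields UTF-8 regardless of the source encoding.
	return NewReader(r, charmap.Windows1252)
}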
Example #4
// NewTransformer creates a new transform.Transformer that performs the PRECIS
// preparation and enforcement steps on the given UTF-8 encoded bytes.
func (p *Profile) NewTransformer() *Transformer {
	var ts []transform.Transformer

	// These transforms are applied in the order defined in
	// https://tools.ietf.org/html/rfc7564#section-7

	if p.options.foldWidth {
		ts = append(ts, width.Fold)
	}

	for _, f := range p.options.additional {
		ts = append(ts, f())
	}

	if p.options.cases != nil {
		ts = append(ts, p.options.cases)
	}

	ts = append(ts, p.options.norm)

	if p.options.bidiRule {
		ts = append(ts, bidirule.New())
	}

	ts = append(ts, &checker{p: p, allowed: p.Allowed()})

	// TODO: Add the disallow empty rule with a dummy transformer?

	return &Transformer{transform.Chain(ts...)}
}
Example #5
func main() {
	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	r := transform.NewReader(os.Stdin, t)
	if _, err := io.Copy(os.Stdout, r); err != nil {
		log.Fatal(err)
	}
}
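isMn is not defined in this snippet; it is the one-line helper shown in Example #26:

func isMn(r rune) bool {
	return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
}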
Example #6
func removeNlChars(str string) string {
	isOk := func(r rune) bool {
		return r < 32 || r >= 127
	}
	t := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))
	str, _, _ = transform.String(t, str)
	return str
}
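Despite its name, this function removes every rune outside the printable ASCII range (32–126), not just newlines. Because that filter also removes the combining marks produced by norm.NFKD, there is no need to chain norm.NFC afterwards, as the comment in Example #9 points out.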
Example #7
func ExampleRemove() {
	t := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
	s, _, _ := transform.String(t, "résumé")
	fmt.Println(s)

	// Output:
	// resume
}
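This runes.Remove(runes.In(unicode.Mn)) form is the supported replacement for transform.RemoveFunc, which is deprecated in golang.org/x/text and appears in several of the older examples on this page.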
Example #8
// decodeTransfer decodes base64, quoted-printable or plain text.
func decodeTransfer(r io.Reader, label string) io.Reader {
	switch strings.ToLower(label) {
	case "base64":
		return base64.NewDecoder(base64.StdEncoding, transform.NewReader(r, nonASCIITransformer{}))
	case "quoted-printable":
		return quotedprintable.NewReader(transform.NewReader(r, transform.Chain(nonASCIITransformer{}, newlineAppendTransformer{})))
	case "", "7bit", "8bit", "binary":
		return r
	default:
		return failReader{fmt.Errorf("unsupported transfer encoding: %v", label)}
	}
}
Example #9
//function to sanitize input
//from: http://rosettacode.org/wiki/Strip_control_codes_and_extended_characters_from_a_string#Go
func stripCtlAndExtFromUnicode(str string) string {
	isOk := func(r rune) bool {
		return r < 32 || r >= 127
	}
	// The isOk filter is such that there is no need to chain to norm.NFC
	t := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))
	// This Transformer could also trivially be applied as an io.Reader
	// or io.Writer filter to automatically do such filtering when reading
	// or writing data anywhere.
	str, _, _ = transform.String(t, str)
	return str
}
Example #10
// GetCompatibleString removes all special characters from name,
// producing a new string that is safe to use in file names.
func GetCompatibleString(name string) string {
	// Replace all the & signs with and text
	name = strings.Replace(name, "&", "and", -1)
	// Change all the characters to ASCII
	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	result, _, _ := transform.String(t, name)
	// Replace all the spaces with underscore
	s, _ := regexp.Compile(`\s+`)
	result = s.ReplaceAllString(result, "_")
	// Remove all the non alphanumeric characters
	r, _ := regexp.Compile(`\W`)
	result = r.ReplaceAllString(result, "")
	return result
}
Example #11
func normalize(name, src string) (string, error) {
	if name == "" {
		name = baseWithoutExt(src)
	}
	t := transform.Chain(norm.NFD, transform.RemoveFunc(remove), norm.NFC)
	name = strings.TrimSpace(name)
	name, _, err := transform.String(t, name)
	if err != nil {
		return "", err
	}
	name = strings.ToLower(name)
	name = strings.Replace(name, " ", "_", -1)
	return name, nil
}
Example #12
// normalize does unicode normalization.
func normalize(in []byte) ([]byte, error) {
	// We need a new transformer for each input as it cannot be reused.
	filter := func(r rune) bool {
		return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks (to be removed)
	}
	transformer := transform.Chain(norm.NFD, transform.RemoveFunc(filter), norm.NFC)
	out, _, err := transform.Bytes(transformer, in)
	out = bytes.Map(func(r rune) rune {
		if unicode.IsPunct(r) { // Replace punctuations with spaces.
			return ' '
		}
		return unicode.ToLower(r) // Convert to lower case.
	}, out)
	return out, err
}
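The "cannot be reused" comment is conservative: transform.Bytes and transform.String call Reset on the transformer before converting, so a package-level chain (as in Example #26) can safely be reused across sequential calls. Building a fresh transformer per input matters only when calls may be concurrent, since transformers are not safe for concurrent use.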
Example #13
func ExampleUTF8Validator() {
	for i := 0; i < 2; i++ {
		transformer := charmap.Windows1252.NewEncoder()
		if i == 1 {
			transformer = transform.Chain(encoding.UTF8Validator, transformer)
		}
		dst := make([]byte, 256)
		src := []byte("abc\xffxyz") // src is invalid UTF-8.
		nDst, nSrc, err := transformer.Transform(dst, src, true)
		fmt.Printf("i=%d: produced %q, consumed %q, error %v\n",
			i, dst[:nDst], src[:nSrc], err)
	}
	// Output:
	// i=0: produced "abc\x1axyz", consumed "abc\xffxyz", error <nil>
	// i=1: produced "abc", consumed "abc", error encoding: invalid UTF-8
}
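Chaining encoding.UTF8Validator in front of the encoder makes the whole pipeline stop at the first invalid byte with encoding.ErrInvalidUTF8, instead of letting the Windows-1252 encoder replace it with its substitute byte (the \x1a in the i=0 output); that is exactly the difference between the two output lines.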
Example #14
// scanContent scans the content of a document for phrases,
// and updates tally.
func (conf *config) scanContent(content []byte, contentType, cs string, tally map[rule]int) {
	if strings.Contains(contentType, "javascript") {
		conf.scanJSContent(content, tally)
		return
	}

	transformers := make([]transform.Transformer, 0, 3)
	if cs != "utf-8" {
		// charset.Lookup returns nil for unknown charsets; guard against that.
		if e, _ := charset.Lookup(cs); e != nil {
			transformers = append(transformers, e.NewDecoder())
		}
	}

	if strings.Contains(contentType, "html") {
		transformers = append(transformers, entityDecoder{})
	}
	transformers = append(transformers, new(wordTransformer))

	ps := newPhraseScanner(conf.ContentPhraseList, func(s string) {
		tally[rule{t: contentPhrase, content: s}]++
	})
	ps.scanByte(' ')

	var t transform.Transformer
	if len(transformers) == 1 {
		t = transformers[0]
	} else {
		t = transform.Chain(transformers...)
	}

	r := transform.NewReader(bytes.NewReader(content), t)

	buf := make([]byte, 4096)
	for {
		n, err := r.Read(buf)
		for _, c := range buf[:n] {
			ps.scanByte(c)
		}
		if err != nil {
			if err != io.EOF {
				log.Println("Error decoding page content:", err)
			}
			break
		}
	}

	ps.scanByte(' ')
}
Example #15
func ExampleUTF8Validator() {
	for i := 0; i < 2; i++ {
		var transformer transform.Transformer
		transformer = unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM).NewEncoder()
		if i == 1 {
			transformer = transform.Chain(encoding.UTF8Validator, transformer)
		}
		dst := make([]byte, 256)
		src := []byte("abc\xffxyz") // src is invalid UTF-8.
		nDst, nSrc, err := transformer.Transform(dst, src, true)
		fmt.Printf("i=%d: produced %q, consumed %q, error %v\n",
			i, dst[:nDst], src[:nSrc], err)
	}
	// Output:
	// i=0: produced "\x00a\x00b\x00c\xff\xfd\x00x\x00y\x00z", consumed "abc\xffxyz", error <nil>
	// i=1: produced "\x00a\x00b\x00c", consumed "abc", error encoding: invalid UTF-8
}
Example #16
func replace(path string) {
	copy := []string{}
	r := `(<script(\s|\S)*?<\/script>)|(<style(\s|\S)*?<\/style>)|(<!--(\s|\S)*?-->)|(<\/?(\s|\S)*?>)|(nbsp;)|((?:\s)\s)|(png)|(jpeg)|(jpg)|(mpg)|(\\u0026)|(\n)|(\v)|(\r)|(\0)|(\t)|(n°)
		|(à)|(wbe)|(_)`
	regex, err := regexp.Compile(r)
	if err != nil {
		return // there was a problem with the regular expression.
	}
	c, _ := readLines(path)
	for _, v := range c {
		reg := regex.ReplaceAllString(v, " ")
		slug := utils.GenerateSlug(reg)
		regex1, _ := regexp.Compile(`((\-){1,})|(\b\w{1}\b)`)
		reg = regex1.ReplaceAllString(slug, " ")
		t := stripchars(reg, `?,.!/©*@#~()$+"'&}]|:;[{²`)
		s := strings.TrimSpace(t)
		// fmt.Println(s)

		normalize := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
		normStr1, _, _ := transform.String(normalize, s)
		// fmt.Println(normStr1)

		if len(v) > 0 {
			copy = append(copy, normStr1)
		}
	}

	// fmt.Println(cleaned, "\n")

	j := strings.Replace(strings.Join((copy), " "), " ", ",", -1)
	// fmt.Println(j)
	regex2, err := regexp.Compile(`((\,){2,})`)
	j1 := regex2.ReplaceAllString(j, ",")
	// fmt.Println(j1)
	j2 := strings.Split(j1, ",")

	cleaned := []string{}

	for _, value := range j2 {
		if !stringInSlice(value, cleaned) {
			cleaned = append(cleaned, value)
		}
	}
	createCsv(path, filenameCsv, strings.Join(cleaned, ","))
}
Example #17
// UnicodeSanitize sanitizes a string for use in Hugo URLs, allowing only
// a predefined set of special Unicode characters.
// If the RemovePathAccents configuration flag is enabled, Unicode accents
// are also removed.
func UnicodeSanitize(s string) string {
	source := []rune(s)
	target := make([]rune, 0, len(source))

	for _, r := range source {
		if unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.IsMark(r) || r == '%' || r == '.' || r == '/' || r == '\\' || r == '_' || r == '-' || r == '#' || r == '+' {
			target = append(target, r)
		}
	}

	var result string

	if viper.GetBool("RemovePathAccents") {
		// remove accents - see https://blog.golang.org/normalization
		t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
		result, _, _ = transform.String(t, string(target))
	} else {
		result = string(target)
	}

	return result
}
Example #18
func cleanSalary(input string) string {
	cleaner := transform.Chain(norm.NFD,
		transform.RemoveFunc(func(r rune) bool {
			return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
		}),
		norm.NFC)
	output, _, _ := transform.String(cleaner, input)
	output = strings.ToLower(output)
	m := reSalarySep.FindStringSubmatchIndex(output)
	if m != nil {
		output = output[:m[0]+1] + " - " + output[m[1]-1:]
	}
	for {
		m := reSalarySplit.FindStringSubmatchIndex(output)
		if m == nil {
			break
		}
		_, e1 := m[2], m[3]
		s2, _ := m[4], m[5]
		output = output[:e1] + output[s2:]
	}
	return output
}
Example #19
func cleanName(name string) string {
	name = strings.Replace(name, "ß", "ss", -1)
	name = strings.Replace(name, "Σ", "e", -1)
	name = strings.Replace(name, "æ", "a", -1)
	name = strings.Replace(name, "&", "and", -1)
	name = strings.Replace(name, "$", "s", -1)
	for _, c := range removeChars {
		name = strings.Replace(name, c, "", -1)
	}
	for _, c := range spaceChars {
		name = strings.Replace(name, c, " ", -1)
	}

	name = badChanRegex.ReplaceAllString(name, "")
	name = strings.Join(strings.Fields(name), " ")
	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	unicodeCleanedName, _, err := transform.String(t, name)

	if err == nil {
		name = unicodeCleanedName
	}

	return strings.Trim(name, ` "`)
}
Example #20
	case '\u200C':
	case '\u200D':
	case '\u2060':
	case '\uFE00':
	case '\uFE01':
	case '\uFE02':
	case '\uFE03':
	case '\uFE04':
	case '\uFE05':
	case '\uFE06':
	case '\uFE07':
	case '\uFE08':
	case '\uFE09':
	case '\uFE0A':
	case '\uFE0B':
	case '\uFE0C':
	case '\uFE0D':
	case '\uFE0E':
	case '\uFE0F':
	case '\uFEFF':
	default:
		return false
	}

	return true
})

// Stringprep implements the Stringprep Profile for User Names and Passwords
// (RFC 4013) as a transform.Transformer.
var Stringprep = transform.Chain(nonASCIISpaceTransformer, mappedToNothing, norm.NFKC)
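The rune cases in the (truncated) switch above are the tail of RFC 4013's "mapped to nothing" table: the zero-width non-joiner and joiner (U+200C, U+200D), the word joiner (U+2060), the variation selectors U+FE00–U+FE0F, and the zero-width no-break space (U+FEFF). Combined with the non-ASCII-space mapping and norm.NFKC, this chain implements the mapping and normalization half of SASLprep; the profile's prohibited-output and bidi checks are not part of this transformer.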
Example #21
func main() {
	flag.Parse()
	//	panic("Just Quit")
	getHostConfig()
	//	runtime.GOMAXPROCS(2)
	timeout = 1000
	fmt.Println("Feeds")
	//http://careers.stackoverflow.com/jobs/feed?searchTerm=big+data&location=san+francisco&range=100&distanceUnits=Miles
	//	feeds = append(feeds, Feed{index: 0, url: "http://careers.stackoverflow.com/jobs/feed?searchTerm=big+data&location=san+francisco&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false })

	feeds = append(feeds, Feed{index: 0, url: "http://careers.stackoverflow.com/jobs/feed?location=san+francisco%2c+ca&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 1, url: "http://careers.stackoverflow.com/jobs/feed?location=new+york+city%2c+ny&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 2, url: "http://careers.stackoverflow.com/jobs/feed?location=los+angeles%2c+ca&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 3, url: "http://careers.stackoverflow.com/jobs/feed?location=boston%2c+ma&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 4, url: "http://careers.stackoverflow.com/jobs/feed?location=seattle%2cwa&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 5, url: "http://careers.stackoverflow.com/jobs/feed?location=austin%2ctx&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	feeds = append(feeds, Feed{index: 6, url: "http://careers.stackoverflow.com/jobs/feed?location=chicago%2cil&range=100&distanceUnits=Miles", status: 0, itemCount: 0, complete: false, itemsComplete: false})
	mutex = &sync.Mutex{}
	skillMap = make(map[string]int, 200)
	loadSkillMapFile(skillMap)
	fmt.Println("GetRSS")
	getRSS2()
	saveSkillMapFile(skillMap)
	if conf.hbaseZkURL != "" {
		saveSkillsMapHBase(skillMap)
	}

	for i := 0; i < len(guidList); i++ {
		fmt.Println(guidList[i])
	}

	//	guidList := make([]string, 4)
	//	guidList[0] = "http://careers.stackoverflow.com/jobs/103310/senior-software-engineer-american-society-of-clinical"
	//	guidList[1] = "http://careers.stackoverflow.com/jobs/94152/senior-software-engineer-platform-flixster"
	//	guidList[2] = "http://careers.stackoverflow.com/jobs/103328/senior-full-stack-engineer-data-science-adroll"
	//	guidList[3] = "http://careers.stackoverflow.com/jobs/104086/enterprise-architect-new-relic"
	//	fmt.Printf("%v\n", s)

	// map random times & make s3names
	fw.Slice(guidList).Map(func(sURL string) URLTuple {
		fmt.Printf("Map1: %v\n", sURL)
		fName := "jobs_sof/" + strings.Replace(strings.TrimPrefix(sURL, "http://careers.stackoverflow.com/jobs/"), "/", "_", -1)
		ms := rand.Intn(3000)
		return URLTuple{sURL, fName, ms}
		//	Filter already-acquired URLs
	}).Filter(func(uTuple URLTuple) bool {
		// is file already stored in S3?
		//fmt.Printf("Filter:%s, %v\n", uTuple.s3Name, uTuple)
		svcS3 := s3.New(session.New(&aws.Config{Region: aws.String("us-east-1")}))
		var params *s3.HeadObjectInput

		params = &s3.HeadObjectInput{
			Bucket: aws.String("opps"),        // Required
			Key:    aws.String(uTuple.s3Name), // Required
		}
		hobj, _ := svcS3.HeadObject(params)

		fmt.Printf("Filter: %s => %v\n", uTuple.s3Name, hobj.ContentLength == nil)
		return hobj.ContentLength == nil
		//	get the URLs
	}).Map(func(uTuple URLTuple) statusTuple {
		fmt.Printf("Map3: %v\n", uTuple)
		// random sleep
		time.Sleep(time.Duration(uTuple.msWait) * time.Millisecond)

		// get URL
		resp, err := http.Get(uTuple.gURL)
		if err != nil {
			panic(err)
		}
		defer resp.Body.Close()

		//		fmt.Println("Body:", resp.Body)
		//		fmt.Println("Proto:", resp.Proto)
		//		fmt.Printf("response Status = <%s> / Length = %d\n", resp.Status, resp.ContentLength)
		//		fmt.Println("response Headers:", resp.Header)
		//		fmt.Printf("response %+v:\n", resp)
		//		fmt.Println("response Body:", string(body))
		failed := 0
		passed := 0
		if resp.StatusCode == 200 {
			passed = 1
		} else {
			failed = 1
		}
		// store in S3
		if passed == 1 {
			body, _ := ioutil.ReadAll(resp.Body)
			reader := strings.NewReader(string(body))
			root, err := html.Parse(reader)

			if err != nil {
				fmt.Printf("%+v\n", err)
			}

			var b bytes.Buffer
			html.Render(&b, root)
			fixedHtml := b.String()

			isOk := func(r rune) bool {
				return r < 32 || r >= 127
			}
			// The isOk filter is such that there is no need to chain to norm.NFC
			t2 := transform.Chain(norm.NFKD, transform.RemoveFunc(isOk))
			// This Transformer could also trivially be applied as an io.Reader
			// or io.Writer filter to automatically do such filtering when reading
			// or writing data anywhere.
			fixedUnicodeNFKD, _, _ := transform.String(t2, fixedHtml)

			//			fmt.Println("\n\n\n"+fixedUnicodeNFKD)
			reader = strings.NewReader(fixedUnicodeNFKD)

			xmlroot, xmlerr := xmlpath.ParseHTML(reader)
			if xmlerr != nil {
				log.Fatal(xmlerr)
			}
			//	fmt.Printf("xml root = %+v\n------\n", xmlroot)
			path := &xmlpath.Path{}
			pstr := string("")

			pstr = `/html/head/title`
			path = xmlpath.MustCompile(pstr)
			var ok bool

			title := ""
			if title, ok = path.String(xmlroot); ok {
				//		fmt.Printf("%s: %s\n", pstr, title)
			}
			fmt.Printf("**** Title: %s\n", title)
			var iter *xmlpath.Iter
			var list *xmlpath.Path
			var cnt int

			// Location - needs Trim
			pstr = `//*[@id="hed"]/ul[1]/li/text()`
			path = xmlpath.MustCompile(pstr)
			location := ""
			if location, ok = path.String(xmlroot); ok {
				//		fmt.Printf("Location - %s: %s\n", pstr, strings.Trim(location, " \n"))
				location = strings.Trim(location, " \n")
			}

			// Base Skills - LOOP from 1 until not ok
			var skills []string

			list = xmlpath.MustCompile(`//*[@id="hed"]/div[2]/p/a`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				skills = append(skills, ele)
				//		fmt.Printf("Sk-Desc: %s\n", ele)
			}

			var desc []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/p`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				desc = append(desc, ele)
				//		fmt.Printf("it-Desc1: %s\n", ele)
			}

			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/ul/li`)
			iter = list.Iter(xmlroot)
			for iter.Next() {
				ele := iter.Node().String()
				desc = append(desc, ele)
				//		fmt.Printf("it-Desc2: %s\n", ele)
			}

			var sSNR []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/p`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				//		fmt.Printf("Skills1 (%d): %s\n", cnt, ele)
				cnt++
			}

			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/ul/li/text()`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				//		fmt.Printf("Skills2(%d): %s\n", cnt, ele)
				cnt++
			}

			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[3]/ul/li/ul/li/text()`)
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				sSNR = append(sSNR, ele)
				//		fmt.Printf("Skills3(%d): %s\n", cnt, ele)
				cnt++
			}
			//
			//    // about company -
			//	pstr = `//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p/text()`
			//	//*[@id="jobdetailpage"]/div[2]/div[1]/div[2]/p[2]/text()[1]
			//	path = xmlpath.MustCompile(pstr)
			//	about := ""
			//	if about, ok = path.String(xmlroot); ok {
			//		fmt.Printf("About: %s - %s\n", pstr, about)
			//	}

			var about []string
			list = xmlpath.MustCompile(`//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p`)
			//*[@id="jobdetailpage"]/div[2]/div[1]/div[4]/p[2]/text()[1]
			iter = list.Iter(xmlroot)
			cnt = 0
			for iter.Next() {
				ele := iter.Node().String()
				about = append(about, ele)
				//		fmt.Printf("About(%d): %s\n", cnt, ele)
				cnt++
			}

			var sep string

			baseAbout := "ABOUT: "
			sep = ""
			for i := 0; i < len(about); i++ {
				baseAbout += sep + about[i]
				sep = "\n"
			}

			baseSkills := "BASESKILLS: "
			sep = ""
			//	fmt.Printf("base skills = %+v\n", skills)
			for i := 0; i < len(skills); i++ {
				baseSkills += sep + skills[i]
				sep = " "
			}

			baseReqs := "REQUIREMENTS: "
			sep = ""
			for i := 0; i < len(sSNR); i++ {
				baseReqs += sep + sSNR[i]
				sep = "\n"
			}

			baseDesc := "DESCRIPTION: "
			sep = ""
			for i := 0; i < len(desc); i++ {
				baseDesc += sep + desc[i]
				sep = "\n"
			}

			var storage string
			storage =
				uTuple.gURL + "\n\n" +
					"DATE: " + time.Now().Format(time.RFC850) + "\n\n" +
					"TITLE: " + html.UnescapeString(title) + "\n\n" +
					"LOCATION: " + html.UnescapeString(location) + "\n\n" +
					html.UnescapeString(baseSkills) + "\n\n" +
					html.UnescapeString(baseAbout) + "\n\n" +
					html.UnescapeString(baseDesc) + "\n\n" + // no second slash
					html.UnescapeString(baseReqs) + "\n"

			fmt.Printf("Storing (len = %d):\n***\n%s\n***\n", len(storage), storage)

			svcS3 := s3.New(session.New(&aws.Config{Region: aws.String("us-east-1")}))
			bucket := "opps"
			key := uTuple.s3Name
			_, err = svcS3.PutObject(&s3.PutObjectInput{
				Body:   strings.NewReader(string(storage)),
				Bucket: &bucket,
				Key:    &key,
			})
			if err != nil {
				fmt.Printf("Failed to upload data to %s/%s, %s\n", bucket, key, err)
				failed = 1
				passed = 0
			}
		}
		//		return statusTuple{passed, failed}
		return statusTuple{passed, failed}
		// count URLs
	}).Reduce(func(x statusTuple, y statusTuple) statusTuple {
		fmt.Printf("Red1: x= %v, y = %v\n", x, y)
		return statusTuple{x.pass + y.pass, x.fail + y.fail}
	}).Map(func(x statusTuple) {
		fmt.Printf("Map4 Result: passed = %d, failed = %d\n", x.pass, x.fail)
	}).Run()

}
Example #22
// NeuterAccents transforms accented characters into their plain forms.
func NeuterAccents(s string) string {
	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	result, _, _ := transform.String(t, s)

	return result
}
Example #23
func NewReader(r io.Reader) io.Reader {
	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	return transform.NewReader(r, t)
}
Example #24
func Bytes(b []byte) ([]byte, error) {
	t := transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
	res, _, err := transform.Bytes(t, b)
	return res, err
}
Example #25
func normalizer() transform.Transformer {
	isMn := func(r rune) bool {
		return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
	}
	return transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
}
Example #26
type PathSlice []Path

// Swap implements sort.Interface (and index.Swapper).
func (p PathSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] }

// Less implements sort.Interface.
func (p PathSlice) Less(i, j int) bool { return p[i].Encode() < p[j].Encode() }

// Len implements sort.Interface.
func (p PathSlice) Len() int { return len(p) }

func isMn(r rune) bool {
	return unicode.Is(unicode.Mn, r) // Mn: nonspacing marks
}

var transformer = transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)

func removeNonAlphaNumeric(s string) string {
	in := []rune(s)
	res := make([]rune, len(in))
	i := 0
	for _, x := range s {
		if x == '-' {
			res[i] = ' '
			i++
			continue
		}
		if unicode.IsLetter(x) || unicode.IsDigit(x) || unicode.IsSpace(x) {
			res[i] = unicode.ToLower(x)
			i++
		}
	}
	return string(res[:i])
}
Example #27
// NewWriter returns a writer which encodes to the given encoding from UTF-8.
//
// If enc is nil, only a UTF-8-enforcing replacement writer
// (see http://godoc.org/code.google.com/p/go.text/encoding#pkg-variables)
// is used.
func NewWriter(w io.Writer, enc encoding.Encoding) io.WriteCloser {
	if enc == nil || enc == encoding.Replacement {
		return transform.NewWriter(w, encoding.Replacement.NewEncoder())
	}
	return transform.NewWriter(w, transform.Chain(enc.NewEncoder()))
}
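Note that transform.Chain with a single transformer is functionally identical to passing enc.NewEncoder() directly; unlike NewReader above, no replacement encoder is chained after it here.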
Example #28
func init() {
	stripT = transform.Chain(
		norm.NFD,
		transform.RemoveFunc(isMn),
		norm.NFC)
}