Ejemplo n.º 1
0
// transEnc decodes text into UTF-8 by trying candidate character encodings
// in order and returning the output of the first one that decodes cleanly.
//
// The candidates are the caller-supplied encode label (when non-empty)
// followed by "sjis" and "utf-8". Labels that charset.Lookup does not know
// are skipped. When every known candidate fails, the empty string is
// returned together with the error of the last failed attempt, so callers
// can tell a failed conversion from a genuinely empty result.
//
// Based on the Qiita article "Character encoding detection in Golang":
// qiita.com/nobuhito/items/ff782f64e32f7ed95e43
func transEnc(text string, encode string) (string, error) {
	body := []byte(text)

	encodings := []string{"sjis", "utf-8"}
	if encode != "" {
		encodings = append([]string{encode}, encodings...)
	}

	var lastErr error
	for _, enc := range encodings {
		ee, _ := charset.Lookup(enc)
		if ee == nil {
			continue
		}
		var buf bytes.Buffer
		ic := transform.NewWriter(&buf, ee.NewDecoder())
		if _, err := ic.Write(body); err != nil {
			lastErr = err
			continue
		}
		// Close flushes whatever the decoder is still holding back.
		if err := ic.Close(); err != nil {
			lastErr = err
			continue
		}
		return buf.String(), nil
	}
	return "", lastErr
}
Ejemplo n.º 2
0
// defaultCharsetReader wraps input in a reader that decodes it from the
// character set labelled cs. It returns an error when charset.Lookup does
// not recognise the label.
func defaultCharsetReader(cs string, input io.Reader) (io.Reader, error) {
	if enc, _ := charset.Lookup(cs); enc != nil {
		decoder := enc.NewDecoder()
		return transform.NewReader(input, decoder), nil
	}
	return nil, fmt.Errorf("cannot decode charset %v", cs)
}
Ejemplo n.º 3
0
// Encoding looks up the encoding registered for the response's Charset
// label. It returns an error when no encoding is known under that label.
func (res *Response) Encoding() (encoding.Encoding, error) {
	if found, _ := charset.Lookup(res.Charset); found != nil {
		return found, nil
	}
	return nil, fmt.Errorf("no encoding found for %s", res.Charset)
}
Ejemplo n.º 4
0
// ToUtf8WithErr converts content to a UTF-8 string using the charset
// reported by DetectEncoding. Note the (error, string) result order.
//
//   - Detection failure: the detection error and an empty string.
//   - Detected label "utf8": content is returned unchanged.
//   - Label unknown to charset.Lookup: an error and the unchanged content.
//   - Decoding failure: the error plus the successfully decoded prefix
//     followed by the untouched remainder, so that no data is lost.
func ToUtf8WithErr(content []byte) (error, string) {
	label, err := DetectEncoding(content)
	switch {
	case err != nil:
		return err, ""
	case label == "utf8":
		return nil, string(content)
	}

	enc, _ := charset.Lookup(label)
	if enc == nil {
		return fmt.Errorf("unknow char decoder %s", label), string(content)
	}

	decoded, consumed, err := transform.String(enc.NewDecoder(), string(content))
	if err == nil {
		return nil, decoded
	}
	// Keep the decoded prefix and append the raw bytes that were not consumed.
	return err, decoded + string(content[consumed:])
}
Ejemplo n.º 5
0
// to_utf8 first encodes str to Shift-JIS bytes and then decodes those bytes
// back to UTF-8, trying the "sjis" label first and "utf-8" second. The
// output of the first decoder that finishes without error is returned.
//
// NOTE(review): the original header read "Shift-JIS -> UTF-8", but the
// first step runs str through a Shift-JIS *encoder*, i.e. it treats str as
// UTF-8 input. Confirm with callers which direction is intended.
//
// An error is returned when str cannot be encoded to Shift-JIS, or when
// every decoder fails (the error of the last failed attempt is reported).
func to_utf8(str string) (string, error) {
	body, err := ioutil.ReadAll(transform.NewReader(strings.NewReader(str), japanese.ShiftJIS.NewEncoder()))
	if err != nil {
		return "", err
	}

	var lastErr error
	for _, enc := range []string{"sjis", "utf-8"} {
		ee, _ := charset.Lookup(enc)
		if ee == nil {
			continue
		}
		var buf bytes.Buffer
		ic := transform.NewWriter(&buf, ee.NewDecoder())
		if _, err := ic.Write(body); err != nil {
			lastErr = err
			continue
		}
		// Close flushes whatever the decoder is still holding back.
		if err := ic.Close(); err != nil {
			lastErr = err
			continue
		}
		return buf.String(), nil
	}
	return "", lastErr
}
Ejemplo n.º 6
0
// NewUTF8Reader returns a reader that decodes r from the charset named by
// label. The decoder is wrapped in unicode.BOMOverride before use. An error
// is returned when charset.Lookup does not recognise label.
func NewUTF8Reader(label string, r io.Reader) (io.Reader, error) {
	enc, _ := charset.Lookup(label)
	if enc != nil {
		decoder := unicode.BOMOverride(enc.NewDecoder())
		return transform.NewReader(r, decoder), nil
	}
	return nil, fmt.Errorf("unsupported charset: %q", label)
}
Ejemplo n.º 7
0
// encodeString runs input_str through the encoder of the charset named by
// encode and returns the encoded bytes as a string.
//
// It returns an error when charset.Lookup does not recognise encode
// (previously a nil-pointer panic) or when the encoder fails (previously
// the failure was dropped by a naked return and reported as a nil error).
func encodeString(input_str, encode string) (output string, err_out error) {
	enc, _ := charset.Lookup(encode)
	if enc == nil {
		return "", fmt.Errorf("unsupported charset: %q", encode)
	}
	r := transform.NewReader(strings.NewReader(input_str), enc.NewEncoder())
	b, err := ioutil.ReadAll(r)
	if err != nil {
		return "", err
	}
	return string(b), nil
}
Ejemplo n.º 8
0
// parseHTML parses content as an HTML document. Unless cs is exactly
// "utf-8", the bytes are first decoded from the charset labelled cs.
//
// It returns an error when cs is not recognised by charset.Lookup
// (previously a nil-pointer panic), or whatever error html.Parse reports.
func parseHTML(content []byte, cs string) (*html.Node, error) {
	var r io.Reader = bytes.NewReader(content)

	if cs != "utf-8" {
		e, _ := charset.Lookup(cs)
		if e == nil {
			return nil, fmt.Errorf("unsupported charset: %q", cs)
		}
		r = transform.NewReader(r, e.NewDecoder())
	}

	return html.Parse(r)
}
Ejemplo n.º 9
0
Archivo: mail.go Proyecto: kaey/mail
// decodeCharset decodes str from the charset named by label. When label is
// not recognised by charset.Lookup, the encoding is instead determined from
// the content via charset.DetermineEncoding with content type "text/plain".
//
// On a decoding error the partially decoded text is returned together with
// the error; on success the decoded text is passed through stripNonUTF8.
func decodeCharset(str, label string) (nstr string, err error) {
	enc, _ := charset.Lookup(label)
	if enc == nil {
		sniffed, _, _ := charset.DetermineEncoding([]byte(str), "text/plain")
		enc = sniffed
	}

	decoded, _, decErr := transform.String(enc.NewDecoder(), str)
	if decErr != nil {
		return decoded, decErr
	}
	return stripNonUTF8(decoded), nil
}
Ejemplo n.º 10
0
// EncodeReader reads all of s, runs it through the encoder of the charset
// named by enc, and returns the encoded bytes.
//
// It returns an error when enc is not recognised by charset.Lookup, when
// copying from s fails, or when the final flush of the encoder fails.
func EncodeReader(s io.Reader, enc string) ([]byte, error) {
	e, _ := charset.Lookup(enc)
	if e == nil {
		return nil, fmt.Errorf("unsupported charset: %q", enc)
	}
	var buf bytes.Buffer
	writer := transform.NewWriter(&buf, e.NewEncoder())
	if _, err := io.Copy(writer, s); err != nil {
		return nil, err
	}
	// The transform writer may hold back input it could not yet convert;
	// Close flushes it, so it must run before buf is read.
	if err := writer.Close(); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}
Ejemplo n.º 11
0
// scanContent scans the content of a document for phrases from
// conf.ContentPhraseList and increments tally for every phrase found.
//
// Content whose contentType contains "javascript" is delegated to
// scanJSContent. Otherwise the bytes are passed through a transformer
// chain: a charset decoder (unless cs is "utf-8"), an entityDecoder when
// contentType contains "html", and finally a wordTransformer. The
// transformed bytes are fed one at a time to a phrase scanner, bracketed
// by a leading and a trailing space.
//
// When cs is not recognised by charset.Lookup the problem is logged and the
// content is scanned without charset decoding (previously this was a
// nil-pointer panic). Decoding errors other than io.EOF are logged and end
// the scan.
func (conf *config) scanContent(content []byte, contentType, cs string, tally map[rule]int) {
	if strings.Contains(contentType, "javascript") {
		conf.scanJSContent(content, tally)
		return
	}

	transformers := make([]transform.Transformer, 0, 3)
	if cs != "utf-8" {
		if e, _ := charset.Lookup(cs); e != nil {
			transformers = append(transformers, e.NewDecoder())
		} else {
			log.Printf("Unknown charset %q; scanning page content without decoding", cs)
		}
	}

	if strings.Contains(contentType, "html") {
		transformers = append(transformers, entityDecoder{})
	}
	transformers = append(transformers, new(wordTransformer))

	ps := newPhraseScanner(conf.ContentPhraseList, func(s string) {
		tally[rule{t: contentPhrase, content: s}]++
	})
	ps.scanByte(' ')

	// The wordTransformer is always present, so the slice is never empty.
	var t transform.Transformer
	if len(transformers) == 1 {
		t = transformers[0]
	} else {
		t = transform.Chain(transformers...)
	}

	r := transform.NewReader(bytes.NewReader(content), t)

	buf := make([]byte, 4096)
	for {
		n, err := r.Read(buf)
		// Process the bytes that were read before looking at err: a Read
		// may return data together with an error.
		for _, c := range buf[:n] {
			ps.scanByte(c)
		}
		if err != nil {
			if err != io.EOF {
				log.Println("Error decoding page content:", err)
			}
			break
		}
	}

	ps.scanByte(' ')
}
Ejemplo n.º 12
0
// convertToUtf8 detects the charset of s with chardet and decodes s from
// that charset. It never returns an error: when detection, encoding lookup
// or decoding fails, the returned string is an angle-bracketed diagnostic
// message describing the failure instead of the converted text.
func convertToUtf8(s string) string {
	best, err := chardet.NewTextDetector().DetectBest([]byte(s))
	if err != nil {
		return fmt.Sprintf("<Can't detect string charset: %s>", err.Error())
	}

	enc, _ := charset.Lookup(best.Charset)
	if enc == nil {
		return fmt.Sprintf("<Can't find encoding: %s>", best.Charset)
	}

	converted, _, err := transform.String(enc.NewDecoder(), s)
	if err != nil {
		return fmt.Sprintf("<Can't convert string from encoding %s to UTF8: %s>", best.Charset, err.Error())
	}
	return converted
}
Ejemplo n.º 13
0
// convToUTF8 determines the charset of an HTML response and, when one is
// found, replaces r.Body with a reader that decodes it.
//
// Nothing happens unless media.IsHTML reports r.ContentType as HTML. The
// encoding is determined from preview and the content type. If the result
// is the uncertain "windows-1252" fallback, the label returned by
// query(r.URL) is looked up instead and, when known, treated as certain.
// r.Charset, r.CertainCharset and r.Encoding are always updated with the
// outcome (possibly an empty name and nil encoding).
//
// r.Body is only replaced when the decoding reader could be created; a
// failure leaves the original body in place instead of overwriting it with
// a nil reader.
func (r *Response) convToUTF8(preview []byte, query func(*url.URL) string) {
	if !media.IsHTML(r.ContentType) {
		return
	}

	e, name, certain := charset.DetermineEncoding(
		preview, r.ContentType,
	)
	// According to the charset package source, the default for an unknown
	// charset is windows-1252.
	if !certain && name == "windows-1252" {
		if e, name = charset.Lookup(query(r.URL)); e != nil {
			certain = true
		}
	}
	r.Charset, r.CertainCharset, r.Encoding = name, certain, e

	if name == "" || e == nil {
		return
	}
	if body, err := util.NewUTF8Reader(name, r.Body); err == nil {
		r.Body = body
	}
}
Ejemplo n.º 14
0
// decode converts data from the charset named by charsetName and returns
// the decoded bytes. When isUTF8Charset reports charsetName as a UTF-8
// charset, the result is additionally passed through stripBOM.
//
// An error is returned when charsetName is not recognised by
// charset.Lookup or when decoding fails.
func decode(data []byte, charsetName string) ([]byte, error) {
	enc, _ := charset.Lookup(charsetName)
	if enc == nil {
		return nil, fmt.Errorf("Unsupported charset: %v", charsetName)
	}

	var out bytes.Buffer
	sink := bufio.NewWriter(&out)
	src := transform.NewReader(bytes.NewReader(data), enc.NewDecoder())

	_, err := io.Copy(sink, src)
	if err == nil {
		err = sink.Flush()
	}
	if err != nil {
		return nil, err
	}

	decoded := out.Bytes()
	if isUTF8Charset(charsetName) {
		decoded = stripBOM(decoded)
	}
	return decoded, nil
}
Ejemplo n.º 15
0
// main reads lines from standard input, decoding them from the charset
// named by the cs flag unless it is "utf-8", and writes a reformatted
// version of each line to standard output:
//
//   - lines containing ">,<" are dropped;
//   - lines containing "><" followed later by ">" are split into the text
//     up to and including the first ">", the text between "<" and the next
//     ">", and the trimmed remainder, printed separated by spaces;
//   - all other lines are printed unchanged.
//
// An unknown charset terminates the program with a log message (previously
// this was a nil-pointer panic). Scanner errors are logged after the loop.
func main() {
	flag.Parse()

	var in io.Reader = os.Stdin

	if *cs != "utf-8" {
		e, _ := charset.Lookup(*cs)
		if e == nil {
			log.Fatalf("unsupported charset: %q", *cs)
		}
		in = transform.NewReader(in, e.NewDecoder())
	}

	s := bufio.NewScanner(in)
	for s.Scan() {
		line := s.Text()
		if strings.Contains(line, ">,<") {
			continue
		}

		endPhrase := strings.Index(line, "><")
		if endPhrase != -1 {
			// phrase keeps its closing '>', rest starts after the '<'.
			phrase := line[:endPhrase+1]
			rest := line[endPhrase+2:]
			endScore := strings.Index(rest, ">")
			if endScore != -1 {
				score := rest[:endScore]
				rest = strings.TrimSpace(rest[endScore+1:])
				fmt.Println(phrase, score, rest)
				continue
			}
		}

		fmt.Println(line)
	}
	if err := s.Err(); err != nil {
		log.Println(err)
	}
}
Ejemplo n.º 16
0
// UTF8encode converts a string from the source character set to UTF-8,
// skipping invalid byte sequences.
//
// When sourceCharset is not recognised by charset.Lookup, a message is
// printed to standard output and raw is returned unchanged.
//
// Only the bytes actually produced by the decoder are returned; the unused
// tail of the working buffer is no longer included as trailing NUL bytes.
//
// @see http://stackoverflow.com/questions/32512500/ignore-illegal-bytes-when-decoding-text-with-go
func UTF8encode(raw string, sourceCharset string) string {
	enc, name := charset.Lookup(sourceCharset)
	if nil == enc {
		fmt.Println("Cannot convert from", sourceCharset, ":", name)
		return raw
	}

	// Convert once instead of re-slicing and re-allocating on every pass.
	src := []byte(raw)
	dst := make([]byte, len(src))
	d := enc.NewDecoder()

	var (
		in  int // bytes of src consumed so far
		out int // bytes of dst produced so far
	)
	for in < len(src) {
		// Do the transformation.
		ndst, nsrc, err := d.Transform(dst[out:], src[in:], true)
		in += nsrc
		out += ndst
		if err == nil {
			// Completed transformation.
			break
		}
		if err == transform.ErrShortDst {
			// The output buffer is too small: grow it, keeping what has
			// been produced so far, and retry from the current position.
			t := make([]byte, (cap(dst)+1)*2)
			copy(t, dst[:out])
			dst = t
			continue
		}
		// We are here because of at least one illegal character. Skip over
		// the current rune and try again.
		_, width := utf8.DecodeRuneInString(raw[in:])
		in += width
	}
	return string(dst[:out])
}
Ejemplo n.º 17
0
// ParsePatch reads diff output from reader line by line and builds a Diff
// made of files, sections and lines.
//
// A line starting with DIFF_HEAD opens a new DiffFile; a line starting with
// '@' opens a new DiffSection in the current file and resets the left/right
// line counters from the hunk header; lines starting with ' ', '+' and '-'
// are recorded as plain, added and deleted lines; a line starting with
// "Binary" marks the current file as binary. Lines starting with "+++ " or
// "--- " and empty lines are ignored.
//
// Once maxlines significant lines have been seen, isTooLong is set and no
// further sections or files are started.
//
// After parsing, the content of each file is passed to base.DetectEncoding;
// when the detected charset is not "UTF-8" and is known to charset.Lookup,
// every line is decoded with it (lines that fail to decode are left as is).
//
// pid and cmd are not used in the body. The returned error is always nil.
//
// NOTE(review): curFile stays nil until the first DIFF_HEAD line, so a
// '@', '+', '-' or "Binary" line appearing earlier dereferences a nil
// pointer. scanner.Err() is never checked.
func ParsePatch(pid int64, maxlines int, cmd *exec.Cmd, reader io.Reader) (*Diff, error) {
	scanner := bufio.NewScanner(reader)
	var (
		curFile    *DiffFile
		curSection = &DiffSection{
			Lines: make([]*DiffLine, 0, 10),
		}

		leftLine, rightLine int
		isTooLong           bool
		// FIXME: Should use cache in the future.
		buf bytes.Buffer
	)

	diff := &Diff{Files: make([]*DiffFile, 0)}
	// i counts the significant (non-skipped) lines seen so far.
	var i int
	for scanner.Scan() {
		line := scanner.Text()
		// fmt.Println(i, line)
		if strings.HasPrefix(line, "+++ ") || strings.HasPrefix(line, "--- ") {
			continue
		}

		if line == "" {
			continue
		}

		i = i + 1

		// Diff data too large, we only show the first about maxlines lines
		if i == maxlines {
			isTooLong = true
			log.Warn("Diff data too large")
		}

		switch {
		case line[0] == ' ':
			diffLine := &DiffLine{Type: DIFF_LINE_PLAIN, Content: line, LeftIdx: leftLine, RightIdx: rightLine}
			leftLine++
			rightLine++
			curSection.Lines = append(curSection.Lines, diffLine)
			continue
		case line[0] == '@':
			if isTooLong {
				// This break only leaves the switch; execution continues
				// with the DIFF_HEAD check below.
				break
			}

			curSection = &DiffSection{}
			curFile.Sections = append(curFile.Sections, curSection)
			ss := strings.Split(line, "@@")
			diffLine := &DiffLine{Type: DIFF_LINE_SECTION, Content: line}
			curSection.Lines = append(curSection.Lines, diffLine)

			// Parse line number.
			ranges := strings.Split(ss[1][1:], " ")
			leftLine, _ = com.StrTo(strings.Split(ranges[0], ",")[0][1:]).Int()
			if len(ranges) > 1 {
				rightLine, _ = com.StrTo(strings.Split(ranges[1], ",")[0]).Int()
			} else {
				log.Warn("Parse line number failed: %v", line)
				rightLine = leftLine
			}
			continue
		case line[0] == '+':
			curFile.Addition++
			diff.TotalAddition++
			diffLine := &DiffLine{Type: DIFF_LINE_ADD, Content: line, RightIdx: rightLine}
			rightLine++
			curSection.Lines = append(curSection.Lines, diffLine)
			continue
		case line[0] == '-':
			curFile.Deletion++
			diff.TotalDeletion++
			diffLine := &DiffLine{Type: DIFF_LINE_DEL, Content: line, LeftIdx: leftLine}
			if leftLine > 0 {
				leftLine++
			}
			curSection.Lines = append(curSection.Lines, diffLine)
			// No continue here: unlike the other cases, control falls out
			// of the switch to the DIFF_HEAD check below.
		case strings.HasPrefix(line, "Binary"):
			curFile.IsBin = true
			continue
		}

		// Get new file.
		if strings.HasPrefix(line, DIFF_HEAD) {
			if isTooLong {
				// Outside the switch, so this break ends the scan loop.
				break
			}

			// The text after DIFF_HEAD is split at its midpoint and the
			// first half is used as the file path.
			beg := len(DIFF_HEAD)
			a := line[beg : (len(line)-beg)/2+beg]

			// In case file name is surrounded by double quotes(it happens only in git-shell).
			if a[0] == '"' {
				a = a[1 : len(a)-1]
				a = strings.Replace(a, `\"`, `"`, -1)
			}

			curFile = &DiffFile{
				// Everything after the first "/" of the path.
				Name:     a[strings.Index(a, "/")+1:],
				Index:    len(diff.Files) + 1,
				Type:     DIFF_FILE_CHANGE,
				Sections: make([]*DiffSection, 0, 10),
			}
			diff.Files = append(diff.Files, curFile)

			// Check file diff type.
			// NOTE(review): Type was already set to DIFF_FILE_CHANGE above;
			// if that constant is > 0 this loop stops after one line.
			for scanner.Scan() {
				switch {
				case strings.HasPrefix(scanner.Text(), "new file"):
					curFile.Type = DIFF_FILE_ADD
					curFile.IsDeleted = false
					curFile.IsCreated = true
				case strings.HasPrefix(scanner.Text(), "deleted"):
					curFile.Type = DIFF_FILE_DEL
					curFile.IsCreated = false
					curFile.IsDeleted = true
				case strings.HasPrefix(scanner.Text(), "index"):
					curFile.Type = DIFF_FILE_CHANGE
					curFile.IsCreated = false
					curFile.IsDeleted = false
				}
				if curFile.Type > 0 {
					break
				}
			}
		}
	}

	// Re-encode each file's lines when a non-UTF-8 charset is detected.
	for _, f := range diff.Files {
		buf.Reset()
		for _, sec := range f.Sections {
			for _, l := range sec.Lines {
				buf.WriteString(l.Content)
				buf.WriteString("\n")
			}
		}
		charsetLabel, err := base.DetectEncoding(buf.Bytes())
		if charsetLabel != "UTF-8" && err == nil {
			encoding, _ := charset.Lookup(charsetLabel)
			if encoding != nil {
				d := encoding.NewDecoder()
				for _, sec := range f.Sections {
					for _, l := range sec.Lines {
						// Lines that fail to decode keep their original content.
						if c, _, err := transform.String(d, l.Content); err == nil {
							l.Content = c
						}
					}
				}
			}
		}
	}
	return diff, nil
}
Ejemplo n.º 18
0
// ParsePatch reads diff output from reader line by line and builds a Diff
// made of files, sections and lines.
//
// A line starting with DIFF_HEAD opens a new DiffFile whose old and new
// paths are taken from the "a/..." and "b/..." parts of the header; a line
// starting with '@' opens a new DiffSection and resets the left/right line
// counters from the hunk header; lines starting with ' ', '+' and '-' are
// recorded as plain, added and deleted lines; a line starting with "Binary"
// marks the current file as binary. Lines starting with "+++ " or "--- "
// and empty lines are ignored.
//
// A file is flagged IsIncomplete when its line count reaches maxLines or a
// line reaches maxLineCharacteres bytes. When the number of files reaches
// maxFiles, the diff is flagged IsIncomplete, the rest of reader is
// discarded and parsing stops.
//
// After parsing, the content of each file is passed to base.DetectEncoding;
// when the detected charset is not "UTF-8" and is known to charset.Lookup,
// every line is decoded with it (lines that fail to decode are left as is).
//
// A non-EOF read error is returned wrapped as "ReadString: ...".
//
// NOTE(review): curFile stays nil until the first DIFF_HEAD line, so the
// IsIncomplete assignment and the '@', '+', '-' and "Binary" cases
// dereference a nil pointer if such a line appears earlier.
func ParsePatch(maxLines, maxLineCharacteres, maxFiles int, reader io.Reader) (*Diff, error) {
	var (
		diff = &Diff{Files: make([]*DiffFile, 0)}

		curFile    *DiffFile
		curSection = &DiffSection{
			Lines: make([]*DiffLine, 0, 10),
		}

		leftLine, rightLine int
		lineCount           int
		curFileLinesCount   int
	)

	input := bufio.NewReader(reader)
	isEOF := false
	for !isEOF {
		line, err := input.ReadString('\n')
		if err != nil {
			if err == io.EOF {
				// The final, possibly unterminated, line is still processed
				// below before the loop ends.
				isEOF = true
			} else {
				return nil, fmt.Errorf("ReadString: %v", err)
			}
		}

		if len(line) > 0 && line[len(line)-1] == '\n' {
			// Remove line break.
			line = line[:len(line)-1]
		}

		if strings.HasPrefix(line, "+++ ") || strings.HasPrefix(line, "--- ") || len(line) == 0 {
			continue
		}

		curFileLinesCount++
		lineCount++

		// Diff data too large, we only show the first about maxlines lines
		if curFileLinesCount >= maxLines || len(line) >= maxLineCharacteres {
			curFile.IsIncomplete = true
		}

		switch {
		case line[0] == ' ':
			diffLine := &DiffLine{Type: DIFF_LINE_PLAIN, Content: line, LeftIdx: leftLine, RightIdx: rightLine}
			leftLine++
			rightLine++
			curSection.Lines = append(curSection.Lines, diffLine)
			continue
		case line[0] == '@':
			curSection = &DiffSection{}
			curFile.Sections = append(curFile.Sections, curSection)
			ss := strings.Split(line, "@@")
			diffLine := &DiffLine{Type: DIFF_LINE_SECTION, Content: line}
			curSection.Lines = append(curSection.Lines, diffLine)

			// Parse line number.
			ranges := strings.Split(ss[1][1:], " ")
			leftLine, _ = com.StrTo(strings.Split(ranges[0], ",")[0][1:]).Int()
			if len(ranges) > 1 {
				rightLine, _ = com.StrTo(strings.Split(ranges[1], ",")[0]).Int()
			} else {
				log.Warn("Parse line number failed: %v", line)
				rightLine = leftLine
			}
			continue
		case line[0] == '+':
			curFile.Addition++
			diff.TotalAddition++
			diffLine := &DiffLine{Type: DIFF_LINE_ADD, Content: line, RightIdx: rightLine}
			rightLine++
			curSection.Lines = append(curSection.Lines, diffLine)
			continue
		case line[0] == '-':
			curFile.Deletion++
			diff.TotalDeletion++
			diffLine := &DiffLine{Type: DIFF_LINE_DEL, Content: line, LeftIdx: leftLine}
			if leftLine > 0 {
				leftLine++
			}
			curSection.Lines = append(curSection.Lines, diffLine)
			// No continue here: unlike the other cases, control falls out
			// of the switch to the DIFF_HEAD check below.
		case strings.HasPrefix(line, "Binary"):
			curFile.IsBin = true
			continue
		}

		// Get new file.
		if strings.HasPrefix(line, DIFF_HEAD) {
			middle := -1

			// Note: In case file name is surrounded by double quotes (it happens only in git-shell).
			// e.g. diff --git "a/xxx" "b/xxx"
			hasQuote := line[len(DIFF_HEAD)] == '"'
			if hasQuote {
				middle = strings.Index(line, ` "b/`)
			} else {
				middle = strings.Index(line, " b/")
			}

			// NOTE(review): middle is -1 when the separator is not found,
			// which makes the slice expressions below panic.
			beg := len(DIFF_HEAD)
			a := line[beg+2 : middle]
			b := line[middle+3:]
			if hasQuote {
				a = string(git.UnescapeChars([]byte(a[1 : len(a)-1])))
				b = string(git.UnescapeChars([]byte(b[1 : len(b)-1])))
			}

			curFile = &DiffFile{
				Name:     a,
				Index:    len(diff.Files) + 1,
				Type:     DIFF_FILE_CHANGE,
				Sections: make([]*DiffSection, 0, 10),
			}
			diff.Files = append(diff.Files, curFile)
			if len(diff.Files) >= maxFiles {
				diff.IsIncomplete = true
				// Drain the remaining input before giving up on it.
				io.Copy(ioutil.Discard, reader)
				// Outside the switch, so this break ends the read loop.
				break
			}
			curFileLinesCount = 0

			// Check file diff type and is submodule.
			// NOTE(review): Type was already set to DIFF_FILE_CHANGE above;
			// if that constant is > 0 this loop stops after one line.
			for {
				line, err := input.ReadString('\n')
				if err != nil {
					if err == io.EOF {
						isEOF = true
					} else {
						return nil, fmt.Errorf("ReadString: %v", err)
					}
				}

				switch {
				case strings.HasPrefix(line, "new file"):
					curFile.Type = DIFF_FILE_ADD
					curFile.IsCreated = true
				case strings.HasPrefix(line, "deleted"):
					curFile.Type = DIFF_FILE_DEL
					curFile.IsDeleted = true
				case strings.HasPrefix(line, "index"):
					curFile.Type = DIFF_FILE_CHANGE
				case strings.HasPrefix(line, "similarity index 100%"):
					curFile.Type = DIFF_FILE_RENAME
					curFile.IsRenamed = true
					curFile.OldName = curFile.Name
					curFile.Name = b
				}
				if curFile.Type > 0 {
					// line still carries its trailing newline here.
					if strings.HasSuffix(line, " 160000\n") {
						curFile.IsSubmodule = true
					}
					break
				}
			}
		}
	}

	// FIXME: detect encoding while parsing.
	var buf bytes.Buffer
	for _, f := range diff.Files {
		buf.Reset()
		for _, sec := range f.Sections {
			for _, l := range sec.Lines {
				buf.WriteString(l.Content)
				buf.WriteString("\n")
			}
		}
		charsetLabel, err := base.DetectEncoding(buf.Bytes())
		if charsetLabel != "UTF-8" && err == nil {
			encoding, _ := charset.Lookup(charsetLabel)
			if encoding != nil {
				d := encoding.NewDecoder()
				for _, sec := range f.Sections {
					for _, l := range sec.Lines {
						// Lines that fail to decode keep their original content.
						if c, _, err := transform.String(d, l.Content); err == nil {
							l.Content = c
						}
					}
				}
			}
		}
	}
	return diff, nil
}
Ejemplo n.º 19
0
// main is the command-line entry point. It:
//
//  1. parses os.Args by hand into package-level option variables (short
//     flags may be bundled; long options accept "--name=value" or
//     "--name value");
//  2. resolves the input encodings (--enc or JVGREP_ENCODINGS) and the
//     output encoding (JVGREP_OUTPUT_ENCODING);
//  3. builds the search pattern, either a fixed string or a compiled
//     regular expression, from the first positional argument or from the
//     file given with -f;
//  4. compiles the exclude pattern and decides whether colour is used;
//  5. greps standard input when no file arguments remain, otherwise turns
//     each argument into a directory/file regular expression pair, walks
//     the tree and sends every matching regular file to GoGrep through a
//     channel.
//
// Fatal problems are reported with errorline followed by os.Exit(1).
func main() {
	var args []string

	argv := os.Args
	argc := len(argv)
	for n := 1; n < argc; n++ {
		if len(argv[n]) > 1 && argv[n][0] == '-' && argv[n][1] != '-' {
			// Short option(s): only the character right after '-' is
			// examined in this pass.
			switch argv[n][1] {
			case 'A':
				if n < argc-1 {
					after, _ = strconv.Atoi(argv[n+1])
					n++
					continue
				}
			case 'B':
				if n < argc-1 {
					before, _ = strconv.Atoi(argv[n+1])
					n++
					continue
				}
			case '8':
				utf8out = true
			case 'F':
				fixed = true
			case 'R':
				recursive = true
			case 'S':
				verbose = true
			case 'c':
				count = true
			case 'r':
				fullpath = false
			case 'i':
				ignorecase = true
			case 'I':
				ignorebinary = true
			case 'l':
				list = true
			case 'n':
				number = true
			case 'P':
				perl = true
			case 'G':
				basic = true
			case 'v':
				invert = true
			case 'o':
				only = true
			case 'f':
				if n < argc-1 {
					infile = argv[n+1]
					n++
					continue
				}
			case 'z':
				zeroData = true
			case 'Z':
				zeroFile = true
			case 'V':
				fmt.Fprintf(os.Stdout, "%s\n", version)
				os.Exit(0)
			default:
				usage(true)
			}
			// Bundled short flags: drop the flag just handled and process
			// the same argument again on the next iteration.
			if len(argv[n]) > 2 {
				argv[n] = "-" + argv[n][2:]
				n--
			}
		} else if len(argv[n]) > 1 && argv[n][0] == '-' && argv[n][1] == '-' {
			// Long option: "--name=value" or "--name value".
			name := argv[n][2:]
			switch {
			case strings.HasPrefix(name, "enc="):
				encs = name[4:]
			case name == "enc" && n < argc-1:
				encs = argv[n+1]
				n++
			case strings.HasPrefix(name, "exclude="):
				exclude = name[8:]
			case name == "exclude" && n < argc-1:
				exclude = argv[n+1]
				n++
			case strings.HasPrefix(name, "color="):
				color = name[6:]
			case name == "color" && n < argc-1:
				color = argv[n+1]
				n++
			case strings.HasPrefix(name, "separator="):
				separator = name[10:]
			// NOTE(review): unlike the sibling cases there is no
			// n < argc-1 guard, so a trailing "--separator" indexes past
			// the end of argv.
			case name == "separator":
				separator = argv[n+1]
				n++
			case name == "null":
				zeroFile = true
			case name == "null-data":
				zeroData = true
			case name == "help":
				usage(false)
			default:
				usage(true)
			}
		} else {
			// Positional argument: pattern or path.
			args = append(args, argv[n])
		}
	}

	if len(args) == 0 {
		usage(true)
	}

	var err error
	var pattern interface{}
	// The --enc option takes precedence over the environment variable.
	if encs != "" {
		encodings = strings.Split(encs, ",")
	} else {
		enc_env := os.Getenv("JVGREP_ENCODINGS")
		if enc_env != "" {
			encodings = strings.Split(enc_env, ",")
		}
	}
	out_enc := os.Getenv("JVGREP_OUTPUT_ENCODING")
	if out_enc != "" {
		ee, _ := charset.Lookup(out_enc)
		if ee == nil {
			errorline(fmt.Sprintf("unknown encoding: %s", out_enc))
			os.Exit(1)
		}
		// oc wraps standard output with an encoder for the output charset.
		oc = transform.NewWriter(os.Stdout, ee.NewEncoder())
	}

	// The pattern comes from the -f file when given, otherwise from the
	// first positional argument; argindex marks where paths begin in args.
	instr := ""
	argindex := 0
	if len(infile) > 0 {
		b, err := ioutil.ReadFile(infile)
		if err != nil {
			errorline(err.Error())
			os.Exit(1)
		}
		instr = strings.TrimSpace(string(b))
	} else {
		instr = args[0]
		argindex = 1
	}
	// pattern holds either a plain string (fixed) or a *regexp.Regexp.
	if fixed {
		pattern = instr
	} else if perl {
		// Parse with Perl syntax, then recompile the resulting program's
		// string form with the regexp package.
		re, err := syntax.Parse(instr, syntax.Perl)
		if err != nil {
			errorline(err.Error())
			os.Exit(1)
		}
		rec, err := syntax.Compile(re)
		if err != nil {
			errorline(err.Error())
			os.Exit(1)
		}
		instr = rec.String()
		if ignorecase {
			instr = "(?i:" + instr + ")"
		}
		pattern, err = regexp.Compile(instr)
		if err != nil {
			errorline(err.Error())
			os.Exit(1)
		}
	} else {
		if ignorecase {
			instr = "(?i:" + instr + ")"
		}
		pattern, err = regexp.Compile(instr)
		if err != nil {
			errorline(err.Error())
			os.Exit(1)
		}
	}

	// Exclude pattern precedence: option, environment, built-in default.
	if exclude == "" {
		exclude = os.Getenv("JVGREP_EXCLUDE")
	}
	if exclude == "" {
		exclude = excludeDefaults
	}
	ere, err := regexp.Compile(exclude)
	if err != nil {
		errorline(err.Error())
		os.Exit(1)
	}

	// Colour mode: "auto" (or unset) follows whether stdout is a terminal.
	atty := false
	if color == "" {
		color = os.Getenv("JVGREP_COLOR")
	}
	if color == "" || color == "auto" {
		atty = isatty.IsTerminal(os.Stdout.Fd())
	} else if color == "always" {
		atty = true
	} else if color == "never" {
		atty = false
	} else {
		usage(true)
	}

	if atty {
		// Reset the terminal colour before exiting on a signal.
		sc := make(chan os.Signal, 10)
		signal.Notify(sc, syscall.SIGTERM, syscall.SIGINT, syscall.SIGHUP)
		go func() {
			for _ = range sc {
				ct.ResetColor()
				os.Exit(0)
			}
		}()
	}

	// Only the pattern was given on the command line: grep standard input.
	if len(args) == 1 && argindex != 0 {
		Grep(&GrepArg{
			pattern: pattern,
			input:   os.Stdin,
			single:  true,
			atty:    atty,
		})
		return
	}

	// envre matches a path component that is exactly $NAME or $(NAME).
	envre := regexp.MustCompile(`^(\$[a-zA-Z][a-zA-Z0-9_]+|\$\([a-zA-Z][a-zA-Z0-9_]+\))$`)
	globmask := ""

	// Files to search are handed to the GoGrep goroutine over ch; a nil
	// value is sent once all arguments are processed, and done is awaited
	// before returning.
	ch := make(chan *GrepArg, 10)
	done := make(chan int)
	go GoGrep(ch, done)
	nargs := len(args[argindex:])
	for _, arg := range args[argindex:] {
		globmask = ""
		root := ""
		arg = strings.Trim(arg, `"`)
		// Rebuild the argument component by component: root becomes the
		// part before the first component containing '*', and "~" and
		// environment references are expanded along the way.
		for n, i := range strings.Split(filepath.ToSlash(arg), "/") {
			if root == "" && strings.Index(i, "*") != -1 {
				if globmask == "" {
					root = "."
				} else {
					root = filepath.ToSlash(globmask)
				}
			}
			if n == 0 && i == "~" {
				if runtime.GOOS == "windows" {
					i = os.Getenv("USERPROFILE")
				} else {
					i = os.Getenv("HOME")
				}
			}
			if envre.MatchString(i) {
				i = strings.Trim(strings.Trim(os.Getenv(i[1:]), "()"), `"`)
			}

			globmask = filepath.Join(globmask, i)
			if n == 0 {
				if runtime.GOOS == "windows" && filepath.VolumeName(i) != "" {
					globmask = i + "/"
				} else if len(globmask) == 0 {
					// An empty first component means the path was absolute.
					globmask = "/"
				}
			}
		}
		if root == "" {
			// No wildcard in the argument: it names a file or a directory.
			path, _ := filepath.Abs(arg)
			fi, err := os.Stat(path)
			if err != nil {
				errorline(fmt.Sprintf("jvgrep: %s: No such file or directory", arg))
				os.Exit(1)
			}
			if !fi.IsDir() {
				if verbose {
					println("search:", path)
				}
				ch <- &GrepArg{
					pattern: pattern,
					input:   path,
					single:  nargs == 1,
					atty:    atty,
				}
				continue
			} else {
				root = path
				// fi is known to be a directory in this branch, so the
				// else arm below is never taken.
				if fi.IsDir() {
					globmask = "**/*"
				} else {
					globmask = "**/" + globmask
				}
			}
		}
		if globmask == "" {
			globmask = "."
		}
		globmask = filepath.ToSlash(filepath.Clean(globmask))
		if recursive {
			if strings.Index(globmask, "/") > -1 {
				globmask += "/"
			} else {
				globmask = "**/" + globmask
			}
		}

		// Translate the glob into regular expressions: "**/" becomes
		// "(.*/)?", a single "*" becomes "[^/]*", and every character that
		// is not '/', an ASCII letter or digit, or above 255 is emitted as
		// an escaped character class.
		cc := []rune(globmask)
		dirmask := ""
		filemask := ""
		for i := 0; i < len(cc); i++ {
			if cc[i] == '*' {
				if i < len(cc)-2 && cc[i+1] == '*' && cc[i+2] == '/' {
					filemask += "(.*/)?"
					dirmask = filemask
					i += 2
				} else {
					filemask += "[^/]*"
				}
			} else {
				c := cc[i]
				if c == '/' || ('0' <= c && c <= '9') || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || 255 < c {
					filemask += string(c)
				} else {
					filemask += fmt.Sprintf("[\\x%x]", c)
				}
				if c == '/' && dirmask == "" && strings.Index(filemask, "*") != -1 {
					dirmask = filemask
				}
			}
		}
		if dirmask == "" {
			dirmask = filemask
		}
		if len(filemask) > 0 && filemask[len(filemask)-1] == '/' {
			if root == "" {
				root = filemask
			}
			filemask += "[^/]*"
		}
		// Match case-insensitively on Windows and macOS.
		if runtime.GOOS == "windows" || runtime.GOOS == "darwin" {
			dirmask = "(?i:" + dirmask + ")"
			filemask = "(?i:" + filemask + ")"
		}
		dre := regexp.MustCompile("^" + dirmask)
		fre := regexp.MustCompile("^" + filemask + "$")

		root = filepath.Clean(root)

		if verbose {
			println("dirmask:", dirmask)
			println("filemask:", filemask)
			println("root:", root)
		}
		// Walk root: skip excluded paths, prune directories that cannot
		// match, and queue every regular file matching the file mask.
		filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
			if info == nil {
				return err
			}

			path = filepath.ToSlash(path)

			if ere != nil && ere.MatchString(path) {
				if info.IsDir() {
					return filepath.SkipDir
				}
				return nil
			}

			if info.IsDir() {
				if path == "." || recursive || len(path) <= len(root) || dre.MatchString(path+"/") {
					return nil
				}
				return filepath.SkipDir
			}

			if fre.MatchString(path) && info.Mode().IsRegular() {
				if verbose {
					println("search:", path)
				}
				ch <- &GrepArg{
					pattern: pattern,
					input:   path,
					single:  false,
					atty:    atty,
				}
			}
			return nil
		})
	}
	// A nil argument is the end-of-work marker for the GoGrep goroutine.
	ch <- nil
	if count {
		fmt.Println(countMatch)
	}
	<-done
}
Ejemplo n.º 20
0
// doGrep searches the content f of the file at path for arg.pattern and
// reports matches through matchedline, matchedfile and errorline.
//
// The content is tried against each candidate encoding in turn (the
// package-level encodings list, or UTF-16BE/LE when f starts with the
// corresponding byte-order mark): f is decoded with that encoding, split
// into lines at '\n' (a trailing '\r' is dropped), and every line is
// matched against the pattern, which is either a *regexp.Regexp or a fixed
// string. The package-level option variables (only, invert, list, count,
// number, before, after, ignorecase, ignorebinary, verbose) control what is
// reported. The encoding loop stops once an encoding produced output or the
// whole content has been scanned.
//
// Bytes in the range 0x01..0x08 are used throughout as the marker for
// binary content.
func doGrep(path string, f []byte, arg *GrepArg) {
	encs := encodings

	if ignorebinary {
		if bytes.IndexFunc(f, func(r rune) bool { return 0 < r && r < 0x9 }) != -1 {
			return
		}
	}

	// Strip a UTF-16 byte-order mark and remember it in arg.bom.
	if len(f) > 2 {
		if f[0] == 0xfe && f[1] == 0xff {
			arg.bom = f[0:2]
			f = f[2:]
		} else if f[0] == 0xff && f[1] == 0xfe {
			arg.bom = f[0:2]
			f = f[2:]
		}
	}
	// A byte-order mark fixes the encoding to the matching UTF-16 variant.
	if len(arg.bom) > 0 {
		if arg.bom[0] == 0xfe && arg.bom[1] == 0xff {
			encs = []string{"utf-16be"}
		} else if arg.bom[0] == 0xff && arg.bom[1] == 0xfe {
			encs = []string{"utf-16le"}
		}
	}

	for _, enc := range encs {
		if verbose {
			println("trying("+enc+"):", path)
		}
		if len(arg.bom) > 0 && enc != "utf-16be" && enc != "utf-16le" {
			continue
		}

		// did records whether this encoding produced any result.
		// n is the 1-based line number, t the current line, l its length,
		// prev/next the start of the current line and the scan position in
		// f; next == -1 means the end of f has been reached (or decoding
		// failed).
		did := false
		var t []byte
		var n, l, size, next, prev int

		if enc != "" {
			// Decode only when a BOM was seen or the content does not look
			// binary.
			if len(arg.bom) > 0 || bytes.IndexFunc(f, func(r rune) bool { return 0 < r && r < 0x9 }) == -1 {
				ee, _ := charset.Lookup(enc)
				if ee == nil {
					continue
				}
				var buf bytes.Buffer
				ic := transform.NewWriter(&buf, ee.NewDecoder())
				_, err := ic.Write(f)
				if err != nil {
					next = -1
					continue
				}
				// Pad odd-length UTF-16 input with a zero byte so the last
				// code unit is complete; the extra output byte is removed
				// again below.
				lf := false
				if len(arg.bom) > 0 && len(f)%2 != 0 {
					ic.Write([]byte{0})
					lf = true
				}
				err = ic.Close()
				if err != nil {
					if verbose {
						println(err.Error())
					}
					next = -1
					continue
				}
				// NOTE(review): f is overwritten with the decoded bytes, so
				// a later iteration would decode already-decoded data.
				f = buf.Bytes()
				if lf {
					f = f[:len(f)-1]
				}
			}
		}
		size = len(f)
		if size == 0 {
			continue
		}

		// Line loop: one iteration per line of f.
		for next != -1 {
			// Advance next to the following '\n' or mark the end with -1.
			for {
				if next >= size {
					next = -1
					break
				}
				if f[next] == '\n' {
					break
				}
				next++
			}
			n++
			if next == -1 {
				t = f[prev:]
			} else {
				t = f[prev:next]
				prev = next + 1
				next++
			}

			l = len(t)
			if l > 0 && t[l-1] == '\r' {
				t = t[:l-1]
				l--
			}

			var match bool
			if only {
				// "Only matching" mode: collect the matched substrings of
				// the line instead of testing the line as a whole.
				var matches []string
				ts := string(t)
				if re, ok := arg.pattern.(*regexp.Regexp); ok {
					matches = re.FindAllString(ts, -1)
				} else if s, ok := arg.pattern.(string); ok {
					if ignorecase {
						ts = strings.ToLower(ts)
					}
					ti := 0
					tl := len(ts)
					// NOTE(review): strings.Index returns an offset
					// relative to ts[ti:], yet it is stored back into ti as
					// if it were absolute — verify the intended counting.
					for ti != -1 && ti < tl-1 {
						ti = strings.Index(ts[ti:], s)
						if ti != -1 {
							matches = append(matches, s)
							ti++
						}
					}
				}
				match = len(matches) > 0
				// skip if not match without invert, or match with invert.
				if match == invert {
					continue
				}
				if verbose {
					println("found("+enc+"):", path)
				}
				if list {
					matchedfile(path)
					did = true
					break
				}
				for _, m := range matches {
					countMatch++
					if count {
						continue
					}
					if strings.IndexFunc(
						m, func(r rune) bool {
							return 0 < r && r < 0x9
						}) != -1 {
						errorline(fmt.Sprintf("matched binary file: %s", path))
						did = true
						break
					} else {
						if number {
							if utf8.ValidString(m) {
								matchedline(path, n, m, arg)
							} else {
								errorline(fmt.Sprintf("matched binary file: %s", path))
								did = true
								break
							}
						} else {
							if utf8.ValidString(m) {
								matchedline("", 0, m, arg)
							} else {
								errorline(fmt.Sprintf("matched binary file: %s", path))
								did = true
								break
							}
						}
					}
				}
			} else {
				// Whole-line mode: test whether the line matches at all.
				if re, ok := arg.pattern.(*regexp.Regexp); ok {
					if len(re.FindAllIndex(t, 1)) > 0 {
						match = true
					}
				} else if s, ok := arg.pattern.(string); ok {
					if ignorecase {
						if strings.Index(strings.ToLower(string(t)),
							strings.ToLower(s)) > -1 {
							match = true
						}
					} else {
						if strings.Index(string(t), s) > -1 {
							match = true
						}
					}
				}
				// skip if not match without invert, or match with invert.
				if match == invert {
					continue
				}
				if verbose {
					println("found("+enc+"):", path)
				}
				if list {
					matchedfile(path)
					did = true
					break
				}
				countMatch++
				if count {
					did = true
					continue
				}
				if arg.single && !number {
					if utf8.Valid(t) {
						matchedline("", -1, string(t), arg)
					} else {
						errorline(fmt.Sprintf("matched binary file: %s", path))
						did = true
						break
					}
				} else {
					if bytes.IndexFunc(
						t, func(r rune) bool {
							return 0 < r && r < 0x9
						}) != -1 {
						errorline(fmt.Sprintf("matched binary file: %s", path))
						did = true
						break
					} else if utf8.Valid(t) {
						if after <= 0 && before <= 0 {
							matchedline(path, n, string(t), arg)
						} else {
							// Context output: print up to `before` lines
							// preceding the match, the match itself, then
							// up to `after` lines following it.
							if countMatch > 1 {
								os.Stdout.WriteString("---\n")
							}
							// Walk backwards through f to collect the
							// preceding lines (nearest first).
							bprev, bnext := next-l-2, next-l-2
							lines := make([]string, 0)
							for i := 0; i < before && bprev > 0; i++ {
								for {
									if bprev == 0 || f[bprev-1] == '\n' {
										lines = append(lines, string(f[bprev:bnext]))
										bnext = bprev - 1
										bprev--
										break
									}
									bprev--
								}
							}
							// Emit them farthest first so they appear in
							// file order.
							for i := len(lines); i > 0; i-- {
								matchedline(path, i-n, lines[i-1], arg)
							}
							matchedline(path, n, string(t), arg)
							// Walk forwards through f to collect the
							// following lines.
							lines = make([]string, 0)
							aprev, anext := next, next
							for i := 0; i < after && anext >= 0 && anext < size; i++ {
								for {
									if anext == size || f[anext] == '\n' {
										lines = append(lines, string(f[aprev:anext]))
										aprev = anext + 1
										anext++
										break
									}
									anext++
								}
							}
							for i := 0; i < len(lines); i++ {
								matchedline(path, -n-i-1, lines[i], arg)
							}
						}
					} else {
						errorline(fmt.Sprintf("matched binary file: %s", path))
						did = true
						break
					}
				}
			}
			did = true
		}
		runtime.GC()
		// Stop trying further encodings once this one produced a result or
		// the content was scanned to the end.
		if did || next == -1 {
			break
		}
	}
}