Beispiel #1
0
// Test_eq walks every fixture in good: the first element is the source text
// handed to the scanner, the remaining elements are the symbols the scanner
// is expected to return, in order. The test aborts on the first symbol that
// is reported as not ok or that differs from the expectation.
func Test_eq(t *testing.T) {
	for _, fixture := range good {
		sc := scanner.New([]byte(fixture[0]))
		for _, want := range fixture[1:] {
			got, valid := sc.Symbol()
			if valid && want == got {
				continue
			}
			t.Fatal(valid, want, got)
		}
	}
}
Beispiel #2
0
// Parse scans src symbol by symbol, classifies every symbol as a token and
// hands it to file through file.Push. It stops at the end of the input or at
// the first error, which is returned.
//
// Design notes carried over from the original author: clean source has no
// superfluous placeholders or comments, and parsing consists of picking the
// clean tokens that make up the current node. Indentation, placeholders,
// comments, separators, semicolons and newlines are only stored and never
// become the current node. Commas, semicolons and newlines are used to
// produce the FFinal mark and to switch the current node.
func Parse(src []byte, file *ast.File) (err error) {
	var (
		tabKind bool // indentation style: set once TABS indentation has been seen
	)

	scan := scanner.New(src)
	for err == nil && !scan.IsEOF() {
		pos := scan.Pos()
		code, ok := scan.Symbol()

		if !ok {
			err = errors.New("invalid UTF-8 encode")
			break
		}

		tok := token.Lookup(code)

		// Root node: it only holds declarations and placeholders, so anything
		// that is not a declaration is folded into a placeholder.
		if file.Active == file {
			if !tok.As(token.Declare) {
				// Placeholder scan: keep consuming until a declaration token
				// or EOF shows up.
				// NOTE(review): the append order "Tail + tmp" is kept as in the
				// original; confirm against scanner.Tail semantics.
				var tmp string
				start := pos // the placeholder begins here; pos moves on below
				for ok && tok != token.EOF && !tok.As(token.Declare) {
					code += scan.Tail(true) + tmp
					pos = scan.Pos()
					tmp, ok = scan.Symbol()
					tok = token.Lookup(tmp)
				}
				if !ok {
					err = errors.New("invalid UTF-8 encode")
					break
				}

				// Push the placeholder with its own start position, not with
				// the position of the token that terminated it.
				if err = file.Push(start, token.PLACEHOLDER, code); err != nil {
					break
				}
				code = tmp
			}
			err = file.Push(pos, tok, code)
			continue
		}

		last := file.Last
		// Dirty tokens are all handled by File and do not affect the current
		// node.
		switch tok {

		case token.SPACES:
			// Mixing SPACES and TABS for indentation is not supported.
			if last.Token() == token.INDENTATION ||
				tabKind && last.Token() == token.NL {
				err = errors.New("parser: bad indentation style for TABS + SPACES")
				continue
			}
			if last.Token() == token.NL {
				tok = token.INDENTATION
				break
			}
			// Separator spaces are discarded.
			continue

		case token.TABS:
			if last.Token() == token.INDENTATION {
				err = errors.New("parser: bad indentation style for SPACES + TABS")
				continue
			}
			if last.Token() == token.NL {
				tok = token.INDENTATION
				tabKind = true
			} else {
				// TABS that do not start a line introduce a trailing comment.
				code += scan.Tail(false)
				tok = token.COMMENT
			}
		case token.COMMENT:
			err = file.Push(pos, tok, code+scan.Tail(false))
			continue
		case token.COMMENTS:
			// Block comment: consume symbols up to and including the closing
			// COMMENTS marker. closed stays false when the input ends first,
			// including the case where the opener is the very last symbol.
			closed := false
			for !closed && !scan.IsEOF() {
				tmp, valid := scan.Symbol()
				if !valid {
					err = errors.New("invalid UTF-8 encode")
					break
				}
				code += tmp
				closed = token.Lookup(tmp) == token.COMMENTS
			}
			switch {
			case err != nil:
				// Scanner failure already recorded.
			case !closed:
				err = errors.New("parser: COMMENTS is incomplete")
			default:
				err = file.Push(pos, token.COMMENTS, code+scan.Tail(false))
			}
			continue
		case token.DOT: // MEMBER, SUGAR
		case token.TRUE, token.FALSE:
			tok = token.VALBOOL
		case token.NAN, token.INFINITE:
			tok = token.VALFLOAT
		// case token.NULL:
		case token.PLACEHOLDER:
			// Resolve the meaning: only literals, identifiers and members are
			// left at this point.
			if code == "\"" || code == "'" {
				// Read the complete string.
				code += scan.EndString(code == "\"")
				// NOTE(review): a string that closes exactly at the end of the
				// input is also reported as incomplete here — confirm intent.
				if scan.IsEOF() {
					err = errors.New("parser: string is incomplete")
					continue
				}
				tok = token.VALSTRING
				break
			}
			// Integer, float, datetime.
			// ??? strict validation is missing
			if code[0] >= '0' && code[0] <= '9' {
				tok = token.VALINTEGER
				if code[0] == '0' && len(code) > 2 && (code[1] == 'x' || code[1] == 'b') {
					// 0x… / 0b… prefix: kept as an integer without inspecting
					// the remaining characters.
				} else {
					for _, c := range code {
						if c == '.' || c == 'e' {
							tok = token.VALFLOAT
						} else if c == 'T' || c == ':' || c == 'Z' {
							tok = token.VALDATETIME
						} else if (c < '0' || c > '9') && c != '+' && c != '-' && c != '_' {
							tok = token.PLACEHOLDER
							break
						}
					}
				}
			} else {
				// Identifier or member access.
				tok = token.IDENT
				dot := 0
				for _, c := range code {
					if c == '.' {
						dot++
						continue
					}

					if c != '_' && !(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') {
						tok = token.PLACEHOLDER
						break
					}
				}
				if dot != 0 && tok == token.IDENT {
					// One dot is a MEMBER, several dots are MEMBERS.
					if dot == 1 {
						tok = token.MEMBER
					} else {
						tok = token.MEMBERS
					}
				}
			}
		}

		if err == nil {
			err = file.Push(pos, tok, code)
		}
	}
	return
}
Beispiel #3
0
// Fast quickly scans, converts and merges the tokens of the zxx source src.
//
// When cb is not nil it receives every resulting token one by one, including
// the final EOF, and a non-nil error returned by cb stops the scan. When cb
// is nil the returned nodes holds every resulting token, excluding EOF.
//
// Blank lines, placeholders and comments are merged into a single
// PLACEHOLDER token that is emitted right before the next ordinary token, or
// at the end of the input.
//
// Known limitation (from the original author): Fast decides what is a
// top-level placeholder by analysing the indentation of the source, which may
// affect top-level placeholders that start with English (non-multibyte) text.
// Conventional indentation, or starting such a placeholder with '//' or
// '---', works around the limitation.
func Fast(src []byte, cb func(scanner.Pos, token.Token, string) error) (nodes []Symbol, err error) {
	var eml, indent string // pending merged placeholder text / pending indentation
	var delay, tok, prev token.Token

	if cb == nil {
		nodes = make([]Symbol, 0, len(src)/10)
	}

	// emit delivers one finished token to cb, or appends it to nodes when no
	// callback was supplied.
	emit := func(pos scanner.Pos, tok token.Token, code string) error {
		if cb == nil {
			nodes = append(nodes, Symbol{pos, tok, code})
			return nil
		}
		return cb(pos, tok, code)
	}

	// flush emits the pending merged placeholder and, when the previous token
	// was an indentation, the pending indentation. pos is the position of the
	// token that follows them.
	flush := func(pos scanner.Pos) error {
		if eml != "" {
			if e := emit(pos.Offset(-len(eml)), token.PLACEHOLDER, eml); e != nil {
				return e
			}
			eml = ""
		}
		if prev == token.INDENTATION {
			if e := emit(pos.Offset(-len(indent)), token.INDENTATION, indent); e != nil {
				return e
			}
			indent = ""
		}
		return nil
	}

	// rec merges blank lines, placeholders and comments into pending
	// PLACEHOLDER text and forwards every other token after flushing what is
	// pending.
	rec := func(pos scanner.Pos, tok token.Token, code string) error {
		switch tok {
		case token.NL:
			// delay compares equal to token.EOF only until it is first
			// assigned (assumes token.EOF is the zero Token — TODO confirm);
			// in that state the newline is forwarded as an ordinary token.
			if delay == token.EOF {
				delay = token.PLACEHOLDER
				break
			}
			// A newline right after another newline or after an indentation
			// is a blank line: merge it into the pending placeholder.
			if prev == tok || prev == token.INDENTATION {
				eml += code
				delay = token.PLACEHOLDER
				return nil
			}
		case token.PLACEHOLDER, token.COMMENT:
			if prev == token.INDENTATION {
				// The indentation belongs to the placeholder text.
				eml += indent
				indent = ""
			}
			eml += code
			delay = token.PLACEHOLDER
			return nil
		case token.INDENTATION:
			// Held back until the following token shows what it belongs to.
			indent = code
			return nil
		}

		if e := flush(pos); e != nil {
			return e
		}
		return emit(pos, tok, code)
	}

	// finish handles the end of the input exactly once: with a callback the
	// pending tokens and EOF are delivered, without one only the pending
	// tokens are appended so that nodes never contains EOF.
	finish := func(pos scanner.Pos, code string) error {
		if cb == nil {
			return flush(pos)
		}
		return rec(pos, token.EOF, code)
	}

	tabKind := false
	isTop := true
	scan := scanner.New(src)

	for err == nil {

		pos := scan.Pos()
		code, ok := scan.Symbol()
		if !ok {
			err = errors.New("invalid UTF-8 encode")
			return nodes, err
		}

		prev = tok
		tok = token.Lookup(code)

		if tok == token.EOF {
			err = finish(pos, code)
			return nodes, err
		}

		if isTop {
			// Only the very first symbol is treated as top level: everything
			// up to the first declaration token becomes one placeholder.
			isTop = false
			if !tok.As(token.Declare) {
				// NOTE(review): the append order "Tail + tmp" is kept as in the
				// original; confirm against scanner.Tail semantics.
				var tmp string
				posi := pos // start of the placeholder; pos moves on below
				for ok && tok != token.EOF && !tok.As(token.Declare) {
					code += scan.Tail(true) + tmp
					pos = scan.Pos()
					tmp, ok = scan.Symbol()
					tok = token.Lookup(tmp)
				}

				if !ok {
					err = errors.New("invalid UTF-8 encode")
					return nodes, err
				}
				err = rec(posi, token.PLACEHOLDER, code)
				code = tmp
			}
			if err == nil {
				if tok == token.EOF {
					// The whole input was a placeholder: finish here instead
					// of forwarding EOF as an ordinary token and scanning
					// past the end.
					err = finish(pos, code)
					return nodes, err
				}
				err = rec(pos, tok, code)
			}
			continue
		}

		switch tok {

		case token.SPACES:
			// Mixing SPACES and TABS for indentation is not supported.
			if prev == token.INDENTATION ||
				tabKind && prev == token.NL {
				err = errors.New("parser: bad indentation style for TABS + SPACES")
				return nodes, err
			}
			if prev == token.NL {
				tok = token.INDENTATION
				break
			}
			// Separator spaces are discarded.
			continue

		case token.TABS:
			if prev == token.INDENTATION {
				err = errors.New("parser: bad indentation style for SPACES + TABS")
				return nodes, err
			}
			if prev == token.NL {
				tok = token.INDENTATION
				tabKind = true
			} else {
				// TABS that do not start a line introduce a trailing comment.
				code += scan.Tail(false)
				tok = token.COMMENT
			}
		case token.COMMENT:
			err = rec(pos, tok, code+scan.Tail(false))
			continue
		case token.COMMENTS:
			// Block comment: consume symbols up to and including the closing
			// COMMENTS marker; reaching EOF first is an error.
			for {
				tmp, valid := scan.Symbol()
				if !valid {
					err = errors.New("invalid UTF-8 encode")
					return nodes, err
				}
				code += tmp
				tok = token.Lookup(tmp)
				if tok == token.COMMENTS || tok == token.EOF {
					break
				}
			}
			if tok != token.COMMENTS {
				err = errors.New("parser: COMMENTS is incomplete")
				return nodes, err
			}
			err = rec(pos, tok, code+scan.Tail(false))
			continue
		case token.TRUE, token.FALSE:
			tok = token.VALBOOL
		case token.NAN, token.INFINITE:
			tok = token.VALFLOAT
		case token.PLACEHOLDER:
			// Resolve the meaning: only literals, identifiers and members are
			// left at this point.
			if code == `"` || code == `'` {
				// Read the complete string. A lone opening quote (nothing was
				// appended) or a mismatching last byte means the closing
				// quote is missing.
				code += scan.EndString(code == `"`)
				if len(code) < 2 || code[0] != code[len(code)-1] {
					err = errors.New("parser: string is incomplete")
					return nodes, err
				}
				tok = token.VALSTRING
				break
			}
			// Integer, float, datetime.
			// ??? strict validation is missing
			if code[0] >= '0' && code[0] <= '9' {
				tok = token.VALINTEGER
				if code[0] == '0' && len(code) > 2 && (code[1] == 'x' || code[1] == 'b') {
					// 0x… / 0b… prefix: kept as an integer without inspecting
					// the remaining characters.
				} else {
					for _, c := range code {
						if c == '.' || c == 'e' {
							tok = token.VALFLOAT
						} else if c == 'T' || c == ':' || c == 'Z' {
							tok = token.VALDATETIME
						} else if (c < '0' || c > '9') && c != '+' && c != '-' && c != '_' {
							tok = token.PLACEHOLDER
							break
						}
					}
				}
			} else {
				// Identifier or member access.
				tok = token.IDENT
				dot := 0
				for _, c := range code {
					if c == '.' {
						dot++
						continue
					}

					if c != '_' && !(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') {
						tok = token.PLACEHOLDER
						break
					}
				}
				if dot != 0 && tok == token.IDENT {
					// One dot is a MEMBER, several dots are MEMBERS.
					if dot == 1 {
						tok = token.MEMBER
					} else {
						tok = token.MEMBERS
					}
				}
			}
		}
		err = rec(pos, tok, code)
	}
	return nodes, err
}