// Fast 快速解析, 转换, 合并 zxx 源码 src 中的 Token. // // 参数 rec 用于逐个接收解析到的 Token, 包括 EOF. // 如果 rec 为 nil, 返回值 nodes 包含所有的 Token, 不包括 EOF. // // 缺陷: // // Fast 通过分析源码缩进判断顶层占位, 这可能对英文(非多字节)开始的顶层占位有影响. // 常规的缩进或用 '//', '---' 开始英文顶层占位可以弥补缺陷. // func Fast(src []byte, cb func(scanner.Pos, token.Token, string) error) (nodes []Symbol, err error) { var eml, indent string var delay, tok, prev token.Token if cb == nil { nodes = make([]Symbol, 0, len(src)/10) } rec := func(pos scanner.Pos, tok token.Token, code string) (err error) { // 合并空白行和占位为 PLACEHOLDER switch tok { case token.NL: if delay == token.EOF { delay = token.PLACEHOLDER break } if prev == tok { eml += code if delay == token.EOF { delay = token.NL } else { delay = token.PLACEHOLDER } return } if prev == token.INDENTATION { eml += code delay = token.PLACEHOLDER return } break case token.PLACEHOLDER, token.COMMENT: if prev == token.INDENTATION { eml += indent indent = "" } eml += code delay = token.PLACEHOLDER return case token.INDENTATION: indent = code return } if eml != "" { if cb == nil { nodes = append(nodes, Symbol{pos.Offset(-len(eml)), token.PLACEHOLDER, eml}) } else { err = cb(pos.Offset(-len(eml)), token.PLACEHOLDER, eml) if err != nil { return } } eml = "" } if prev == token.INDENTATION { if cb == nil { nodes = append(nodes, Symbol{pos.Offset(-len(indent)), token.INDENTATION, indent}) } else { err = cb(pos.Offset(-len(indent)), token.INDENTATION, indent) if err != nil { return } } indent = "" } if cb == nil { nodes = append(nodes, Symbol{pos, tok, code}) } else { err = cb(pos, tok, code) } return } tabKind := false isTop := true scan := scanner.New(src) for err == nil { pos := scan.Pos() code, ok := scan.Symbol() //fmt.Println(pos, code, token.Lookup(code)) if !ok { err = errors.New("invalid UTF-8 encode") return } prev = tok tok = token.Lookup(code) if tok == token.EOF { if nodes == nil { err = rec(pos, tok, code) } return } if isTop { isTop = false if !tok.As(token.Declare) { var tmp string posi := pos for ok && tok != token.EOF && !tok.As(token.Declare) { code += scan.Tail(true) + tmp pos = scan.Pos() tmp, ok = scan.Symbol() tok = token.Lookup(tmp) } if !ok { err = errors.New("invalid UTF-8 encode") return } err = rec(posi, token.PLACEHOLDER, code) code = tmp } if err == nil { err = rec(pos, tok, code) } continue } switch tok { case token.SPACES: // 不支持 SPACES, TABS 混搭缩进 if prev == token.INDENTATION || tabKind && prev == token.NL { err = errors.New("parser: bad indentation style for TABS + SPACES") return } if prev == token.NL { tok = token.INDENTATION break } // 丢弃分隔空格 continue case token.TABS: if prev == token.INDENTATION { err = errors.New("parser: bad indentation style for SPACES + TABS") return } if prev == token.NL { tok = token.INDENTATION tabKind = true } else { // TABS 尾注释 code += scan.Tail(false) tok = token.COMMENT } case token.COMMENT: err = rec(pos, tok, code+scan.Tail(false)) continue case token.COMMENTS: // 完整块注释 for { tmp, _ := scan.Symbol() code += tmp tok = token.Lookup(tmp) if tok == token.COMMENTS || tok == token.EOF { break } } if tok != token.COMMENTS { err = errors.New("parser: COMMENTS is incomplete") return } err = rec(pos, tok, code+scan.Tail(false)) continue case token.TRUE, token.FALSE: tok = token.VALBOOL case token.NAN, token.INFINITE: tok = token.VALFLOAT case token.PLACEHOLDER: // 识别语义, 只剩下字面值和标识符, 成员 if code == `"` || code == `'` { // 完整字符串 code += scan.EndString(code == `"`) if code[0] != code[len(code)-1] { err = errors.New("parser: string is incomplete") return } tok = token.VALSTRING break } // 整数, 浮点数, datetime // ??? 缺少严格检查 if code[0] >= '0' && code[0] <= '9' { tok = token.VALINTEGER if code[0] == '0' && len(code) > 2 && (code[1] == 'x' || code[1] == 'b') { } else { for _, c := range code { if c == '.' || c == 'e' { tok = token.VALFLOAT } else if c == 'T' || c == ':' || c == 'Z' { tok = token.VALDATETIME } else if (c < '0' || c > '9') && c != '+' && c != '-' && c != '_' { tok = token.PLACEHOLDER break } } } } else { // 标识符, 成员 tok = token.IDENT dot := 0 for _, c := range code { if c == '.' { dot++ continue } if c != '_' && !(c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' || c >= '0' && c <= '9') { tok = token.PLACEHOLDER break } } if dot != 0 && tok == token.IDENT { if dot == 1 { tok = token.MEMBER } else { tok = token.MEMBERS } } } } err = rec(pos, tok, code) } return }
// File.Push 接收扫描到的 Token, func (b *File) Push(pos scanner.Pos, tok token.Token, code string) error { var flag Flag switch tok { case token.NL: flag = FText last := b.Last.Token() if last == token.NL { tok = token.EMPTYLINE // 转 break } if last == token.INDENTATION || last == token.EMPTYLINE { text := b.Last.(*Text) text.Source += code text.Tok = token.EMPTYLINE // 合并 return nil } last = b.Active.Token() if last == token.RIGHT || last == token.IDENT || last.As(token.Literal) { err := b.Active.Final() if err != nil { return err } flag |= FFinal } // 识别 Python 缩进风格 case token.RIGHT: case token.COMMENT, token.COMMENTS: flag = FText if b.Last.Token() == token.EMPTYLINE || b.Last.Token() == token.PLACEHOLDER { tok = token.PLACEHOLDER // 转 break } // 先不使用合并 tok = token.COMMENT case token.PLACEHOLDER, token.INDENTATION: // 统一处理右括号闭合 flag = FText default: if tok > token.PLACEHOLDER || b.Active.Kind(FFile|FBlock|FText) != 0 { return errors.New("ast: Oop! invalid " + tok.String()) } if b.expect != nil { // expect 尝试生成 Text 节点, 并且 tok 不变 if _, pass := b.expect.Eat(tok); pass { flag = FText b.expect = nil break } b.expect = nil } base := Base{ Tok: tok, Pos: pos, Source: code, } b.Active.resolve(&base) if base.Flag == 0 { return errors.New("ast: Oop! invalid " + tok.String()) } return b.add(base) case token.EOF: // 最后的闭合检查 } if flag == 0 { return errors.New("ast: Oop! invalid " + tok.String()) } return b.add(Base{ Flag: flag, Tok: tok, Pos: pos, Source: code, }) }