func TestReadWriteRune(t *testing.T) { const NRune = 1000 byteBuf := new(bytes.Buffer) w := NewWriter(byteBuf) // Write the runes out using WriteRune buf := make([]byte, utf8.UTFMax) for rune := 0; rune < NRune; rune++ { size := utf8.EncodeRune(rune, buf) nbytes, err := w.WriteRune(rune) if err != nil { t.Fatalf("WriteRune(0x%x) error: %s", rune, err) } if nbytes != size { t.Fatalf("WriteRune(0x%x) expected %d, got %d", rune, size, nbytes) } } w.Flush() r := NewReader(byteBuf) // Read them back with ReadRune for rune := 0; rune < NRune; rune++ { size := utf8.EncodeRune(rune, buf) nr, nbytes, err := r.ReadRune() if nr != rune || nbytes != size || err != nil { t.Fatalf("ReadRune(0x%x) got 0x%x,%d not 0x%x,%d (err=%s)", r, nr, nbytes, r, size, err) } } }
func TestRuneIO(t *testing.T) { const NRune = 1000 // Built a test array while we write the data b := make([]byte, utf8.UTFMax*NRune) var buf Buffer n := 0 for r := 0; r < NRune; r++ { size := utf8.EncodeRune(r, b[n:]) nbytes, err := buf.WriteRune(r) if err != nil { t.Fatalf("WriteRune(0x%x) error: %s", r, err) } if nbytes != size { t.Fatalf("WriteRune(0x%x) expected %d, got %d", r, size, nbytes) } n += size } b = b[0:n] // Check the resulting bytes if !Equal(buf.Bytes(), b) { t.Fatalf("incorrect result from WriteRune: %q not %q", buf.Bytes(), b) } // Read it back with ReadRune for r := 0; r < NRune; r++ { size := utf8.EncodeRune(r, b) nr, nbytes, err := buf.ReadRune() if nr != r || nbytes != size || err != nil { t.Fatalf("ReadRune(0x%x) got 0x%x,%d not 0x%x,%d (err=%s)", r, nr, nbytes, r, size, err) } } }
func (p *Trie) outputDot(vec *vector.StringVector, rune int, serial int64, rgen *rand.Rand) { this := make([]byte, 10) child := make([]byte, 10) utf8.EncodeRune(this, rune) thisChar := string(this[0]) if serial == -1 { thisChar = "root" } for childRune, childNode := range p.children { utf8.EncodeRune(child, childRune) childSerial := rgen.Int63() childNodeStr := fmt.Sprintf("\"%s(%d)\"", string(child[0]), childSerial) var notation string if string(child[0]) == "/" { notation = fmt.Sprintf("[label=\"%s\" shape=box color=red]", string(child[0])) } else { notation = fmt.Sprintf("[label=\"%s\"]", string(child[0])) } vec.Push(fmt.Sprintf("\t%s %s\n\t\"%s(%d)\" -> \"%s(%d)\"", childNodeStr, notation, thisChar, serial, string(child[0]), childSerial)) childNode.outputDot(vec, childRune, childSerial, rgen) } }
func TestRuneIO(t *testing.T) { const NRune = 1000 // Built a test array while we write the data b := make([]byte, utf8.UTFMax*NRune) var buf Buffer n := 0 for r := rune(0); r < NRune; r++ { size := utf8.EncodeRune(b[n:], r) nbytes, err := buf.WriteRune(r) if err != nil { t.Fatalf("WriteRune(%U) error: %s", r, err) } if nbytes != size { t.Fatalf("WriteRune(%U) expected %d, got %d", r, size, nbytes) } n += size } b = b[0:n] // Check the resulting bytes if !Equal(buf.Bytes(), b) { t.Fatalf("incorrect result from WriteRune: %q not %q", buf.Bytes(), b) } p := make([]byte, utf8.UTFMax) // Read it back with ReadRune for r := rune(0); r < NRune; r++ { size := utf8.EncodeRune(p, r) nr, nbytes, err := buf.ReadRune() if nr != r || nbytes != size || err != nil { t.Fatalf("ReadRune(%U) got %U,%d not %U,%d (err=%s)", r, nr, nbytes, r, size, err) } } // Check that UnreadRune works buf.Reset() buf.Write(b) for r := rune(0); r < NRune; r++ { r1, size, _ := buf.ReadRune() if err := buf.UnreadRune(); err != nil { t.Fatalf("UnreadRune(%U) got error %q", r, err) } r2, nbytes, err := buf.ReadRune() if r1 != r2 || r1 != r || nbytes != size || err != nil { t.Fatalf("ReadRune(%U) after UnreadRune got %U,%d not %U,%d (err=%s)", r, r2, nbytes, r, size, err) } } }
// WriteRune writes a single Unicode code point, returning // the number of bytes written and any error. func (b *Writer) WriteRune(r rune) (size int, err error) { if r < utf8.RuneSelf { err = b.WriteByte(byte(r)) if err != nil { return 0, err } return 1, nil } if b.err != nil { return 0, b.err } n := b.Available() if n < utf8.UTFMax { if b.Flush(); b.err != nil { return 0, b.err } n = b.Available() if n < utf8.UTFMax { // Can only happen if buffer is silly small. return b.WriteString(string(r)) } } size = utf8.EncodeRune(b.buf[b.n:], r) b.n += size return size, nil }
// Converts a single numerical html entity to a regular Go utf8-token. func EntityToUtf8(entity string) string { var ok bool if ok = reg_entnamed.MatchString(entity); ok { return namedEntityToUtf8(entity[1 : len(entity)-1]) } if ok = reg_entnumeric.MatchString(entity); !ok { return "&" + entity[2:len(entity)-1] + ";" } var err os.Error var num int entity = entity[2 : len(entity)-1] if num, err = strconv.Atoi(entity); err != nil { return "&#" + entity + ";" } var arr [4]byte if size := utf8.EncodeRune(arr[:], num); size == 0 { return "&#" + entity + ";" } return string(arr[:]) }
// unescapeEntity reads an entity like "<" from b[src:] and writes the // corresponding "<" to b[dst:], returning the incremented dst and src cursors. // Precondition: src[0] == '&' && dst <= src. func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) { // TODO(nigeltao): Check that this entity substitution algorithm matches the spec: // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference // TODO(nigeltao): Handle things like "中" or "中". // i starts at 1 because we already know that s[0] == '&'. i, s := 1, b[src:] for i < len(s) { c := s[i] i++ // Lower-cased characters are more common in entities, so we check for them first. if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' { continue } if c != ';' { i-- } x := entity[string(s[1:i])] if x != 0 { return dst + utf8.EncodeRune(x, b[dst:]), src + i } break } dst1, src1 = dst+i, src+i copy(b[dst:dst1], b[src:src1]) return dst1, src1 }
// Extract regular text from the beginning of the pattern, // possibly after a leading iBOT. // That text can be used by doExecute to speed up matching. func (re *Regexp) setPrefix() { var b []byte var utf = make([]byte, utf8.UTFMax) var inst *instr // First instruction is start; skip that. Also skip any initial iBOT. inst = re.inst[0].next for inst.kind == iBOT { inst = inst.next } Loop: for ; inst.kind != iEnd; inst = inst.next { // stop if this is not a char if inst.kind != iChar { break } // stop if this char can be followed by a match for an empty string, // which includes closures, ^, and $. switch inst.next.kind { case iBOT, iEOT, iAlt: break Loop } n := utf8.EncodeRune(utf, inst.char) b = append(b, utf[0:n]...) } // point prefixStart instruction to first non-CHAR after prefix re.prefixStart = inst re.prefixBytes = b re.prefix = string(b) }
// Map returns a copy of the byte array s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the string with no replacement. The characters in s and the
// output are interpreted as UTF-8-encoded Unicode code points.
// A non-negative result that is not a valid code point is written as U+FFFD.
func Map(mapping func(rune int) int, s []byte) []byte {
	// In the worst case, the array can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.
	maxbytes := len(s) // length of b
	nbytes := 0        // number of bytes encoded in b
	b := make([]byte, maxbytes)
	for i := 0; i < len(s); {
		wid := 1
		c := int(s[i])
		if c >= utf8.RuneSelf {
			var dr rune
			dr, wid = utf8.DecodeRune(s[i:])
			c = int(dr)
		}
		i += wid

		c = mapping(c)
		if c < 0 {
			continue
		}

		// Normalise invalid results up front so that the width used for the
		// capacity check always matches what EncodeRune will write.
		var r rune = utf8.RuneError
		if c <= utf8.MaxRune && utf8.ValidRune(rune(c)) {
			r = rune(c)
		}
		rl := utf8.RuneLen(r)
		if nbytes+rl > maxbytes {
			// Grow the buffer.
			maxbytes = maxbytes*2 + utf8.UTFMax
			nb := make([]byte, maxbytes)
			copy(nb, b[0:nbytes])
			b = nb
		}
		nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
	}
	return b[0:nbytes]
}
// Map returns a copy of the string s with all its characters modified
// according to the mapping function. A mapping result that is not a valid
// Unicode code point (including a negative value) is written as U+FFFD.
func Map(mapping func(rune int) int, s string) string {
	// In the worst case, the string can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.
	maxbytes := len(s) // length of b
	nbytes := 0        // number of bytes encoded in b
	b := make([]byte, maxbytes)
	for _, c := range s {
		// Normalise invalid results before measuring, so that the width
		// used for the capacity check matches what EncodeRune writes.
		var r rune = utf8.RuneError
		if m := mapping(int(c)); 0 <= m && m <= utf8.MaxRune && utf8.ValidRune(rune(m)) {
			r = rune(m)
		}
		wid := utf8.RuneLen(r)
		if nbytes+wid > maxbytes {
			// Grow the buffer.
			maxbytes = maxbytes*2 + utf8.UTFMax
			nb := make([]byte, maxbytes)
			copy(nb, b[0:nbytes])
			b = nb
		}
		nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
	}
	return string(b[0:nbytes])
}
// appendRune inserts a rune at the end of the buffer. It is used for Hangul. func (rb *reorderBuffer) appendRune(rune uint32) { bn := rb.nbyte sz := utf8.EncodeRune(rb.byte[bn:], int(rune)) rb.nbyte += uint8(sz) rb.rune[rb.nrune] = runeInfo{bn, uint8(sz), 0, 0} rb.nrune++ }
// Inserts a character in the cursor position. func (b *buffer) insertRune(rune int) os.Error { var useRefresh bool b.grow(b.size + 1) // Check if there is free space for one more character // Avoid a full update of the line. if b.pos == b.size { char := make([]byte, utf8.UTFMax) utf8.EncodeRune(char, rune) if _, err := output.Write(char); err != nil { return outputError(err.String()) } } else { useRefresh = true copy(b.data[b.pos+1:b.size+1], b.data[b.pos:b.size]) } b.data[b.pos] = rune b.pos++ b.size++ if useRefresh { return b.refresh() } return nil }
// Returns a slice of the contents of the buffer. func (b *buffer) toBytes() []byte { chars := make([]byte, b.size*utf8.UTFMax) var end, runeLen int // === Each character (as integer) is encoded to []byte for i := 0; i < b.size; i++ { if i != 0 { runeLen = utf8.EncodeRune(chars[end:], b.data[i]) end += runeLen } else { runeLen = utf8.EncodeRune(chars, b.data[i]) end = runeLen } } return chars[:end] }
// setPrefix extracts regular text from the beginning of the pattern and
// stores it in re.prefixBytes and re.prefix.
// That text can be used by doExecute to speed up matching.
// re.prefixStart is set to the instruction at which the walk stopped.
func (re *Regexp) setPrefix() {
	var b []byte
	// Scratch space for the encoding of one character.
	var utf = make([]byte, utf8.UTFMax)
	// First instruction is start; skip that.
	i := re.inst.At(0).(instr).next().index()
Loop:
	for i < re.inst.Len() {
		inst := re.inst.At(i).(instr)
		// stop if this is not a char
		if inst.kind() != _CHAR {
			break
		}
		// stop if this char can be followed by a match for an empty string,
		// which includes closures, ^, and $.
		switch re.inst.At(inst.next().index()).(instr).kind() {
		case _BOT, _EOT, _ALT:
			break Loop
		}
		// Append the character's encoding to the prefix and follow the
		// instruction's next link.
		n := utf8.EncodeRune(inst.(*_Char).char, utf)
		b = bytes.Add(b, utf[0:n])
		i = inst.next().index()
	}
	// point prefixStart instruction to first non-CHAR after prefix
	re.prefixStart = re.inst.At(i).(instr)
	re.prefixBytes = b
	re.prefix = string(b)
}
// fmtC formats a rune for the 'c' format. func (p *pp) fmtC(c int64) { rune := int(c) // Check for overflow. if int64(rune) != c { rune = utf8.RuneError } w := utf8.EncodeRune(p.runeBuf[0:utf8.UTFMax], rune) p.fmt.pad(p.runeBuf[0:w]) }
func (p *pp) add(c int) { if c < utf8.RuneSelf { p.buf.WriteByte(byte(c)) } else { w := utf8.EncodeRune(c, &p.runeBuf) p.buf.Write(p.runeBuf[0:w]) } }
func (S *StringBuffer) AppendStr(s string) *StringBuffer { // fmt.Printf("append: %c", ch) for _, ch := range s { w := utf8.EncodeRune(S.bytes[S.index:], ch) S.index += w } return S }
// WriteRune appends the UTF-8 encoding of Unicode // code point r to the buffer, returning its length and // an error, which is always nil but is included // to match bufio.Writer's WriteRune. func (b *Buffer) WriteRune(r rune) (n int, err error) { if r < utf8.RuneSelf { b.WriteByte(byte(r)) return 1, nil } n = utf8.EncodeRune(b.runeBytes[0:], r) b.Write(b.runeBytes[0:n]) return n, nil }
func urlquoter(c int, safe string) []byte { safe_bytes := strings.Bytes(safe); c_bytes := make([]byte, utf8.RuneLen(c)); utf8.EncodeRune(c, c_bytes); if bytes.Index(safe_bytes, c_bytes) != -1 || bytes.Index(always_safe, c_bytes) != -1 { return c_bytes; } else { return strings.Bytes(fmt.Sprintf("%%%02X", c)); } panic("unreachable"); }
func (r *DelimReader) Read(p []byte) (n int, err os.Error) { bytes_written := 0 if r.remainder != nil { for i := 0; i < len(r.remainder); i++ { p[i] = r.remainder[i] bytes_written++ } r.remainder = nil } for bytes_written < len(p) { rune, size, err := r.reader.ReadRune() if err != nil { return bytes_written, err } for _, value := range r.delimiters { if value == rune { rune = r.used_delimiter size = utf8.RuneLen(rune) } } if bytes_written+size > len(p) { // we need to split the rune and hold on to the remainder writable := len(p) - bytes_written target := make([]byte, size) _ = utf8.EncodeRune(target, rune) for i := 0; i < writable; i++ { p[bytes_written] = target[i] bytes_written++ } r.remainder = target[writable:] } else { target := p[bytes_written : bytes_written+size] _ = utf8.EncodeRune(target, rune) bytes_written += size } } return bytes_written, nil }
/* Capitalizes the first character of the value. Example: {value|capfirst} If value is "neste", the output will be "Neste". */ func CapFirstFormatter(w io.Writer, formatter string, data ...interface{}) { b := getBytes(data...) if len(b) > 0 { rune, size := utf8.DecodeRune(b) rune = unicode.ToUpper(rune) capSize := utf8.RuneLen(rune) capb := make([]byte, capSize) utf8.EncodeRune(capb, rune) w.Write(capb) w.Write(b[size:]) } }
// Parses the next rune and checks to see if its in a given range func pRange(ranges []unicode.Range, result *string, src []byte, i *int) bool { rune, size := utf8.DecodeRune(src[i : i+utf8.UTF8Max]) if unicode.Is(ranges, rune) { buf := make([]byte, size) utf8.EncodeRune(rune, buf) *result = string(buf) // return resulting rune *i += size // Update index //src = src[size:len(src)]; // Update slice return true } // No match return false }
// runesToString returns the UTF-8 encoding of the code points in runes.
// Values that are not valid code points (negative, surrogate halves, or
// above utf8.MaxRune) are encoded as U+FFFD.
func runesToString(runes []int) string {
	// len(runes) is a lower bound on the result; append grows as needed.
	data := make([]byte, 0, len(runes))
	var enc [utf8.UTFMax]byte
	for _, v := range runes {
		var r rune = utf8.RuneError
		if 0 <= v && v <= utf8.MaxRune {
			r = rune(v)
		}
		// Use the length EncodeRune reports rather than RuneLen, which is
		// -1 for invalid runes.
		n := utf8.EncodeRune(enc[:], r)
		data = append(data, enc[:n]...)
	}
	return string(data)
}
// Internal output-building function used by Members() func (p *Trie) buildMembers(prefix string) *vector.StringVector { strList := new(vector.StringVector) if p.leaf { strList.Push(prefix) } // for each child, go grab all suffixes for rune, child := range p.children { buf := make([]byte, 4) numChars := utf8.EncodeRune(buf, rune) strList.AppendVector(child.buildMembers(prefix + string(buf[0:numChars]))) } return strList }
// hyphenateWord returns s with hyphen inserted after each character whose
// marker value is odd.
//
// The word is wrapped in '.' on both sides and, for every character position
// of the wrapped word, h.patterns.AllSubstringsAndValues is asked for the
// patterns found in the text starting there. The values of those patterns
// are merged into v by keeping the maximum seen at each index.
func (h *Hyphenator) hyphenateWord(s, hyphen string) string {
	testStr := `.` + s + `.`
	// One slot per character of the wrapped word.
	v := make([]int, utf8.RuneCountInString(testStr))
	vIndex := 0
	for pos, _ := range testStr {
		t := testStr[pos:]
		strs, values := h.patterns.AllSubstringsAndValues(t)
		for i := 0; i < values.Len(); i++ {
			str := strs.At(i)
			val := values.At(i).(*vector.IntVector)
			// NOTE(review): len(str) counts bytes while v is indexed per
			// character — confirm this offset is right for non-ASCII
			// patterns.
			diff := val.Len() - len(str)
			vs := v[vIndex-diff:]
			// Keep the larger of the stored value and the pattern's value.
			for i := 0; i < val.Len(); i++ {
				if val.At(i) > vs[i] {
					vs[i] = val.At(i)
				}
			}
		}
		vIndex++
	}
	var outstr string
	// trim the values for the beginning and ending dots
	markers := v[1 : len(v)-1]
	mIndex := 0
	// Scratch space for the encoding of one character.
	u := make([]byte, 4)
	for _, ch := range s {
		l := utf8.EncodeRune(ch, u)
		outstr += string(u[0:l])
		// don't hyphenate between (or after) the last two characters of a string
		if mIndex < len(markers)-2 {
			// hyphens are inserted on odd values, skipped on even ones
			if markers[mIndex]%2 != 0 {
				outstr += hyphen
			}
		}
		mIndex++
	}
	return outstr
}
// decodeCSS decodes CSS3 escapes given a sequence of stringchars. // If there is no change, it returns the input, otherwise it returns a slice // backed by a new array. // http://www.w3.org/TR/css3-syntax/#SUBTOK-stringchar defines stringchar. func decodeCSS(s []byte) []byte { i := bytes.IndexByte(s, '\\') if i == -1 { return s } // The UTF-8 sequence for a codepoint is never longer than 1 + the // number hex digits need to represent that codepoint, so len(s) is an // upper bound on the output length. b := make([]byte, 0, len(s)) for len(s) != 0 { i := bytes.IndexByte(s, '\\') if i == -1 { i = len(s) } b, s = append(b, s[:i]...), s[i:] if len(s) < 2 { break } // http://www.w3.org/TR/css3-syntax/#SUBTOK-escape // escape ::= unicode | '\' [#x20-#x7E#x80-#xD7FF#xE000-#xFFFD#x10000-#x10FFFF] if isHex(s[1]) { // http://www.w3.org/TR/css3-syntax/#SUBTOK-unicode // unicode ::= '\' [0-9a-fA-F]{1,6} wc? j := 2 for j < len(s) && j < 7 && isHex(s[j]) { j++ } r := hexDecode(s[1:j]) if r > unicode.MaxRune { r, j = r/16, j-1 } n := utf8.EncodeRune(b[len(b):cap(b)], r) // The optional space at the end allows a hex // sequence to be followed by a literal hex. // string(decodeCSS([]byte(`\A B`))) == "\nB" b, s = b[:len(b)+n], skipCSSSpace(s[j:]) } else { // `\\` decodes to `\` and `\"` to `"`. _, n := utf8.DecodeRune(s[1:]) b, s = append(b, s[1:1+n]...), s[1+n:] } } return b }
func (n *trieNode) insert(r rune, value uint16) { var p [utf8.UTFMax]byte sz := utf8.EncodeRune(p[:], r) for i := 0; i < sz; i++ { if n.leaf { log.Fatalf("triegen: insert: node (%#v) should not be a leaf", n) } nn := n.table[p[i]] if nn == nil { nn = newNode() nn.b = p[i] n.table[p[i]] = nn } n = nn } n.value = int(value) n.leaf = true }
// Converts a single numerical html entity to a regular Go utf-token. // ex: "♣" -> "♣" func HtmlToUTF8(entity string) string { // Make sure we have a valid entity: { ok := reg_entity.MatchString(entity) if !ok { return "" } // Convert entity to number num, err := strconv.Atoi(entity[2 : len(entity)-1]) if err != nil { return "" } var arr [3]byte size := utf8.EncodeRune(num, &arr) if size == 0 { return "" } return string(&arr) }
// Map returns a copy of the string s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the string with no replacement. A non-negative result that is
// not a valid code point is written as U+FFFD.
func Map(mapping func(rune) rune, s string) string {
	// In the worst case, the string can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.
	maxbytes := len(s) // length of b
	nbytes := 0        // number of bytes encoded in b
	// The output buffer b is initialized on demand, the first
	// time a character differs.
	var b []byte

	for i, c := range s {
		r := mapping(c)
		if b == nil {
			if r == c {
				continue
			}
			b = make([]byte, maxbytes)
			nbytes = copy(b, s[:i])
		}
		if r >= 0 {
			wid := 1
			if r >= utf8.RuneSelf {
				wid = utf8.RuneLen(r)
				if wid < 0 {
					// RuneLen reports -1 for invalid runes, but
					// EncodeRune writes RuneError for them; reserve
					// room for that so the write cannot overrun b.
					wid = utf8.RuneLen(utf8.RuneError)
				}
			}
			if nbytes+wid > maxbytes {
				// Grow the buffer.
				maxbytes = maxbytes*2 + utf8.UTFMax
				nb := make([]byte, maxbytes)
				copy(nb, b[0:nbytes])
				b = nb
			}
			nbytes += utf8.EncodeRune(b[nbytes:maxbytes], r)
		}
	}
	if b == nil {
		return s
	}
	return string(b[0:nbytes])
}
func (h *Hyphenator) Hyphenate(s, hyphen string) (string, bool) { var sc scanner.Scanner sc.Init(strings.NewReader(s)) sc.Mode = scanner.ScanIdents sc.Whitespace = 0 var outstr string tok := sc.Scan() for tok != scanner.EOF { switch tok { case scanner.Ident: // a word (or part thereof) to hyphenate t := sc.TokenText() // try the exceptions first exc := h.exceptions[t] if len(exc) != 0 { if hyphen != `-` { strings.Replace(exc, `-`, hyphen, -1) } return exc, true } // not an exception, hyphenate normally outstr += h.hyphenateWord(sc.TokenText(), hyphen) default: // A Unicode rune to append to the output p := make([]byte, utf8.UTFMax) l := utf8.EncodeRune(tok, p) outstr += string(p[0:l]) } tok = sc.Scan() } return outstr, true }