func (n NeologdNormalizer) EliminateSpace(s string) string { var ( b bytes.Buffer prev rune ) for p := 0; p < len(s); { c, w := utf8.DecodeRuneInString(s[p:]) p += w if !unicode.IsSpace(c) { b.WriteRune(c) prev = c continue } for p < len(s) { c0, w0 := utf8.DecodeRuneInString(s[p:]) p += w0 if !unicode.IsSpace(c0) { if unicode.In(prev, unicode.Latin, latinSymbols) && unicode.In(c0, unicode.Latin, latinSymbols) { b.WriteRune(' ') } b.WriteRune(c0) prev = c0 break } } } return b.String() }
// scanStrings is a bufio.SplitFunc that yields space-separated tokens,
// treating a double-quoted region (quotes included in the token) as
// part of a single token.
func scanStrings(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Advance past any leading white space.
	start := 0
	for start < len(data) {
		r, w := utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
		start += w
	}
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	// Walk until unquoted white space ends the token.
	quoted := false
	for i := start; i < len(data); {
		r, w := utf8.DecodeRune(data[i:])
		if r == '"' {
			quoted = !quoted
			i += w
			continue
		}
		if unicode.IsSpace(r) && !quoted {
			return i + w, data[start:i], nil
		}
		i += w
	}
	// At EOF with a final, non-empty, non-terminated token: return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data.
	return 0, nil, nil
}
// Test white space table matches the Unicode definition. func TestSpace(t *testing.T) { for r := rune(0); r <= utf8.MaxRune; r++ { if IsSpace(r) != unicode.IsSpace(r) { t.Fatalf("white space property disagrees: %#U should be %t", r, unicode.IsSpace(r)) } } }
// top returns the offset into c.content of the start of a match whose
// exclusive end offset is tail, by walking the pattern w and the
// content backwards in lockstep. A white space rune in w must be
// matched by white space in the content; extra content white space is
// skipped; other runes must match exactly. Returns -1 when no match
// can start within the content.
func (c *ctx) top(tail int, w string) int {
	for len(w) > 0 {
		if tail <= 0 {
			// Ran out of content before all of w was consumed.
			debug.Printf("over backtrack: w=%q", w)
			return -1
		}
		wr, wn := utf8.DecodeLastRuneInString(w)
		cr, cn := utf8.DecodeLastRuneInString(c.content[:tail])
		tail -= cn
		if unicode.IsSpace(wr) {
			if !unicode.IsSpace(cr) {
				// Content lacks a space where the pattern requires one.
				debug.Printf("not space: tail=%d w=%q cr=%q", tail, w, cr)
				return -1
			}
			w = w[:len(w)-wn]
			continue
		}
		if unicode.IsSpace(cr) {
			// Extra white space in the content: skip it without
			// consuming any of w.
			continue
		}
		w = w[:len(w)-wn]
		if cr != wr {
			// Runes didn't match.
			debug.Printf("not match: tail=%d w=%q cr=%q wr=%q", tail, w, cr, wr)
			return -1
		}
	}
	return tail
}
// scanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. A
// double-quoted sequence is returned as a single token with the quotes
// stripped. It will never return an empty string. The definition of
// space is set by unicode.IsSpace.
func scanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
	}
	quote := false
	// Scan until space, marking end of word.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		switch {
		case i == start && r == '"':
			// Compare against start, not 0: leading spaces may have been
			// skipped, so an opening quote need not sit at offset 0.
			quote = true
		case !quote && unicode.IsSpace(r):
			return i + width, data[start:i], nil
		case quote && r == '"':
			// Closing quote: return the token without the quotes.
			return i + width, data[start+width : i], nil
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:], nil
	}
	// Request more data.
	return start, nil, nil
}
// scanWordsKeepPrefix is a split function for a Scanner that returns each // space-separated word of text, with prefixing spaces included. It will never // return an empty string. The definition of space is set by unicode.IsSpace. // // Adapted from bufio.ScanWords(). func scanTokensKeepPrefix(data []byte, atEOF bool) (advance int, token []byte, err error) { // Skip leading spaces. start := 0 for width := 0; start < len(data); start += width { var r rune r, width = utf8.DecodeRune(data[start:]) if !unicode.IsSpace(r) { break } } if atEOF && len(data) == 0 || start == len(data) { return len(data), data, nil } if len(data) > start && data[start] == '#' { return scanLinesKeepPrefix(data, atEOF) } // Scan until space, marking end of word. for width, i := 0, start; i < len(data); i += width { var r rune r, width = utf8.DecodeRune(data[i:]) if unicode.IsSpace(r) { return i, data[:i], nil } } // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. if atEOF && len(data) > start { return len(data), data, nil } // Request more data. return 0, nil, nil }
// getTokens converts a string into a slice of tokens: each symbol rune
// becomes its own element, contiguous runs of letters/digits become a
// single element, and all white space is discarded.
func getTokens(value string) []string {
	var buffer []rune
	var result []string
	// flush appends the buffered word, if any, and clears the buffer.
	flush := func() {
		if len(buffer) > 0 {
			result = append(result, string(buffer))
			buffer = nil
		}
	}
	for _, r := range value {
		switch {
		case unicode.IsSpace(r):
			flush()
		case unicode.IsLetter(r) || unicode.IsNumber(r):
			// IsDigit is a subset of IsNumber, so a separate IsDigit
			// check (as in the original) is redundant.
			buffer = append(buffer, r)
		default:
			// Symbols and punctuation are emitted as single-rune tokens.
			flush()
			result = append(result, string(r))
		}
	}
	// Flush any word still buffered at end of input.
	flush()
	return result
}
// PrevWord moves the cursor backward to the beginning of the previous word.
// Skips the rest of the current word, if any, unless the cursor is at its
// first character. Returns true if the move was successful, false if the
// start of the buffer was reached.
func (c *Cursor) PrevWord() bool {
	isNotSpace := func(r rune) bool {
		return !unicode.IsSpace(r)
	}
	for {
		// Skip space until we find a word character.
		// Re-try if we reached beginning-of-line.
		if !c.PrevRuneFunc(isNotSpace) {
			return false
		}
		if !c.BOL() {
			break
		}
	}
	r, _ := c.RuneBefore()
	if isNotSpace(r) {
		// Lowercase word motion differentiates words consisting of
		// (A-Z0-9_) and any other non-whitespace character. Skip until
		// we find either the other word type or whitespace.
		if utils.IsWord(r) {
			c.PrevRuneFunc(func(r rune) bool {
				return !utils.IsWord(r) || unicode.IsSpace(r)
			})
		} else {
			c.PrevRuneFunc(func(r rune) bool {
				return utils.IsWord(r) || unicode.IsSpace(r)
			})
		}
	}
	// Success unless we ended up at beginning-of-line.
	return !c.BOL()
}
// splitSections splits s on unquoted white space. Single quotes are
// dropped from the output, and white space between a pair of quotes is
// kept as part of the current section. An unterminated quote carries
// over to the following sections.
func splitSections(s string) (sections []string) {
	inQuote := false
	i := 0
	for i < len(s) {
		var section []byte
		j := i
		for j < len(s) {
			c := s[j]
			if c == '\'' {
				inQuote = !inQuote
			} else if unicode.IsSpace(rune(c)) && !inQuote {
				break
			} else {
				section = append(section, c)
			}
			j++
		}
		sections = append(sections, string(section))
		// Skip the run of white space separating sections.
		for i = j; i < len(s); i++ {
			if !unicode.IsSpace(rune(s[i])) {
				break
			}
		}
	}
	return sections
}
// Render writes PlainText as XML content. Text longer than MaxLength
// is truncated at a word boundary to at most ShortLength (or MaxLength
// when ShortLength is zero) bytes, followed by "... " and, when
// MoreLink is set, an <a> element rendered from the link.
// NOTE(review): PlainText is indexed by byte and single bytes are
// converted with rune(...), which only classifies ASCII white space
// correctly — confirm inputs are ASCII.
func (self *TextPreview) Render(context *Context, writer *utils.XMLWriter) (err error) {
	if len(self.PlainText) < self.MaxLength {
		writer.Content(self.PlainText)
	} else {
		shortLength := self.ShortLength
		if shortLength == 0 {
			shortLength = self.MaxLength
		}
		// If in the middle of a word, go back to space before it
		for shortLength > 0 && !unicode.IsSpace(rune(self.PlainText[shortLength-1])) {
			shortLength--
		}
		// If in the middle of space, go back to word before it
		for shortLength > 0 && unicode.IsSpace(rune(self.PlainText[shortLength-1])) {
			shortLength--
		}
		writer.Content(self.PlainText[:shortLength])
		writer.Content("... ")
		if self.MoreLink != nil {
			writer.OpenTag("a")
			writer.Attrib("href", self.MoreLink.URL(context.PathArgs...))
			writer.AttribIfNotDefault("title", self.MoreLink.LinkTitle(context))
			content := self.MoreLink.LinkContent(context)
			if content != nil {
				err = content.Render(context, writer)
			}
			writer.ForceCloseTag() // a
		}
	}
	return err
}
// parseHealthConfig parses the HEALTHCHECK command's arguments. It is
// like parseMaybeJSON, but has an extra type argument: the first
// white-space-delimited word becomes the node's Value and the rest of
// the string is parsed as the (possibly JSON) command.
func parseHealthConfig(rest string) (*Node, map[string]bool, error) {
	// Find end of first argument
	var sep int
	for ; sep < len(rest); sep++ {
		if unicode.IsSpace(rune(rest[sep])) {
			break
		}
	}
	// Skip the white space separating the type from the command.
	next := sep
	for ; next < len(rest); next++ {
		if !unicode.IsSpace(rune(rest[next])) {
			break
		}
	}
	if sep == 0 {
		// No type argument at all.
		return nil, nil, nil
	}
	typ := rest[:sep]
	cmd, attrs, err := parseMaybeJSON(rest[next:])
	if err != nil {
		return nil, nil, err
	}
	// err is nil here; returned for symmetry with the error path.
	return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err
}
// anagram reports whether word1 and word2 are anagrams of each other.
// The comparison is case-insensitive and ignores white space, so whole
// phrases can be compared as well as single words.
func anagram(word1 string, word2 string) bool {
	// tally builds a rune-frequency table for a word, lower-cased and
	// with white space discarded.
	tally := func(word string) map[rune]int {
		counts := make(map[rune]int)
		for _, r := range strings.ToLower(word) {
			if !unicode.IsSpace(r) {
				counts[r]++
			}
		}
		return counts
	}
	// Two words are anagrams exactly when their tables are equal.
	return reflect.DeepEqual(tally(word1), tally(word2))
}
// Render writes PlainText as XML content on the response. Text longer
// than MaxLength is truncated at a word boundary to at most
// ShortLength (or MaxLength when ShortLength is zero) bytes, followed
// by "... " and, when MoreLink is set, an <a> element for the link.
// NOTE(review): PlainText is indexed by byte and single bytes are
// converted with rune(...), which only classifies ASCII white space
// correctly — confirm inputs are ASCII.
func (self *TextPreview) Render(ctx *Context) (err error) {
	if len(self.PlainText) < self.MaxLength {
		ctx.Response.XML.Content(self.PlainText)
	} else {
		shortLength := self.ShortLength
		if shortLength == 0 {
			shortLength = self.MaxLength
		}
		// If in the middle of a word, go back to space before it
		for shortLength > 0 && !unicode.IsSpace(rune(self.PlainText[shortLength-1])) {
			shortLength--
		}
		// If in the middle of space, go back to word before it
		for shortLength > 0 && unicode.IsSpace(rune(self.PlainText[shortLength-1])) {
			shortLength--
		}
		ctx.Response.XML.Content(self.PlainText[:shortLength])
		ctx.Response.XML.Content("... ")
		if self.MoreLink != nil {
			ctx.Response.XML.OpenTag("a")
			ctx.Response.XML.Attrib("href", self.MoreLink.URL(ctx))
			ctx.Response.XML.AttribIfNotDefault("title", self.MoreLink.LinkTitle(ctx))
			content := self.MoreLink.LinkContent(ctx)
			if content != nil {
				err = content.Render(ctx)
			}
			ctx.Response.XML.CloseTagAlways() // a
		}
	}
	return err
}
func count(in *bufio.Reader) (nl, nw, nr, nc int, err error) { inword := false for { var r rune var sz int r, sz, err = in.ReadRune() if err == io.EOF { err = nil break } if err != nil { return } nr++ nc += sz if r == '\n' { nl++ } if unicode.IsSpace(r) && inword { inword = false nw++ } else if !unicode.IsSpace(r) { inword = true } } return }
// lexPrivmsg tokenizes a PRIVMSG line: it emits one TokWord per run of
// non-space runes, emits TokEOL at the newline and hands off to
// lexDate for the next line, and reports an error when the input ends
// (or decoding fails) mid-line.
func lexPrivmsg(l *LogLexer) stateFn {
	for i := 0; ; i++ {
		// Skip intra-line white space, but never the newline itself.
		l.buf.ignoreWhile(func(r rune) bool {
			return unicode.IsSpace(r) && r != '\n'
		})
		// Accept a word: everything up to white space or a decode error.
		n := l.buf.acceptWhile(func(r rune) bool {
			return r != utf8.RuneError && !unicode.IsSpace(r)
		})
		if n > 0 {
			l.emit(markov.TokWord)
		}
		r := l.buf.peek()
		switch {
		case r == '\n':
			l.emit(markov.TokEOL)
			l.buf.next()
			l.buf.ignoreToken()
			l.newline()
			return lexDate
		case r == utf8.RuneError:
			// End of input (or invalid UTF-8) inside a line.
			l.errorfEOFValid(nil)
		}
	}
	panic("not reached")
}
// Fields splits the string s around each instance of one or more
// consecutive white space characters, returning a slice of the
// substrings of s, or an empty slice if s contains only white space.
func Fields(s string) []string {
	// First pass: count the fields so the result is allocated once.
	count := 0
	inField := false
	for _, r := range s {
		was := inField
		inField = !unicode.IsSpace(r)
		if inField && !was {
			count++
		}
	}
	// Second pass: record each field's boundaries.
	out := make([]string, count)
	k := 0
	begin := -1
	for i, r := range s {
		if unicode.IsSpace(r) {
			if begin >= 0 {
				out[k] = s[begin:i]
				k++
				begin = -1
			}
		} else if begin == -1 {
			begin = i
		}
	}
	// A field running to the end of the string.
	if begin != -1 {
		out[k] = s[begin:]
		k++
	}
	return out[0:k]
}
// Fields splits the array s around each instance of one or more
// consecutive white space characters, returning a slice of subarrays
// of s, or an empty list if s contains only white space.
func Fields(s []byte) [][]byte {
	// First pass: count the fields so the result is allocated once.
	n := 0
	in := false
	for off := 0; off < len(s); {
		r, w := utf8.DecodeRune(s[off:])
		was := in
		in = !unicode.IsSpace(r)
		if in && !was {
			n++
		}
		off += w
	}
	out := make([][]byte, n)
	k := 0
	begin := -1
	// Second pass; note the loop runs one step past the end (w == 0)
	// so that a field touching the end of s is closed.
	for off := 0; off <= len(s) && k < n; {
		r, w := utf8.DecodeRune(s[off:])
		if begin < 0 && w > 0 && !unicode.IsSpace(r) {
			begin = off
			off += w
			continue
		}
		if begin >= 0 && (w == 0 || unicode.IsSpace(r)) {
			out[k] = s[begin:off]
			k++
			begin = -1
		}
		if w == 0 {
			break
		}
		off += w
	}
	return out[0:k]
}
// poolTrim trims all but immediately surrounding space:
// "\n\t\tfoobar\n\t\t" becomes "\tfoobar\n". A string of only white
// space trims to "".
// NOTE(review): the trailing-space scan inspects single bytes via
// rune(s[i]), so only ASCII white space is recognized there.
func poolTrim(s string) string {
	// Find the first non-space rune; keep at most one space before it.
	start := -1
	for i, r := range s {
		if !unicode.IsSpace(r) {
			start = i
			if i != 0 {
				start = i - 1 // preserve one preceding space
			}
			break
		}
	}
	if start == -1 {
		return "" // every char was a space
	}
	// Find the last non-space byte; keep at most one space after it.
	// Defaulting end to len(s) fixes the original's bug where a string
	// with no leading AND no trailing space (e.g. "foobar") fell into
	// the start==0 && end==0 case and wrongly returned "".
	end := len(s)
	for i := len(s) - 1; i >= 0; i-- {
		if !unicode.IsSpace(rune(s[i])) {
			if i != len(s)-1 {
				end = i + 2 // preserve one trailing space
			}
			break
		}
	}
	return s[start:end]
}
// GraveTrim trims the value of a back-quoted (grave) string literal:
// leading white space up to and including a first newline is dropped,
// and trailing white space after the last newline/non-space character
// is dropped (a final newline itself is kept). Interior content is
// left untouched.
// NOTE(review): rune(target[index]) inspects single bytes, so only
// ASCII white space is classified correctly — confirm inputs here are
// ASCII-framed.
func GraveTrim(target string) string {
	// Discard \r? Go already does this for raw string literals.
	end := len(target)
	last := 0
	index := 0
	// Scan forward for the first newline or non-space character.
	for index = 0; index < end; index++ {
		chr := rune(target[index])
		if chr == '\n' || !unicode.IsSpace(chr) {
			last = index
			break
		}
	}
	if index >= end {
		// Entirely white space with no newline.
		return ""
	}
	start := last
	if rune(target[start]) == '\n' {
		// Skip the leading newline
		start++
	}
	// Scan backward for the last newline or non-space character.
	last = end - 1
	for index = last; index > start; index-- {
		chr := rune(target[index])
		if chr == '\n' || !unicode.IsSpace(chr) {
			last = index
			break
		}
	}
	stop := last
	result := target[start : stop+1]
	return result
}
// SplitVerb splits s into its leading white-space-delimited word (the
// verb) and everything after the separating white space (the rest,
// with its own internal spacing preserved).
func SplitVerb(s string) (verb, rest string) {
	// Three-phase scan: inside the verb, inside the separating gap,
	// then inside the rest.
	const (
		stVerb = iota
		stGap
		stRest
	)
	st := stVerb
	vBuf := &bytes.Buffer{}
	rBuf := &bytes.Buffer{}
	for _, r := range s {
		switch st {
		case stVerb:
			if unicode.IsSpace(r) {
				st = stGap
				break
			}
			io.WriteString(vBuf, string(r))
		case stGap:
			if !unicode.IsSpace(r) {
				st = stRest
				io.WriteString(rBuf, string(r))
			}
		case stRest:
			io.WriteString(rBuf, string(r))
		}
	}
	return vBuf.String(), rBuf.String()
}
func upperWordLetterPairs(runes []rune) ([]runeBigram, int) { limit := len(runes) - 1 if limit < 1 { return make([]runeBigram, 0), 0 } bigrams := make([]runeBigram, limit) var a rune var b rune numPairs := 0 for i := 0; i < limit; i++ { a = runes[i] b = runes[i+1] if unicode.IsSpace(b) { i++ continue } if unicode.IsSpace(a) { continue } bigrams[numPairs] = runeBigram{rA: unicode.ToUpper(a), rB: unicode.ToUpper(b)} numPairs++ } bigrams = bigrams[0:numPairs] return bigrams, numPairs }
// emit writes data to the output buffer. A single space is inserted
// between this chunk and the previous output when neither side already
// ends/starts with white space, the configured prefix is re-emitted
// after every newline, and lineLength/endsWithSpace are kept current.
// NOTE(review): runes[0] panics if breakLongLines ever returns an
// empty line — presumably it never does; confirm against its
// implementation.
func (ctx *textifyTraverseCtx) emit(data string) error {
	if len(data) == 0 {
		return nil
	}
	lines := ctx.breakLongLines(data)
	var err error
	for _, line := range lines {
		runes := []rune(line)
		startsWithSpace := unicode.IsSpace(runes[0])
		if !startsWithSpace && !ctx.endsWithSpace {
			// Glue a separating space between two chunks.
			ctx.Buf.WriteByte(' ')
			ctx.lineLength++
		}
		ctx.endsWithSpace = unicode.IsSpace(runes[len(runes)-1])
		for _, c := range line {
			_, err = ctx.Buf.WriteString(string(c))
			if err != nil {
				return err
			}
			ctx.lineLength++
			if c == '\n' {
				// New line: reset the length and re-emit the prefix.
				ctx.lineLength = 0
				if ctx.prefix != "" {
					_, err = ctx.Buf.WriteString(ctx.prefix)
					if err != nil {
						return err
					}
				}
			}
		}
	}
	return nil
}
// TrimSpace returns a slice of the string s, with all leading and trailing
// white space removed, as defined by Unicode.
// NOTE(review): this predates Go 1's rune type — code points are held
// in int and DecodeRuneInString returns (int, int) here; left as-is to
// match the surrounding era of the code.
func TrimSpace(s string) string {
	start, end := 0, len(s)
	// Advance start past leading white space; decode UTF-8 only when
	// the lead byte is not ASCII.
	for start < end {
		wid := 1
		rune := int(s[start])
		if rune >= utf8.RuneSelf {
			rune, wid = utf8.DecodeRuneInString(s[start:end])
		}
		if !unicode.IsSpace(rune) {
			break
		}
		start += wid
	}
	// Retreat end before trailing white space.
	for start < end {
		wid := 1
		rune := int(s[end-1])
		if rune >= utf8.RuneSelf {
			// Back up carefully looking for beginning of rune. Mustn't pass start.
			for wid = 2; start <= end-wid && !utf8.RuneStart(s[end-wid]); wid++ {
			}
			if start > end-wid {
				// invalid UTF-8 sequence; stop processing
				return s[start:end]
			}
			rune, wid = utf8.DecodeRuneInString(s[end-wid : end])
		}
		if !unicode.IsSpace(rune) {
			break
		}
		end -= wid
	}
	return s[start:end]
}
func beautify(line string) string { buf := new(bytes.Buffer) lineRune := []rune(line) for i, current := range lineRune { if i == 0 { buf.WriteString(string(current)) continue } previous := lineRune[i-1] // chinese english char appears alternatively, when english char is not a space and // chinese char is not a punctuation, insert a whitespace. if isEnglish(previous) && isChinese(current) { if !unicode.IsSpace(previous) && !isPunctuation(string(current)) { buf.WriteString(insertionChar) } } else if isChinese(previous) && isEnglish(current) { if !isPunctuation(string(previous)) && !unicode.IsSpace(current) { buf.WriteString(insertionChar) } } buf.WriteString(string(current)) } return buf.String() }
// scanStmts is a bufio.SplitFunc that returns semicolon-terminated
// statements with surrounding white space trimmed from the token (the
// semicolon itself is consumed but not returned).
func scanStmts(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Skip leading spaces.
	start := 0
	for width := 0; start < len(data); start += width {
		var r rune
		r, width = utf8.DecodeRune(data[start:])
		if !unicode.IsSpace(r) {
			break
		}
	}
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}
	// end tracks one past the last non-space rune seen, so trailing
	// white space is trimmed from the token.
	end := start
	// Scan until semicolon, marking end of statement.
	for width, i := 0, start; i < len(data); i += width {
		var r rune
		r, width = utf8.DecodeRune(data[i:])
		if r == ';' {
			// Use end (not i) so space before the ';' is trimmed too,
			// consistent with the EOF branch below.
			return i + width, data[start:end], nil
		} else if !unicode.IsSpace(r) {
			// Advance by the rune's full width; the original's i+1
			// truncated multi-byte runes at the end of a statement.
			end = i + width
		}
	}
	// If we're at EOF, we have a final, non-empty, non-terminated statement. Return it.
	if atEOF && len(data) > start {
		return len(data), data[start:end], nil
	}
	// Request more data.
	return 0, nil, nil
}
func handleForwardWord(i *Input, _ termbox.Event) { if i.caretPos >= len(i.query) { return } foundSpace := false for pos := i.caretPos; pos < len(i.query); pos++ { r := i.query[pos] if foundSpace { if !unicode.IsSpace(r) { i.caretPos = pos i.DrawMatches(nil) return } } else { if unicode.IsSpace(r) { foundSpace = true } } } // not found. just move to the end of the buffer i.caretPos = len(i.query) i.DrawMatches(nil) }
func TestIsSpace(t *testing.T) { // This tests the internal isSpace function. // IsSpace = isSpace is defined in export_test.go. for i := rune(0); i <= unicode.MaxRune; i++ { if IsSpace(i) != unicode.IsSpace(i) { t.Errorf("isSpace(%U) = %v, want %v", i, IsSpace(i), unicode.IsSpace(i)) } } }
// lexInsideDelims scans the tokens that may appear between an open and
// close delimiter: keyword delimiters (which must be followed by white
// space or the close delimiter), selector-starting delimiters, the
// close delimiter itself, white space, and identifiers. It returns the
// next lexer state, or errors on EOF/newline inside an action or an
// invalid character.
func lexInsideDelims(l *lexer) lexerState {
	for {
		rest := l.data[l.pos:]
		// Lex the inside tokens that don't change state.
		for _, delim := range insideDelims {
			if bytes.HasPrefix(rest, delim.value) {
				l.pos += len(delim.value)
				// If we have a keyword, check that the next letter
				// either is a space or a close delim follows it.
				if !unicode.IsSpace(l.peek()) && !bytes.HasPrefix(l.data[l.pos:], closeDelim.value) {
					// There's more than just a keyword, so back up and
					// try the next delimiter (continue applies to the
					// inner loop over insideDelims).
					l.pos -= len(delim.value)
					continue
				}
				l.emit(delim.typ)
				return lexInsideDelims
			}
		}
		// Check for things that start selectors.
		for _, delim := range selDelims {
			if bytes.HasPrefix(rest, delim.value) {
				l.emit(tokenStartSel)
				return lexInsideSel
			}
		}
		// Check for a close delim.
		if bytes.HasPrefix(rest, closeDelim.value) {
			return lexCloseDelim
		}
		switch r := l.next(); {
		case r == eof || r == '\n' || r == '\r':
			return l.errorf("unclosed action")
		case unicode.IsSpace(r):
			l.advance()
		// Letter/number literal states are currently disabled.
		/*
			case r == '+' || r == '-' || '0' <= r && r <= '9':
				l.backup()
				return lexNumber
			case r == '"':
				l.advance()
				return lexValue
		*/
		case unicode.IsLetter(r) || r == '_': // go spec
			return lexIdentifier
		default:
			return l.errorf("invalid character: %q", r)
		}
	}
	return nil
}
// nextWord scans and returns the next token from the grammar content.
// It skips white space (but treats a newline as its own token),
// classifies the token by its first rune (nonterminal, newline,
// begindef ':', enddef ';', alternate '|', code block '{' when
// memorizeTerms, otherwise terminal), and consumes runes until
// unquoted white space outside any brace-nested code block. Returns an
// EOF error when the content is exhausted.
// NOTE(review): pre-Go 1 code — uses os.Error/os.NewError.
func (self *scanner) nextWord() (word tok, err os.Error) {
	if self.index >= len(self.content) {
		err = os.NewError("EOF")
		return
	}
	// Skip leading white space, stopping at a newline so it becomes a
	// token of its own.
	for self.index < len(self.content) {
		r, l := utf8.DecodeRune(self.content[self.index:])
		if !unicode.IsSpace(r) || r == '\n' {
			break
		}
		self.index += l
	}
	// j marks the token start; inchar tracks '...' quoting; incode is
	// the brace-nesting depth inside a { } code block.
	j, ttype, inchar, incode := self.index, other, false, 0
	for self.index < len(self.content) {
		r, l := utf8.DecodeRune(self.content[self.index:])
		if r == '\'' {
			inchar = !inchar
		}
		if self.index == j {
			// The first rune decides the token type.
			switch {
			case unicode.IsUpper(r):
				ttype = nonterm
			case r == '\n':
				self.index++
				ttype = newline
				// NOTE(review): this break only exits the switch, not
				// the loop; the loop still terminates below because
				// '\n' is white space.
				break
			case r == ':':
				ttype = begindef
			case r == ';':
				ttype = enddef
			case r == '|':
				ttype = alternate
			case r == '{' && memorizeTerms:
				incode++
				ttype = code
			default:
				ttype = term
			}
		} else if incode > 0 && r == '{' {
			incode++
		} else if incode > 0 && r == '}' {
			incode--
		}
		// Unquoted white space outside a code block ends the token.
		if incode == 0 && !inchar && unicode.IsSpace(r) {
			break
		}
		self.index += l
	}
	token := string(self.content[j:self.index])
	if ttype == newline {
		// Newline tokens carry no text.
		token = ""
	}
	word = tok{token, ttype}
	return
}
func (w *wordsStruct) addChar(ch rune) { if unicode.IsSpace(ch) && w.inWord { if len(w.word) != 0 { w.words = append(w.words, w.word) w.word = "" w.inWord = false } } else if !unicode.IsSpace(ch) { w.addRawChar(ch) } }