forked from extemporalgenome/slug
/
bytes.go
116 lines (110 loc) · 2.54 KB
/
bytes.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package slug
import (
"encoding/hex"
"unicode"
"unicode/utf8"
"golang.org/x/text/unicode/norm"
)
// SlugBytes replaces each run of characters which are not unicode letters or
// numbers with a single hyphen, except for leading or trailing runs, which
// are dropped. Letters are stripped of diacritical marks and lowercased.
// Letter or number codepoints that do not have combining marks or a
// lower-cased variant are passed through unaltered.
//
// The input is NFKD-normalized before processing. The returned slice never
// starts or ends with a hyphen.
func SlugBytes(s []byte) []byte {
	s = norm.NFKD.Bytes(s)
	var enc [utf8.UTFMax]byte // scratch space for re-encoding lowercased runes
	buf := make([]byte, 0, len(s))
	// dash reports whether a hyphen may be emitted for the next separator
	// run; it is only set after a kept rune, which suppresses leading and
	// repeated hyphens.
	dash := false
	for len(s) > 0 {
		r, i := utf8.DecodeRune(s)
		switch {
		// unicode 'letters' like mandarin characters pass through
		case unicode.IsOneOf(lat, r):
			if lower := unicode.ToLower(r); lower != r {
				// The lowercase form can have a different encoded length
				// than the original, so it is re-encoded rather than
				// patched in place.
				n := utf8.EncodeRune(enc[:], lower)
				buf = append(buf, enc[:n]...)
			} else {
				// No lower-cased variant: copy the original bytes verbatim.
				buf = append(buf, s[:i]...)
			}
			dash = true
		case unicode.IsOneOf(nop, r):
			// skip: dropped without ending the current run of kept runes
		case dash:
			buf = append(buf, '-')
			dash = false
		}
		s = s[i:]
	}
	// A separator run at the very end leaves one trailing hyphen; remove it.
	if i := len(buf) - 1; i >= 0 && buf[i] == '-' {
		buf = buf[:i]
	}
	return buf
}
// SlugAsciiBytes behaves like SlugBytes but produces ASCII-only output: every
// kept letter or number is lowercased, and any that still lies outside the
// ASCII range after lowercasing is written as the lowercase hex encoding of
// its UTF-8 bytes. Consecutive hex-encoded runes are joined into one run, and
// each such run is separated from neighbouring ASCII text by a hyphen. As
// with SlugBytes, the result never begins or ends with a hyphen.
func SlugAsciiBytes(s []byte) []byte {
	src := norm.NFKD.Bytes(s)
	var (
		raw     [utf8.UTFMax]byte     // UTF-8 bytes of the current rune
		hexed   [2 * utf8.UTFMax]byte // hex expansion of raw
		out     = make([]byte, 0, len(src))
		pending = false // a kept rune was written since the last hyphen run
		ascii   = true  // the previous kept rune was written as plain ASCII
	)
	for pos := 0; pos < len(src); {
		r, size := utf8.DecodeRune(src[pos:])
		pos += size

		if unicode.IsOneOf(lat, r) {
			r = unicode.ToLower(r)
			if r < utf8.RuneSelf {
				// Leaving a hex run: delimit it from the ASCII that follows.
				if !ascii {
					out = append(out, '-')
				}
				out = append(out, byte(r))
				ascii = true
			} else {
				// Entering a hex run directly after ASCII text: delimit it.
				if ascii && pending {
					out = append(out, '-')
				}
				n := utf8.EncodeRune(raw[:], r)
				n = hex.Encode(hexed[:], raw[:n])
				out = append(out, hexed[:n]...)
				ascii = false
			}
			pending = true
			continue
		}

		if unicode.IsOneOf(nop, r) {
			// Dropped silently; does not interrupt the current run.
			continue
		}

		if pending {
			// First separator after kept output: collapse the run to one
			// hyphen and reset state so no second hyphen is added.
			out = append(out, '-')
			pending, ascii = false, true
		}
	}
	if n := len(out); n > 0 && out[n-1] == '-' {
		out = out[:n-1]
	}
	return out
}
// IsSlugAsciiBytes is equivalent to IsSlugAscii. It reports whether s is a
// non-empty sequence of lowercase ASCII letters, ASCII digits and hyphens in
// which no hyphen appears first, last, or directly after another hyphen.
func IsSlugAsciiBytes(s []byte) bool {
	// prevHyphen starts true so that a leading hyphen and the empty input
	// are both rejected by the same checks used for doubled/trailing hyphens.
	prevHyphen := true
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c == '-' {
			if prevHyphen {
				return false
			}
			prevHyphen = true
			continue
		}
		isLower := c >= 'a' && c <= 'z'
		isDigit := c >= '0' && c <= '9'
		if !isLower && !isDigit {
			return false
		}
		prevHyphen = false
	}
	return !prevHyphen
}