611 lines
15 KiB
Go
611 lines
15 KiB
Go
// Copyright 2011 The Go Authors. All rights reserved.
|
|
// Use of this source code is governed by a BSD-style
|
|
// license that can be found in the LICENSE file.
|
|
|
|
// Note: the file data_test.go that is generated should not be checked in.
|
|
//go:generate go run maketables.go triegen.go
|
|
//go:generate go test -tags test
|
|
|
|
// Package norm contains types and functions for normalizing Unicode strings.
|
|
package norm // import "golang.org/x/text/unicode/norm"
|
|
|
|
import (
|
|
"unicode/utf8"
|
|
|
|
"golang.org/x/text/transform"
|
|
)
|
|
|
|
// A Form denotes a canonical representation of Unicode code points.
|
|
// The Unicode-defined normalization and equivalence forms are:
|
|
//
|
|
// NFC Unicode Normalization Form C
|
|
// NFD Unicode Normalization Form D
|
|
// NFKC Unicode Normalization Form KC
|
|
// NFKD Unicode Normalization Form KD
|
|
//
|
|
// For a Form f, this documentation uses the notation f(x) to mean
|
|
// the bytes or string x converted to the given form.
|
|
// A position n in x is called a boundary if conversion to the form can
|
|
// proceed independently on both sides:
|
|
//
|
|
// f(x) == append(f(x[0:n]), f(x[n:])...)
|
|
//
|
|
// References: https://unicode.org/reports/tr15/ and
|
|
// https://unicode.org/notes/tn5/.
|
|
type Form int
|
|
|
|
const (
|
|
NFC Form = iota
|
|
NFD
|
|
NFKC
|
|
NFKD
|
|
)
|
|
|
|
// Bytes returns f(b). May return b if f(b) = b.
|
|
func (f Form) Bytes(b []byte) []byte {
|
|
src := inputBytes(b)
|
|
ft := formTable[f]
|
|
n, ok := ft.quickSpan(src, 0, len(b), true)
|
|
if ok {
|
|
return b
|
|
}
|
|
out := make([]byte, n, len(b))
|
|
copy(out, b[0:n])
|
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
|
|
return doAppendInner(&rb, n)
|
|
}
|
|
|
|
// String returns f(s).
|
|
func (f Form) String(s string) string {
|
|
src := inputString(s)
|
|
ft := formTable[f]
|
|
n, ok := ft.quickSpan(src, 0, len(s), true)
|
|
if ok {
|
|
return s
|
|
}
|
|
out := make([]byte, n, len(s))
|
|
copy(out, s[0:n])
|
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
|
|
return string(doAppendInner(&rb, n))
|
|
}
|
|
|
|
// IsNormal returns true if b == f(b).
|
|
func (f Form) IsNormal(b []byte) bool {
|
|
src := inputBytes(b)
|
|
ft := formTable[f]
|
|
bp, ok := ft.quickSpan(src, 0, len(b), true)
|
|
if ok {
|
|
return true
|
|
}
|
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
|
|
rb.setFlusher(nil, cmpNormalBytes)
|
|
for bp < len(b) {
|
|
rb.out = b[bp:]
|
|
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
|
|
return false
|
|
}
|
|
bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
|
|
}
|
|
return true
|
|
}
|
|
|
|
func cmpNormalBytes(rb *reorderBuffer) bool {
|
|
b := rb.out
|
|
for i := 0; i < rb.nrune; i++ {
|
|
info := rb.rune[i]
|
|
if int(info.size) > len(b) {
|
|
return false
|
|
}
|
|
p := info.pos
|
|
pe := p + info.size
|
|
for ; p < pe; p++ {
|
|
if b[0] != rb.byte[p] {
|
|
return false
|
|
}
|
|
b = b[1:]
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
// IsNormalString returns true if s == f(s).
|
|
func (f Form) IsNormalString(s string) bool {
|
|
src := inputString(s)
|
|
ft := formTable[f]
|
|
bp, ok := ft.quickSpan(src, 0, len(s), true)
|
|
if ok {
|
|
return true
|
|
}
|
|
rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
|
|
rb.setFlusher(nil, func(rb *reorderBuffer) bool {
|
|
for i := 0; i < rb.nrune; i++ {
|
|
info := rb.rune[i]
|
|
if bp+int(info.size) > len(s) {
|
|
return false
|
|
}
|
|
p := info.pos
|
|
pe := p + info.size
|
|
for ; p < pe; p++ {
|
|
if s[bp] != rb.byte[p] {
|
|
return false
|
|
}
|
|
bp++
|
|
}
|
|
}
|
|
return true
|
|
})
|
|
for bp < len(s) {
|
|
if bp = decomposeSegment(&rb, bp, true); bp < 0 {
|
|
return false
|
|
}
|
|
bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
|
|
}
|
|
return true
|
|
}
|
|
|
|
// patchTail fixes a case where a rune may be incorrectly normalized
|
|
// if it is followed by illegal continuation bytes. It returns the
|
|
// patched buffer and whether the decomposition is still in progress.
|
|
func patchTail(rb *reorderBuffer) bool {
|
|
info, p := lastRuneStart(&rb.f, rb.out)
|
|
if p == -1 || info.size == 0 {
|
|
return true
|
|
}
|
|
end := p + int(info.size)
|
|
extra := len(rb.out) - end
|
|
if extra > 0 {
|
|
// Potentially allocating memory. However, this only
|
|
// happens with ill-formed UTF-8.
|
|
x := make([]byte, 0)
|
|
x = append(x, rb.out[len(rb.out)-extra:]...)
|
|
rb.out = rb.out[:end]
|
|
decomposeToLastBoundary(rb)
|
|
rb.doFlush()
|
|
rb.out = append(rb.out, x...)
|
|
return false
|
|
}
|
|
buf := rb.out[p:]
|
|
rb.out = rb.out[:p]
|
|
decomposeToLastBoundary(rb)
|
|
if s := rb.ss.next(info); s == ssStarter {
|
|
rb.doFlush()
|
|
rb.ss.first(info)
|
|
} else if s == ssOverflow {
|
|
rb.doFlush()
|
|
rb.insertCGJ()
|
|
rb.ss = 0
|
|
}
|
|
rb.insertUnsafe(inputBytes(buf), 0, info)
|
|
return true
|
|
}
|
|
|
|
func appendQuick(rb *reorderBuffer, i int) int {
|
|
if rb.nsrc == i {
|
|
return i
|
|
}
|
|
end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
|
|
rb.out = rb.src.appendSlice(rb.out, i, end)
|
|
return end
|
|
}
|
|
|
|
// Append returns f(append(out, b...)).
|
|
// The buffer out must be nil, empty, or equal to f(out).
|
|
func (f Form) Append(out []byte, src ...byte) []byte {
|
|
return f.doAppend(out, inputBytes(src), len(src))
|
|
}
|
|
|
|
func (f Form) doAppend(out []byte, src input, n int) []byte {
|
|
if n == 0 {
|
|
return out
|
|
}
|
|
ft := formTable[f]
|
|
// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
|
|
if len(out) == 0 {
|
|
p, _ := ft.quickSpan(src, 0, n, true)
|
|
out = src.appendSlice(out, 0, p)
|
|
if p == n {
|
|
return out
|
|
}
|
|
rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
|
|
return doAppendInner(&rb, p)
|
|
}
|
|
rb := reorderBuffer{f: *ft, src: src, nsrc: n}
|
|
return doAppend(&rb, out, 0)
|
|
}
|
|
|
|
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
|
|
rb.setFlusher(out, appendFlush)
|
|
src, n := rb.src, rb.nsrc
|
|
doMerge := len(out) > 0
|
|
if q := src.skipContinuationBytes(p); q > p {
|
|
// Move leading non-starters to destination.
|
|
rb.out = src.appendSlice(rb.out, p, q)
|
|
p = q
|
|
doMerge = patchTail(rb)
|
|
}
|
|
fd := &rb.f
|
|
if doMerge {
|
|
var info Properties
|
|
if p < n {
|
|
info = fd.info(src, p)
|
|
if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
|
|
if p == 0 {
|
|
decomposeToLastBoundary(rb)
|
|
}
|
|
p = decomposeSegment(rb, p, true)
|
|
}
|
|
}
|
|
if info.size == 0 {
|
|
rb.doFlush()
|
|
// Append incomplete UTF-8 encoding.
|
|
return src.appendSlice(rb.out, p, n)
|
|
}
|
|
if rb.nrune > 0 {
|
|
return doAppendInner(rb, p)
|
|
}
|
|
}
|
|
p = appendQuick(rb, p)
|
|
return doAppendInner(rb, p)
|
|
}
|
|
|
|
func doAppendInner(rb *reorderBuffer, p int) []byte {
|
|
for n := rb.nsrc; p < n; {
|
|
p = decomposeSegment(rb, p, true)
|
|
p = appendQuick(rb, p)
|
|
}
|
|
return rb.out
|
|
}
|
|
|
|
// AppendString returns f(append(out, []byte(s))).
|
|
// The buffer out must be nil, empty, or equal to f(out).
|
|
func (f Form) AppendString(out []byte, src string) []byte {
|
|
return f.doAppend(out, inputString(src), len(src))
|
|
}
|
|
|
|
// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
|
|
// It is not guaranteed to return the largest such n.
|
|
func (f Form) QuickSpan(b []byte) int {
|
|
n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
|
|
return n
|
|
}
|
|
|
|
// Span implements transform.SpanningTransformer. It returns a boundary n such
|
|
// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
|
|
func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
|
|
n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
|
|
if n < len(b) {
|
|
if !ok {
|
|
err = transform.ErrEndOfSpan
|
|
} else {
|
|
err = transform.ErrShortSrc
|
|
}
|
|
}
|
|
return n, err
|
|
}
|
|
|
|
// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
|
|
// It is not guaranteed to return the largest such n.
|
|
func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
|
|
n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
|
|
if n < len(s) {
|
|
if !ok {
|
|
err = transform.ErrEndOfSpan
|
|
} else {
|
|
err = transform.ErrShortSrc
|
|
}
|
|
}
|
|
return n, err
|
|
}
|
|
|
|
// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
|
|
// whether any non-normalized parts were found. If atEOF is false, n will
|
|
// not point past the last segment if this segment might be become
|
|
// non-normalized by appending other runes.
|
|
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
|
|
var lastCC uint8
|
|
ss := streamSafe(0)
|
|
lastSegStart := i
|
|
for n = end; i < n; {
|
|
if j := src.skipASCII(i, n); i != j {
|
|
i = j
|
|
lastSegStart = i - 1
|
|
lastCC = 0
|
|
ss = 0
|
|
continue
|
|
}
|
|
info := f.info(src, i)
|
|
if info.size == 0 {
|
|
if atEOF {
|
|
// include incomplete runes
|
|
return n, true
|
|
}
|
|
return lastSegStart, true
|
|
}
|
|
// This block needs to be before the next, because it is possible to
|
|
// have an overflow for runes that are starters (e.g. with U+FF9E).
|
|
switch ss.next(info) {
|
|
case ssStarter:
|
|
lastSegStart = i
|
|
case ssOverflow:
|
|
return lastSegStart, false
|
|
case ssSuccess:
|
|
if lastCC > info.ccc {
|
|
return lastSegStart, false
|
|
}
|
|
}
|
|
if f.composing {
|
|
if !info.isYesC() {
|
|
break
|
|
}
|
|
} else {
|
|
if !info.isYesD() {
|
|
break
|
|
}
|
|
}
|
|
lastCC = info.ccc
|
|
i += int(info.size)
|
|
}
|
|
if i == n {
|
|
if !atEOF {
|
|
n = lastSegStart
|
|
}
|
|
return n, true
|
|
}
|
|
return lastSegStart, false
|
|
}
|
|
|
|
// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
|
|
// It is not guaranteed to return the largest such n.
|
|
func (f Form) QuickSpanString(s string) int {
|
|
n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
|
|
return n
|
|
}
|
|
|
|
// FirstBoundary returns the position i of the first boundary in b
|
|
// or -1 if b contains no boundary.
|
|
func (f Form) FirstBoundary(b []byte) int {
|
|
return f.firstBoundary(inputBytes(b), len(b))
|
|
}
|
|
|
|
func (f Form) firstBoundary(src input, nsrc int) int {
|
|
i := src.skipContinuationBytes(0)
|
|
if i >= nsrc {
|
|
return -1
|
|
}
|
|
fd := formTable[f]
|
|
ss := streamSafe(0)
|
|
// We should call ss.first here, but we can't as the first rune is
|
|
// skipped already. This means FirstBoundary can't really determine
|
|
// CGJ insertion points correctly. Luckily it doesn't have to.
|
|
for {
|
|
info := fd.info(src, i)
|
|
if info.size == 0 {
|
|
return -1
|
|
}
|
|
if s := ss.next(info); s != ssSuccess {
|
|
return i
|
|
}
|
|
i += int(info.size)
|
|
if i >= nsrc {
|
|
if !info.BoundaryAfter() && !ss.isMax() {
|
|
return -1
|
|
}
|
|
return nsrc
|
|
}
|
|
}
|
|
}
|
|
|
|
// FirstBoundaryInString returns the position i of the first boundary in s
|
|
// or -1 if s contains no boundary.
|
|
func (f Form) FirstBoundaryInString(s string) int {
|
|
return f.firstBoundary(inputString(s), len(s))
|
|
}
|
|
|
|
// NextBoundary reports the index of the boundary between the first and next
|
|
// segment in b or -1 if atEOF is false and there are not enough bytes to
|
|
// determine this boundary.
|
|
func (f Form) NextBoundary(b []byte, atEOF bool) int {
|
|
return f.nextBoundary(inputBytes(b), len(b), atEOF)
|
|
}
|
|
|
|
// NextBoundaryInString reports the index of the boundary between the first and
|
|
// next segment in b or -1 if atEOF is false and there are not enough bytes to
|
|
// determine this boundary.
|
|
func (f Form) NextBoundaryInString(s string, atEOF bool) int {
|
|
return f.nextBoundary(inputString(s), len(s), atEOF)
|
|
}
|
|
|
|
func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
|
|
if nsrc == 0 {
|
|
if atEOF {
|
|
return 0
|
|
}
|
|
return -1
|
|
}
|
|
fd := formTable[f]
|
|
info := fd.info(src, 0)
|
|
if info.size == 0 {
|
|
if atEOF {
|
|
return 1
|
|
}
|
|
return -1
|
|
}
|
|
ss := streamSafe(0)
|
|
ss.first(info)
|
|
|
|
for i := int(info.size); i < nsrc; i += int(info.size) {
|
|
info = fd.info(src, i)
|
|
if info.size == 0 {
|
|
if atEOF {
|
|
return i
|
|
}
|
|
return -1
|
|
}
|
|
// TODO: Using streamSafe to determine the boundary isn't the same as
|
|
// using BoundaryBefore. Determine which should be used.
|
|
if s := ss.next(info); s != ssSuccess {
|
|
return i
|
|
}
|
|
}
|
|
if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
|
|
return -1
|
|
}
|
|
return nsrc
|
|
}
|
|
|
|
// LastBoundary returns the position i of the last boundary in b
|
|
// or -1 if b contains no boundary.
|
|
func (f Form) LastBoundary(b []byte) int {
|
|
return lastBoundary(formTable[f], b)
|
|
}
|
|
|
|
func lastBoundary(fd *formInfo, b []byte) int {
|
|
i := len(b)
|
|
info, p := lastRuneStart(fd, b)
|
|
if p == -1 {
|
|
return -1
|
|
}
|
|
if info.size == 0 { // ends with incomplete rune
|
|
if p == 0 { // starts with incomplete rune
|
|
return -1
|
|
}
|
|
i = p
|
|
info, p = lastRuneStart(fd, b[:i])
|
|
if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
|
|
return i
|
|
}
|
|
}
|
|
if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
|
|
return i
|
|
}
|
|
if info.BoundaryAfter() {
|
|
return i
|
|
}
|
|
ss := streamSafe(0)
|
|
v := ss.backwards(info)
|
|
for i = p; i >= 0 && v != ssStarter; i = p {
|
|
info, p = lastRuneStart(fd, b[:i])
|
|
if v = ss.backwards(info); v == ssOverflow {
|
|
break
|
|
}
|
|
if p+int(info.size) != i {
|
|
if p == -1 { // no boundary found
|
|
return -1
|
|
}
|
|
return i // boundary after an illegal UTF-8 encoding
|
|
}
|
|
}
|
|
return i
|
|
}
|
|
|
|
// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
|
|
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
|
|
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
|
|
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
|
|
// Force one character to be consumed.
|
|
info := rb.f.info(rb.src, sp)
|
|
if info.size == 0 {
|
|
return 0
|
|
}
|
|
if s := rb.ss.next(info); s == ssStarter {
|
|
// TODO: this could be removed if we don't support merging.
|
|
if rb.nrune > 0 {
|
|
goto end
|
|
}
|
|
} else if s == ssOverflow {
|
|
rb.insertCGJ()
|
|
goto end
|
|
}
|
|
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
|
|
return int(err)
|
|
}
|
|
for {
|
|
sp += int(info.size)
|
|
if sp >= rb.nsrc {
|
|
if !atEOF && !info.BoundaryAfter() {
|
|
return int(iShortSrc)
|
|
}
|
|
break
|
|
}
|
|
info = rb.f.info(rb.src, sp)
|
|
if info.size == 0 {
|
|
if !atEOF {
|
|
return int(iShortSrc)
|
|
}
|
|
break
|
|
}
|
|
if s := rb.ss.next(info); s == ssStarter {
|
|
break
|
|
} else if s == ssOverflow {
|
|
rb.insertCGJ()
|
|
break
|
|
}
|
|
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
|
|
return int(err)
|
|
}
|
|
}
|
|
end:
|
|
if !rb.doFlush() {
|
|
return int(iShortDst)
|
|
}
|
|
return sp
|
|
}
|
|
|
|
// lastRuneStart returns the runeInfo and position of the last
|
|
// rune in buf or the zero runeInfo and -1 if no rune was found.
|
|
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
|
|
p := len(buf) - 1
|
|
for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
|
|
}
|
|
if p < 0 {
|
|
return Properties{}, -1
|
|
}
|
|
return fd.info(inputBytes(buf), p), p
|
|
}
|
|
|
|
// decomposeToLastBoundary finds an open segment at the end of the buffer
|
|
// and scans it into rb. Returns the buffer minus the last segment.
|
|
func decomposeToLastBoundary(rb *reorderBuffer) {
|
|
fd := &rb.f
|
|
info, i := lastRuneStart(fd, rb.out)
|
|
if int(info.size) != len(rb.out)-i {
|
|
// illegal trailing continuation bytes
|
|
return
|
|
}
|
|
if info.BoundaryAfter() {
|
|
return
|
|
}
|
|
var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
|
|
padd := 0
|
|
ss := streamSafe(0)
|
|
p := len(rb.out)
|
|
for {
|
|
add[padd] = info
|
|
v := ss.backwards(info)
|
|
if v == ssOverflow {
|
|
// Note that if we have an overflow, it the string we are appending to
|
|
// is not correctly normalized. In this case the behavior is undefined.
|
|
break
|
|
}
|
|
padd++
|
|
p -= int(info.size)
|
|
if v == ssStarter || p < 0 {
|
|
break
|
|
}
|
|
info, i = lastRuneStart(fd, rb.out[:p])
|
|
if int(info.size) != p-i {
|
|
break
|
|
}
|
|
}
|
|
rb.ss = ss
|
|
// Copy bytes for insertion as we may need to overwrite rb.out.
|
|
var buf [maxBufferSize * utf8.UTFMax]byte
|
|
cp := buf[:copy(buf[:], rb.out[p:])]
|
|
rb.out = rb.out[:p]
|
|
for padd--; padd >= 0; padd-- {
|
|
info = add[padd]
|
|
rb.insertUnsafe(inputBytes(cp), 0, info)
|
|
cp = cp[info.size:]
|
|
}
|
|
}
|