600 lines
17 KiB
Go
600 lines
17 KiB
Go
// Copyright 2014 The Go Authors. All rights reserved.
|
||
// Use of this source code is governed by a BSD-style
|
||
// license that can be found in the LICENSE file.
|
||
|
||
package cases
|
||
|
||
// This file contains the definitions of case mappings for all supported
|
||
// languages. The rules for the language-specific tailorings were taken and
|
||
// modified from the CLDR transform definitions in common/transforms.
|
||
|
||
import (
|
||
"strings"
|
||
"unicode"
|
||
"unicode/utf8"
|
||
|
||
"golang.org/x/text/language"
|
||
"golang.org/x/text/transform"
|
||
"golang.org/x/text/unicode/norm"
|
||
)
|
||
|
||
// A mapFunc takes a context set to the current rune and writes the mapped
|
||
// version to the same context. It may advance the context to the next rune. It
|
||
// returns whether a checkpoint is possible: whether the pDst bytes written to
|
||
// dst so far won't need changing as we see more source bytes.
|
||
type mapFunc func(*context) bool
|
||
|
||
const (
	// maxIgnorable is the upper bound on the number of ignorables that
	// lookahead operations will consider.
	maxIgnorable = 30

	// supported is the space-separated list of language tags for which a
	// tailoring exists.
	supported = "und af az el lt nl tr"
)
|
||
|
||
func init() {
|
||
tags := []language.Tag{}
|
||
for _, s := range strings.Split(supported, " ") {
|
||
tags = append(tags, language.MustParse(s))
|
||
}
|
||
matcher = language.NewMatcher(tags)
|
||
Supported = language.NewCoverage(tags)
|
||
}
|
||
|
||
var (
|
||
matcher language.Matcher
|
||
|
||
Supported language.Coverage
|
||
|
||
// We keep the following lists separate, instead of having a single per-
|
||
// language struct, to give the compiler a chance to remove unused code.
|
||
|
||
// Some uppercase mappers are stateless, so we can precompute the
|
||
// Transformers and save a bit on runtime allocations.
|
||
upperFunc = []mapFunc{
|
||
nil, // und
|
||
nil, // af
|
||
aztrUpper(upper), // az
|
||
elUpper, // el
|
||
ltUpper(upper), // lt
|
||
nil, // nl
|
||
aztrUpper(upper), // tr
|
||
}
|
||
|
||
undUpper transform.Transformer = &undUpperCaser{}
|
||
|
||
lowerFunc = []mapFunc{
|
||
lower, // und
|
||
lower, // af
|
||
aztrLower, // az
|
||
lower, // el
|
||
ltLower, // lt
|
||
lower, // nl
|
||
aztrLower, // tr
|
||
}
|
||
|
||
titleInfos = []struct {
|
||
title, lower mapFunc
|
||
rewrite func(*context)
|
||
}{
|
||
{title, lower, nil}, // und
|
||
{title, lower, afnlRewrite}, // af
|
||
{aztrUpper(title), aztrLower, nil}, // az
|
||
{title, lower, nil}, // el
|
||
{ltUpper(title), ltLower, nil}, // lt
|
||
{nlTitle, lower, afnlRewrite}, // nl
|
||
{aztrUpper(title), aztrLower, nil}, // tr
|
||
}
|
||
)
|
||
|
||
func makeUpper(t language.Tag, o options) transform.Transformer {
|
||
_, i, _ := matcher.Match(t)
|
||
f := upperFunc[i]
|
||
if f == nil {
|
||
return undUpper
|
||
}
|
||
return &simpleCaser{f: f}
|
||
}
|
||
|
||
func makeLower(t language.Tag, o options) transform.Transformer {
|
||
_, i, _ := matcher.Match(t)
|
||
f := lowerFunc[i]
|
||
if o.noFinalSigma {
|
||
return &simpleCaser{f: f}
|
||
}
|
||
return &lowerCaser{
|
||
first: f,
|
||
midWord: finalSigma(f),
|
||
}
|
||
}
|
||
|
||
func makeTitle(t language.Tag, o options) transform.Transformer {
|
||
_, i, _ := matcher.Match(t)
|
||
x := &titleInfos[i]
|
||
lower := x.lower
|
||
if o.noLower {
|
||
lower = (*context).copy
|
||
} else if !o.noFinalSigma {
|
||
lower = finalSigma(lower)
|
||
}
|
||
return &titleCaser{
|
||
title: x.title,
|
||
lower: lower,
|
||
rewrite: x.rewrite,
|
||
}
|
||
}
|
||
|
||
// TODO: consider a similar special case for the fast majority lower case. This
|
||
// is a bit more involved so will require some more precise benchmarking to
|
||
// justify it.
|
||
|
||
type undUpperCaser struct{ transform.NopResetter }
|
||
|
||
// undUpperCaser implements the Transformer interface for doing an upper case
|
||
// mapping for the root locale (und). It eliminates the need for an allocation
|
||
// as it prevents escaping by not using function pointers.
|
||
func (t *undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
c := context{dst: dst, src: src, atEOF: atEOF}
|
||
for c.next() {
|
||
upper(&c)
|
||
c.checkpoint()
|
||
}
|
||
return c.ret()
|
||
}
|
||
|
||
type simpleCaser struct {
|
||
context
|
||
f mapFunc
|
||
}
|
||
|
||
// simpleCaser implements the Transformer interface for doing a case operation
|
||
// on a rune-by-rune basis.
|
||
func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
t.context = context{dst: dst, src: src, atEOF: atEOF}
|
||
c := &t.context
|
||
for c.next() && t.f(c) {
|
||
c.checkpoint()
|
||
}
|
||
return c.ret()
|
||
}
|
||
|
||
// lowerCaser implements the Transformer interface. The default Unicode lower
|
||
// casing requires different treatment for the first and subsequent characters
|
||
// of a word, most notably to handle the Greek final Sigma.
|
||
type lowerCaser struct {
|
||
context
|
||
|
||
first, midWord mapFunc
|
||
}
|
||
|
||
func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
t.context = context{dst: dst, src: src, atEOF: atEOF}
|
||
c := &t.context
|
||
|
||
for isInterWord := true; c.next(); {
|
||
if isInterWord {
|
||
if c.info.isCased() {
|
||
if !t.first(c) {
|
||
break
|
||
}
|
||
isInterWord = false
|
||
} else if !c.copy() {
|
||
break
|
||
}
|
||
} else {
|
||
if c.info.isNotCasedAndNotCaseIgnorable() {
|
||
if !c.copy() {
|
||
break
|
||
}
|
||
isInterWord = true
|
||
} else if !t.midWord(c) {
|
||
break
|
||
}
|
||
}
|
||
c.checkpoint()
|
||
}
|
||
return c.ret()
|
||
}
|
||
|
||
// titleCaser implements the Transformer interface. Title casing algorithms
|
||
// distinguish between the first letter of a word and subsequent letters of the
|
||
// same word. It uses state to avoid requiring a potentially infinite lookahead.
|
||
type titleCaser struct {
|
||
context
|
||
|
||
// rune mappings used by the actual casing algorithms.
|
||
title, lower mapFunc
|
||
|
||
rewrite func(*context)
|
||
}
|
||
|
||
// Transform implements the standard Unicode title case algorithm as defined in
|
||
// Chapter 3 of The Unicode Standard:
|
||
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard
|
||
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the
|
||
// first cased character F following the word boundary. If F exists, map F to
|
||
// Titlecase_Mapping(F); then map all characters C between F and the following
|
||
// word boundary to Lowercase_Mapping(C).
|
||
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
|
||
c := &t.context
|
||
|
||
if !c.next() {
|
||
return c.ret()
|
||
}
|
||
|
||
for {
|
||
p := c.info
|
||
if t.rewrite != nil {
|
||
t.rewrite(c)
|
||
}
|
||
|
||
wasMid := p.isCaseIgnorableAndNonBreakStarter()
|
||
// Break out of this loop on failure to ensure we do not modify the
|
||
// state incorrectly.
|
||
if p.isCased() && !p.isCaseIgnorableAndNotCased() {
|
||
if !c.isMidWord {
|
||
if !t.title(c) {
|
||
break
|
||
}
|
||
c.isMidWord = true
|
||
} else if !t.lower(c) {
|
||
break
|
||
}
|
||
} else if !c.copy() {
|
||
break
|
||
}
|
||
|
||
// TODO: make this an "else if" if we can prove that no rune that does
|
||
// not match the first condition of the if statement can be a break.
|
||
if p.isBreak() {
|
||
c.isMidWord = false
|
||
}
|
||
|
||
// As we save the state of the transformer, it is safe to call
|
||
// checkpoint after any successful write.
|
||
c.checkpoint()
|
||
|
||
if !c.next() {
|
||
break
|
||
}
|
||
if wasMid && c.info.isCaseIgnorableAndNonBreakStarter() {
|
||
c.isMidWord = false
|
||
}
|
||
}
|
||
return c.ret()
|
||
}
|
||
|
||
// finalSigma adds Greek final Sigma handing to another casing function. It
|
||
// determines whether a lowercased sigma should be σ or ς, by looking ahead for
|
||
// case-ignorables and a cased letters.
|
||
func finalSigma(f mapFunc) mapFunc {
|
||
return func(c *context) bool {
|
||
// ::NFD();
|
||
// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
|
||
// Σ } [:case-ignorable:]* [:cased:] → σ;
|
||
// [:cased:] [:case-ignorable:]* { Σ → ς;
|
||
// ::Any-Lower;
|
||
// ::NFC();
|
||
|
||
if !c.hasPrefix("Σ") {
|
||
return f(c)
|
||
}
|
||
|
||
p := c.pDst
|
||
c.writeString("ς")
|
||
// We need to do one more iteration after maxIgnorable, as a cased
|
||
// letter is not an ignorable and may modify the result.
|
||
for i := 0; i < maxIgnorable+1; i++ {
|
||
if !c.next() {
|
||
return false
|
||
}
|
||
if !c.info.isCaseIgnorable() {
|
||
if c.info.isCased() {
|
||
// p+1 is guaranteed to be in bounds: if writing ς was
|
||
// successful, p+1 will contain the second byte of ς. If not,
|
||
// this function will have returned after c.next returned false.
|
||
c.dst[p+1]++ // ς → σ
|
||
}
|
||
c.unreadRune()
|
||
return true
|
||
}
|
||
// A case ignorable may also introduce a word break, so we may need
|
||
// to continue searching even after detecting a break.
|
||
c.isMidWord = c.isMidWord && !c.info.isBreak()
|
||
c.copy()
|
||
}
|
||
return true
|
||
}
|
||
}
|
||
|
||
// elUpper implements Greek upper casing, which entails removing a predefined
|
||
// set of non-blocked modifiers. Note that these accents should not be removed
|
||
// for title casing!
|
||
// Example: "Οδός" -> "ΟΔΟΣ".
|
||
func elUpper(c *context) bool {
|
||
// From CLDR:
|
||
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ;
|
||
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;
|
||
|
||
r, _ := utf8.DecodeRune(c.src[c.pSrc:])
|
||
oldPDst := c.pDst
|
||
if !upper(c) {
|
||
return false
|
||
}
|
||
if !unicode.Is(unicode.Greek, r) {
|
||
return true
|
||
}
|
||
i := 0
|
||
// Take the properties of the uppercased rune that is already written to the
|
||
// destination. This saves us the trouble of having to uppercase the
|
||
// decomposed rune again.
|
||
if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil {
|
||
// Restore the destination position and process the decomposed rune.
|
||
r, sz := utf8.DecodeRune(b)
|
||
if r <= 0xFF { // See A.6.1
|
||
return true
|
||
}
|
||
c.pDst = oldPDst
|
||
// Insert the first rune and ignore the modifiers. See A.6.2.
|
||
c.writeBytes(b[:sz])
|
||
i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
|
||
}
|
||
|
||
for ; i < maxIgnorable && c.next(); i++ {
|
||
switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r {
|
||
// Above and Iota Subscript
|
||
case 0x0300, // U+0300 COMBINING GRAVE ACCENT
|
||
0x0301, // U+0301 COMBINING ACUTE ACCENT
|
||
0x0304, // U+0304 COMBINING MACRON
|
||
0x0306, // U+0306 COMBINING BREVE
|
||
0x0308, // U+0308 COMBINING DIAERESIS
|
||
0x0313, // U+0313 COMBINING COMMA ABOVE
|
||
0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
|
||
0x0342, // U+0342 COMBINING GREEK PERISPOMENI
|
||
0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI
|
||
// No-op. Gobble the modifier.
|
||
|
||
default:
|
||
switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
|
||
case cccZero:
|
||
c.unreadRune()
|
||
return true
|
||
|
||
// We don't need to test for IotaSubscript as the only rune that
|
||
// qualifies (U+0345) was already excluded in the switch statement
|
||
// above. See A.4.
|
||
|
||
case cccAbove:
|
||
return c.copy()
|
||
default:
|
||
// Some other modifier. We're still allowed to gobble Greek
|
||
// modifiers after this.
|
||
c.copy()
|
||
}
|
||
}
|
||
}
|
||
return i == maxIgnorable
|
||
}
|
||
|
||
func ltLower(c *context) bool {
|
||
// From CLDR:
|
||
// # Introduce an explicit dot above when lowercasing capital I's and J's
|
||
// # whenever there are more accents above.
|
||
// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
|
||
// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
|
||
// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
|
||
// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
|
||
// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
|
||
// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
|
||
// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
|
||
// ::NFD();
|
||
// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307;
|
||
// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307;
|
||
// Į } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → į \u0307;
|
||
// Ì → i \u0307 \u0300;
|
||
// Í → i \u0307 \u0301;
|
||
// Ĩ → i \u0307 \u0303;
|
||
// ::Any-Lower();
|
||
// ::NFC();
|
||
|
||
i := 0
|
||
if r := c.src[c.pSrc]; r < utf8.RuneSelf {
|
||
lower(c)
|
||
if r != 'I' && r != 'J' {
|
||
return true
|
||
}
|
||
} else {
|
||
p := norm.NFD.Properties(c.src[c.pSrc:])
|
||
if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') {
|
||
// UTF-8 optimization: the decomposition will only have an above
|
||
// modifier if the last rune of the decomposition is in [U+300-U+311].
|
||
// In all other cases, a decomposition starting with I is always
|
||
// an I followed by modifiers that are not cased themselves. See A.2.
|
||
if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
|
||
if !c.writeBytes(d[:1]) {
|
||
return false
|
||
}
|
||
c.dst[c.pDst-1] += 'a' - 'A' // lower
|
||
|
||
// Assumption: modifier never changes on lowercase. See A.1.
|
||
// Assumption: all modifiers added have CCC = Above. See A.2.3.
|
||
return c.writeString("\u0307") && c.writeBytes(d[1:])
|
||
}
|
||
// In all other cases the additional modifiers will have a CCC
|
||
// that is less than 230 (Above). We will insert the U+0307, if
|
||
// needed, after these modifiers so that a string in FCD form
|
||
// will remain so. See A.2.2.
|
||
lower(c)
|
||
i = 1
|
||
} else {
|
||
return lower(c)
|
||
}
|
||
}
|
||
|
||
for ; i < maxIgnorable && c.next(); i++ {
|
||
switch c.info.cccType() {
|
||
case cccZero:
|
||
c.unreadRune()
|
||
return true
|
||
case cccAbove:
|
||
return c.writeString("\u0307") && c.copy() // See A.1.
|
||
default:
|
||
c.copy() // See A.1.
|
||
}
|
||
}
|
||
return i == maxIgnorable
|
||
}
|
||
|
||
func ltUpper(f mapFunc) mapFunc {
|
||
return func(c *context) bool {
|
||
// From CLDR:
|
||
// ::NFD();
|
||
// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ;
|
||
// ::Any-Upper();
|
||
// ::NFC();
|
||
|
||
// TODO: See A.5. A soft-dotted rune never has an exception. This would
|
||
// allow us to overload the exception bit and encode this property in
|
||
// info. Need to measure performance impact of this.
|
||
r, _ := utf8.DecodeRune(c.src[c.pSrc:])
|
||
oldPDst := c.pDst
|
||
if !f(c) {
|
||
return false
|
||
}
|
||
if !unicode.Is(unicode.Soft_Dotted, r) {
|
||
return true
|
||
}
|
||
|
||
// We don't need to do an NFD normalization, as a soft-dotted rune never
|
||
// contains U+0307. See A.3.
|
||
|
||
i := 0
|
||
for ; i < maxIgnorable && c.next(); i++ {
|
||
switch c.info.cccType() {
|
||
case cccZero:
|
||
c.unreadRune()
|
||
return true
|
||
case cccAbove:
|
||
if c.hasPrefix("\u0307") {
|
||
// We don't do a full NFC, but rather combine runes for
|
||
// some of the common cases. (Returning NFC or
|
||
// preserving normal form is neither a requirement nor
|
||
// a possibility anyway).
|
||
if !c.next() {
|
||
return false
|
||
}
|
||
if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
|
||
s := ""
|
||
switch c.src[c.pSrc+1] {
|
||
case 0x80: // U+0300 COMBINING GRAVE ACCENT
|
||
s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
|
||
case 0x81: // U+0301 COMBINING ACUTE ACCENT
|
||
s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
|
||
case 0x83: // U+0303 COMBINING TILDE
|
||
s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
|
||
case 0x88: // U+0308 COMBINING DIAERESIS
|
||
s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
|
||
default:
|
||
}
|
||
if s != "" {
|
||
c.pDst = oldPDst
|
||
return c.writeString(s)
|
||
}
|
||
}
|
||
}
|
||
return c.copy()
|
||
default:
|
||
c.copy()
|
||
}
|
||
}
|
||
return i == maxIgnorable
|
||
}
|
||
}
|
||
|
||
func aztrUpper(f mapFunc) mapFunc {
|
||
return func(c *context) bool {
|
||
// i→İ;
|
||
if c.src[c.pSrc] == 'i' {
|
||
return c.writeString("İ")
|
||
}
|
||
return f(c)
|
||
}
|
||
}
|
||
|
||
func aztrLower(c *context) (done bool) {
|
||
// From CLDR:
|
||
// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
|
||
// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE
|
||
// İ→i;
|
||
// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
|
||
// # This matches the behavior of the canonically equivalent I-dot_above
|
||
// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
|
||
// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
|
||
// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
|
||
// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ;
|
||
// I→ı ;
|
||
// ::Any-Lower();
|
||
if c.hasPrefix("\u0130") { // İ
|
||
return c.writeString("i")
|
||
}
|
||
if c.src[c.pSrc] != 'I' {
|
||
return lower(c)
|
||
}
|
||
|
||
// We ignore the lower-case I for now, but insert it later when we know
|
||
// which form we need.
|
||
start := c.pSrc + c.sz
|
||
|
||
i := 0
|
||
Loop:
|
||
// We check for up to n ignorables before \u0307. As \u0307 is an
|
||
// ignorable as well, n is maxIgnorable-1.
|
||
for ; i < maxIgnorable && c.next(); i++ {
|
||
switch c.info.cccType() {
|
||
case cccAbove:
|
||
if c.hasPrefix("\u0307") {
|
||
return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
|
||
}
|
||
done = true
|
||
break Loop
|
||
case cccZero:
|
||
c.unreadRune()
|
||
done = true
|
||
break Loop
|
||
default:
|
||
// We'll write this rune after we know which starter to use.
|
||
}
|
||
}
|
||
if i == maxIgnorable {
|
||
done = true
|
||
}
|
||
return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
|
||
}
|
||
|
||
func nlTitle(c *context) bool {
|
||
// From CLDR:
|
||
// # Special titlecasing for Dutch initial "ij".
|
||
// ::Any-Title();
|
||
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29)
|
||
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;
|
||
if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
|
||
return title(c)
|
||
}
|
||
|
||
if !c.writeString("I") || !c.next() {
|
||
return false
|
||
}
|
||
if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
|
||
return c.writeString("J")
|
||
}
|
||
c.unreadRune()
|
||
return true
|
||
}
|
||
|
||
// Not part of CLDR, but see http://unicode.org/cldr/trac/ticket/7078.
|
||
func afnlRewrite(c *context) {
|
||
if c.hasPrefix("'") || c.hasPrefix("’") {
|
||
c.isMidWord = true
|
||
}
|
||
}
|