Source: map.go in package golang.org/x/text/cases


package cases

This file contains the definitions of case mappings for all supported languages. The rules for the language-specific tailorings were taken and modified from the CLDR transform definitions in common/transforms.


import (
	"strings"
	"unicode"
	"unicode/utf8"

	"golang.org/x/text/internal"
	"golang.org/x/text/language"
	"golang.org/x/text/transform"
	"golang.org/x/text/unicode/norm"
)

A mapFunc takes a context set to the current rune and writes the mapped version to the same context. It may advance the context to the next rune. It returns whether a checkpoint is possible: whether the pDst bytes written to dst so far won't need changing as we see more source bytes.

type mapFunc func(*context) bool

A spanFunc takes a context set to the current rune and returns whether this rune would be altered when written to the output. It may advance the context to the next rune. It returns whether a checkpoint is possible.

type spanFunc func(*context) bool

maxIgnorable defines the maximum number of ignorables to consider for lookahead operations.

const maxIgnorable = 30

supported lists the language tags for which we have tailorings.

// NOTE: the tags are listed in the same order as the "// und" ... "// tr" rows
// of the upperFunc, lowerFunc and titleInfos tables in this file, which are
// indexed with the value returned by matcher.Match. Presumably that value is
// the position of the matched tag in this list — confirm against
// internal.InheritanceMatcher before reordering or adding a language.
const supported = "und af az el lt nl tr"

func init() {
	tags := []language.Tag{}
	for _, s := range strings.Split(supported, " ") {
		tags = append(tags, language.MustParse(s))
	}
	matcher = internal.NewInheritanceMatcher(tags)
	Supported = language.NewCoverage(tags)
}

var (
	matcher *internal.InheritanceMatcher

	Supported language.Coverage

We keep the following lists separate, instead of having a single per- language struct, to give the compiler a chance to remove unused code.

Some uppercase mappers are stateless, so we can precompute the Transformers and save a bit on runtime allocations.

	upperFunc = []struct {
		upper mapFunc
		span  spanFunc
	}{
		{nil, nil},                  // und
		{nil, nil},                  // af
		{aztrUpper(upper), isUpper}, // az
		{elUpper, noSpan},           // el
		{ltUpper(upper), noSpan},    // lt
		{nil, nil},                  // nl
		{aztrUpper(upper), isUpper}, // tr
	}

	undUpper            transform.SpanningTransformer = &undUpperCaser{}
	undLower            transform.SpanningTransformer = &undLowerCaser{}
	undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{}

	lowerFunc = []mapFunc{
		nil,       // und
		nil,       // af
		aztrLower, // az
		nil,       // el
		ltLower,   // lt
		nil,       // nl
		aztrLower, // tr
	}

	titleInfos = []struct {
		title     mapFunc
		lower     mapFunc
		titleSpan spanFunc
		rewrite   func(*context)
	}{
		{title, lower, isTitle, nil},                // und
		{title, lower, isTitle, afnlRewrite},        // af
		{aztrUpper(title), aztrLower, isTitle, nil}, // az
		{title, lower, isTitle, nil},                // el
		{ltUpper(title), ltLower, noSpan, nil},      // lt
		{nlTitle, lower, nlTitleSpan, afnlRewrite},  // nl
		{aztrUpper(title), aztrLower, isTitle, nil}, // tr
	}
)

// makeUpper returns the upper-casing transformer for the language matched by
// t. Languages whose upperFunc entry has no mapper share the precomputed
// undUpper; the others get a fresh simpleCaser. The options are not consulted.
func makeUpper(t language.Tag, o options) transform.SpanningTransformer {
	_, idx, _ := matcher.Match(t)
	if entry := upperFunc[idx]; entry.upper != nil {
		return &simpleCaser{f: entry.upper, span: entry.span}
	}
	return undUpper
}

// makeLower returns the lower-casing transformer for the language matched by
// t. When o.ignoreFinalSigma is set no final-sigma handling is installed;
// otherwise the language mapper is wrapped with finalSigma for runes that
// follow the first cased rune of a word.
func makeLower(t language.Tag, o options) transform.SpanningTransformer {
	_, idx, _ := matcher.Match(t)
	mapper := lowerFunc[idx]
	switch {
	case mapper == nil && o.ignoreFinalSigma:
		return undLowerIgnoreSigma
	case mapper == nil:
		return undLower
	case o.ignoreFinalSigma:
		return &simpleCaser{f: mapper, span: isLower}
	default:
		return &lowerCaser{first: mapper, midWord: finalSigma(mapper)}
	}
}

// makeTitle returns the title-casing transformer for the language matched by
// t. With o.noLower the non-initial runes are copied verbatim; otherwise they
// are lower-cased, with final-sigma handling unless o.ignoreFinalSigma is set.
func makeTitle(t language.Tag, o options) transform.SpanningTransformer {
	_, idx, _ := matcher.Match(t)
	entry := &titleInfos[idx]

	rest := entry.lower
	switch {
	case o.noLower:
		rest = (*context).copy
	case !o.ignoreFinalSigma:
		rest = finalSigma(rest)
	}

	return &titleCaser{
		title:     entry.title,
		lower:     rest,
		titleSpan: entry.titleSpan,
		rewrite:   entry.rewrite,
	}
}

// noSpan is the spanFunc used for mappers that have no span implementation
// (see the el and lt rows of the tables above). It records
// transform.ErrEndOfSpan in the context and returns false, so a Span loop
// using it stops at the first rune.
func noSpan(c *context) bool {
	c.err = transform.ErrEndOfSpan
	return false
}

TODO: consider a similar special case for the fast majority lower case. This is a bit more involved so will require some more precise benchmarking to justify it.


type undUpperCaser struct{ transform.NopResetter }

undUpperCaser implements the Transformer interface for doing an upper case mapping for the root locale (und). It eliminates the need for an allocation as it prevents escaping by not using function pointers.

// Transform applies the root-locale upper mapping to every rune of src,
// recording a checkpoint after each one. The boolean result of upper is not
// inspected here.
func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	cx := context{
		dst:   dst,
		src:   src,
		atEOF: atEOF,
	}
	for cx.next() {
		upper(&cx)
		cx.checkpoint()
	}
	return cx.ret()
}

// Span advances over src for as long as isUpper reports true, recording a
// checkpoint after each accepted rune.
func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) {
	cx := context{
		src:   src,
		atEOF: atEOF,
	}
	for cx.next() && isUpper(&cx) {
		cx.checkpoint()
	}
	return cx.retSpan()
}

undLowerIgnoreSigmaCaser implements the Transformer interface for doing a lower case mapping for the root locale (und) ignoring final sigma handling. This casing algorithm is used in some performance-critical packages like secure/precis and x/net/http/idna, which warrants its special-casing.

type undLowerIgnoreSigmaCaser struct{ transform.NopResetter }

// Transform applies the plain lower mapping rune by rune and stops as soon as
// either next or lower reports false; a checkpoint is recorded after every
// successfully mapped rune.
func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	cx := context{
		dst:   dst,
		src:   src,
		atEOF: atEOF,
	}
	for cx.next() && lower(&cx) {
		cx.checkpoint()
	}
	return cx.ret()
}

Span implements a generic lower-casing. This is possible as isLower works for all lowercasing variants. All lowercase variants only vary in how they transform a non-lowercase letter. They will never change an already lowercase letter. In addition, there is no state.

// Span advances over src for as long as isLower reports true, recording a
// checkpoint after each accepted rune.
func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) {
	cx := context{
		src:   src,
		atEOF: atEOF,
	}
	for cx.next() && isLower(&cx) {
		cx.checkpoint()
	}
	return cx.retSpan()
}

type simpleCaser struct {
	// NOTE(review): the embedded context is not referenced by the Transform
	// and Span methods of this type, which each build a local context.
	// Presumably it is embedded for its method set (e.g. Reset) — confirm.
	context
	// f is invoked once per rune by Transform; span is invoked once per rune
	// by Span.
	f    mapFunc
	span spanFunc
}

simpleCaser implements the Transformer interface for doing a case operation on a rune-by-rune basis.

// Transform calls t.f for every rune of src and stops at the first rune for
// which either next or t.f reports false; a checkpoint is recorded after each
// successful mapping.
func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	cx := context{
		dst:   dst,
		src:   src,
		atEOF: atEOF,
	}
	for cx.next() && t.f(&cx) {
		cx.checkpoint()
	}
	return cx.ret()
}

// Span advances over src for as long as t.span reports true, recording a
// checkpoint after each accepted rune.
func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) {
	cx := context{
		src:   src,
		atEOF: atEOF,
	}
	for cx.next() && t.span(&cx) {
		cx.checkpoint()
	}
	return cx.retSpan()
}

undLowerCaser implements the Transformer interface for doing a lower case mapping for the root locale (und), including Greek final sigma handling: a capital sigma that follows the first cased rune of a word is mapped by finalSigmaBody instead of the plain lower mapping. Like undUpperCaser, it is special-cased so that it does not need function pointers.

type undLowerCaser struct{ transform.NopResetter }

// Transform lower-cases src into dst for the root locale. It tracks whether
// the current position is inside a word: the first cased rune of a word is
// mapped with lower, later runes are mapped with lower unless they are a
// capital sigma, which is handled by finalSigmaBody. Runes that are neither
// cased nor case-ignorable are copied and end the word.
func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	c := context{dst: dst, src: src, atEOF: atEOF}

	inWord := false
	for c.next() {
		var ok bool
		switch {
		case inWord && c.info.isNotCasedAndNotCaseIgnorable():
			// Word boundary: copy the rune and leave the word.
			if ok = c.copy(); ok {
				inWord = false
			}
		case inWord && c.hasPrefix("Σ"):
			ok = finalSigmaBody(&c)
		case inWord:
			ok = lower(&c)
		case c.info.isCased():
			// First cased rune of a word.
			if ok = lower(&c); ok {
				inWord = true
			}
		default:
			ok = c.copy()
		}
		if !ok {
			// Stop before the checkpoint so a failed write is not committed.
			break
		}
		c.checkpoint()
	}
	return c.ret()
}

// Span advances over src for as long as isLower reports true, recording a
// checkpoint after each accepted rune.
func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) {
	cx := context{
		src:   src,
		atEOF: atEOF,
	}
	for cx.next() && isLower(&cx) {
		cx.checkpoint()
	}
	return cx.retSpan()
}

lowerCaser implements the Transformer interface. The default Unicode lower casing requires different treatment for the first and subsequent characters of a word, most notably to handle the Greek final Sigma.

type lowerCaser struct {
	// NOTE(review): presumably embedded so that lowerCaser inherits the Span
	// method of undLowerIgnoreSigmaCaser — confirm.
	undLowerIgnoreSigmaCaser

	// context is reinitialised at the start of every Transform call.
	context

	// first maps the first cased rune of a word; midWord maps every later
	// rune of the word that is cased or case-ignorable.
	first, midWord mapFunc
}

// Transform lower-cases src into dst. The first cased rune of each word is
// mapped with t.first and the following cased or case-ignorable runes with
// t.midWord; any other rune is copied, and ends the word if one was open.
func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	t.context = context{dst: dst, src: src, atEOF: atEOF}
	c := &t.context

	inWord := false
	for c.next() {
		var ok bool
		switch {
		case inWord && c.info.isNotCasedAndNotCaseIgnorable():
			// Word boundary: copy the rune and leave the word.
			if ok = c.copy(); ok {
				inWord = false
			}
		case inWord:
			ok = t.midWord(c)
		case c.info.isCased():
			// First cased rune of a word.
			if ok = t.first(c); ok {
				inWord = true
			}
		default:
			ok = c.copy()
		}
		if !ok {
			// Stop before the checkpoint so a failed write is not committed.
			break
		}
		c.checkpoint()
	}
	return c.ret()
}

titleCaser implements the Transformer interface. Title casing algorithms distinguish between the first letter of a word and subsequent letters of the same word. It uses state to avoid requiring a potentially infinite lookahead.

type titleCaser struct {
	context

rune mappings used by the actual casing algorithms.

	title     mapFunc
	lower     mapFunc
	titleSpan spanFunc

	rewrite func(*context)
}

Transform implements the standard Unicode title case algorithm as defined in Chapter 3 of The Unicode Standard: toTitlecase(X): Find the word boundaries in X according to Unicode Standard Annex #29, "Unicode Text Segmentation." For each word boundary, find the first cased character F following the word boundary. If F exists, map F to Titlecase_Mapping(F); then map all characters C between F and the following word boundary to Lowercase_Mapping(C).

func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
	t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.ret()
	}

	for {
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

Break out of this loop on failure to ensure we do not modify the state incorrectly.

		if p.isCased() {
			if !c.isMidWord {
				if !t.title(c) {
					break
				}
				c.isMidWord = true
			} else if !t.lower(c) {
				break
			}
		} else if !c.copy() {
			break
		} else if p.isBreak() {
			c.isMidWord = false
		}

As we save the state of the transformer, it is safe to call checkpoint after any successful write.

		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.ret()
}

func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) {
	t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord}
	c := &t.context

	if !c.next() {
		return c.retSpan()
	}

	for {
		p := c.info
		if t.rewrite != nil {
			t.rewrite(c)
		}

Break out of this loop on failure to ensure we do not modify the state incorrectly.

		if p.isCased() {
			if !c.isMidWord {
				if !t.titleSpan(c) {
					break
				}
				c.isMidWord = true
			} else if !isLower(c) {
				break
			}
		} else if p.isBreak() {
			c.isMidWord = false

As we save the state of the transformer, it is safe to call checkpoint after any successful write.

		if !(c.isMidWord && wasMid) {
			c.checkpoint()
		}

		if !c.next() {
			break
		}
		if wasMid && c.info.isMid() {
			c.isMidWord = false
		}
	}
	return c.retSpan()
}

finalSigma adds Greek final Sigma handling to another casing function. It determines whether a lowercased sigma should be σ or ς, by looking ahead for case-ignorables and a cased letter.

// finalSigma wraps f so that a rune starting with "Σ" is handled by
// finalSigmaBody, while every other rune is passed through to f.
func finalSigma(f mapFunc) mapFunc {
	return func(c *context) bool {
		if c.hasPrefix("Σ") {
			return finalSigmaBody(c)
		}
		return f(c)
	}
}

Current rune must be ∑.

::NFD(); # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA Σ } [:case-ignorable:]* [:cased:] → σ; [:cased:] [:case-ignorable:]* { Σ → ς; ::Any-Lower; ::NFC();


	p := c.pDst
	c.writeString("ς")

TODO: we should do this here, but right now this will never have an effect as this is called when the prefix is Sigma, whereas Dutch and Afrikaans only test for an apostrophe. if t.rewrite != nil { t.rewrite(c) }

We need to do one more iteration after maxIgnorable, as a cased letter is not an ignorable and may modify the result.

	wasMid := false
	for i := 0; i < maxIgnorable+1; i++ {
		if !c.next() {
			return false
		}

All Midword runes are also case ignorable, so we are guaranteed to have a letter or word break here. As we are unreading the run, there is no need to unset c.isMidWord; the title caser will handle this.

p+1 is guaranteed to be in bounds: if writing ς was successful, p+1 will contain the second byte of ς. If not, this function will have returned after c.next returned false.

				c.dst[p+1]++ // ς → σ
			}
			c.unreadRune()
			return true

A case ignorable may also introduce a word break, so we may need to continue searching even after detecting a break.

		isMid := c.info.isMid()
		if (wasMid && isMid) || c.info.isBreak() {
			c.isMidWord = false
		}
		wasMid = isMid
		c.copy()
	}
	return true
}

finalSigmaSpan would be the same as isLower.

elUpper implements Greek upper casing, which entails removing a predefined set of non-blocked modifiers. Note that these accents should not be removed for title casing! Example: "Οδός" -> "ΟΔΟΣ".

From CLDR: [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ;


	r, _ := utf8.DecodeRune(c.src[c.pSrc:])
	oldPDst := c.pDst
	if !upper(c) {
		return false
	}
	if !unicode.Is(unicode.Greek, r) {
		return true
	}

Take the properties of the uppercased rune that is already written to the destination. This saves us the trouble of having to uppercase the decomposed rune again.

Restore the destination position and process the decomposed rune.

		r, sz := utf8.DecodeRune(b)
		if r <= 0xFF { // See A.6.1
			return true
		}

Insert the first rune and ignore the modifiers. See A.6.2.

		c.writeBytes(b[:sz])
		i = len(b[sz:]) / 2 // Greek modifiers are always of length 2.
	}

	for ; i < maxIgnorable && c.next(); i++ {

Above and Iota Subscript

		case 0x0300, // U+0300 COMBINING GRAVE ACCENT
			0x0301, // U+0301 COMBINING ACUTE ACCENT
			0x0304, // U+0304 COMBINING MACRON
			0x0306, // U+0306 COMBINING BREVE
			0x0308, // U+0308 COMBINING DIAERESIS
			0x0313, // U+0313 COMBINING COMMA ABOVE
			0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE
			0x0342, // U+0342 COMBINING GREEK PERISPOMENI

No-op. Gobble the modifier.


		default:
			switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() {
			case cccZero:
				c.unreadRune()
				return true

We don't need to test for IotaSubscript as the only rune that qualifies (U+0345) was already excluded in the switch statement above. See A.4.


			case cccAbove:
				return c.copy()

Some other modifier. We're still allowed to gobble Greek modifiers after this.

				c.copy()
			}
		}
	}
	return i == maxIgnorable
}

TODO: implement elUpperSpan (low-priority: complex and infrequent).

From CLDR: # Introduce an explicit dot above when lowercasing capital I's and J's # whenever there are more accents above. # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE ::NFD(); I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307; I \u0300 (Ì) → i \u0307 \u0300; I \u0301 (Í) → i \u0307 \u0301; I \u0303 (Ĩ) → i \u0307 \u0303; ::Any-Lower(); ::NFC();


	i := 0
	if r := c.src[c.pSrc]; r < utf8.RuneSelf {
		lower(c)
		if r != 'I' && r != 'J' {
			return true
		}
	} else {
		p := norm.NFD.Properties(c.src[c.pSrc:])

UTF-8 optimization: the decomposition will only have an above modifier if the last rune of the decomposition is in [U+300-U+311]. In all other cases, a decomposition starting with I is always an I followed by modifiers that are not cased themselves. See A.2.

			if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4.
				if !c.writeBytes(d[:1]) {
					return false
				}
				c.dst[c.pDst-1] += 'a' - 'A' // lower

Assumption: modifier never changes on lowercase. See A.1. Assumption: all modifiers added have CCC = Above. See A.2.3.

				return c.writeString("\u0307") && c.writeBytes(d[1:])

In all other cases the additional modifiers will have a CCC that is less than 230 (Above). We will insert the U+0307, if needed, after these modifiers so that a string in FCD form will remain so. See A.2.2.

			lower(c)
			i = 1
		} else {
			return lower(c)
		}
	}

	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccZero:
			c.unreadRune()
			return true
		case cccAbove:
			return c.writeString("\u0307") && c.copy() // See A.1.
		default:
			c.copy() // See A.1.
		}
	}
	return i == maxIgnorable
}

ltLowerSpan would be the same as isLower.


func ltUpper(f mapFunc) mapFunc {

Unicode: 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE From CLDR: # Remove \u0307 following soft-dotteds (i, j, and the like), with possible # intervening non-230 marks. ::NFD(); [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; ::Any-Upper(); ::NFC();

TODO: See A.5. A soft-dotted rune never has an exception. This would allow us to overload the exception bit and encode this property in info. Need to measure performance impact of this.

		r, _ := utf8.DecodeRune(c.src[c.pSrc:])
		oldPDst := c.pDst
		if !f(c) {
			return false
		}
		if !unicode.Is(unicode.Soft_Dotted, r) {
			return true
		}

We don't need to do an NFD normalization, as a soft-dotted rune never contains U+0307. See A.3.


		i := 0
		for ; i < maxIgnorable && c.next(); i++ {
			switch c.info.cccType() {
			case cccZero:
				c.unreadRune()
				return true
			case cccAbove:

We don't do a full NFC, but rather combine runes for some of the common cases. (Returning NFC or preserving normal form is neither a requirement nor a possibility anyway).

					if !c.next() {
						return false
					}
					if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc {
						s := ""
						switch c.src[c.pSrc+1] {
						case 0x80: // U+0300 COMBINING GRAVE ACCENT
							s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE
						case 0x81: // U+0301 COMBINING ACUTE ACCENT
							s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE
						case 0x83: // U+0303 COMBINING TILDE
							s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE
						case 0x88: // U+0308 COMBINING DIAERESIS
							s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS
						default:
						}
						if s != "" {
							c.pDst = oldPDst
							return c.writeString(s)
						}
					}
				}
				return c.copy()
			default:
				c.copy()
			}
		}
		return i == maxIgnorable
	}
}

TODO: implement ltUpperSpan (low priority: complex and infrequent).


func aztrUpper(f mapFunc) mapFunc {

i→İ;

		if c.src[c.pSrc] == 'i' {
			return c.writeString("İ")
		}
		return f(c)
	}
}

From CLDR: # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE İ→i; # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. # This matches the behavior of the canonically equivalent I-dot_above # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; I→ı ; ::Any-Lower();

	if c.hasPrefix("\u0130") { // İ
		return c.writeString("i")
	}
	if c.src[c.pSrc] != 'I' {
		return lower(c)
	}

We ignore the lower-case I for now, but insert it later when we know which form we need.

	start := c.pSrc + c.sz

	i := 0

We check for up to n ignorables before \u0307. As \u0307 is an ignorable as well, n is maxIgnorable-1.

	for ; i < maxIgnorable && c.next(); i++ {
		switch c.info.cccType() {
		case cccAbove:
			if c.hasPrefix("\u0307") {
				return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307
			}
			done = true
			break Loop
		case cccZero:
			c.unreadRune()
			done = true
			break Loop

We'll write this rune after we know which starter to use.

		}
	}
	if i == maxIgnorable {
		done = true
	}
	return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done
}

aztrLowerSpan would be the same as isLower.

From CLDR: # Special titlecasing for Dutch initial "ij". ::Any-Title(); # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ;

	if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' {
		return title(c)
	}

	if !c.writeString("I") || !c.next() {
		return false
	}
	if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' {
		return c.writeString("J")
	}
	c.unreadRune()
	return true
}

	if c.src[c.pSrc] != 'I' {
		return isTitle(c)
	}
	if !c.next() || c.src[c.pSrc] == 'j' {
		return false
	}
	if c.src[c.pSrc] != 'J' {
		c.unreadRune()
	}
	return true
}

Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078.

func afnlRewrite(c *context) {
	if c.hasPrefix("'") || c.hasPrefix("’") {
		c.isMidWord = true
	}