// Copyright 2020 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package match defines matching algorithms and support code for the license checker.
package match

import (
	"strings"
	"unicode"
	"unicode/utf8"
)
// A Dict maps words to integer indexes in a word list, of type WordID.
// The zero Dict is an empty dictionary ready for use.
//
// Lookup and Words are read-only operations, safe for any number of
// concurrent calls from multiple goroutines. Insert is a write operation;
// it must not run concurrently with any other call, whether to Insert,
// Lookup, or Words.
type Dict struct {
	dict map[string]WordID // dict maps word to index in list
	list []string          // list of known words
}

// A WordID is the index of a word in a dictionary.
//
// NOTE(review): the underlying type is assumed to be int32. Insert checks
// that the list length survives a round trip through WordID, and the
// sentinels below are negative, so the type must be a signed integer
// narrower than int can be; confirm the exact width against version control.
type WordID int32

// BadWord represents a word not present in the dictionary.
const BadWord WordID = -1

// AnyWord represents a wildcard matching any word.
const AnyWord WordID = -2

// Insert adds the word w to the word list, returning its index.
// If w is already in the word list, it is not added again;
// Insert returns the existing index.
//
// Insert panics with "dictionary too large" if the new index
// cannot be represented as a WordID.
func (d *Dict) Insert(w string) WordID {
	id, ok := d.dict[w]
	if ok {
		return id
	}
	// Allocate the map lazily so that the zero Dict is usable.
	if d.dict == nil {
		d.dict = make(map[string]WordID)
	}
	id = WordID(len(d.list))
	// Converting back detects truncation of the index by WordID.
	if int(id) != len(d.list) {
		panic("dictionary too large")
	}
	d.list = append(d.list, w)
	d.dict[w] = id
	return id
}

// Lookup looks for the word w in the word list and returns its index.
// If w is not in the word list, Lookup returns BadWord.
func (d *Dict) Lookup(w string) WordID {
	id, ok := d.dict[w]
	if !ok {
		return BadWord
	}
	return id
}

// Words returns the current word list.
// The list is not a copy; the caller can read but must not modify the list.
func (d *Dict) Words() []string {
	return d.list
}
A Word represents a single word found in a text.
type Word struct {
	ID WordID
	Lo int32 // Word appears at text[Lo:Hi].
	Hi int32
}
InsertSplit splits text into a sequence of lowercase words, inserting any new words in the dictionary.
func ( *Dict) ( string) []Word {
	return .split(, true)
}
Split splits text into a sequence of lowercase words. It does not add any new words to the dictionary. Unrecognized words are reported as having ID = BadWord.
func ( *Dict) ( string) []Word {
	return .split(, false)
}
// copyright is the canonical word that ©, (c), and &copy; are rewritten to.
// It is shared by every rewritten word and must not be modified.
var copyright = []byte("copyright")

func ( *Dict) ( string,  bool) []Word {
	var  []byte
	var  []Word
	 := 
	for  != "" {
		var  []byte
		var ,  int32
		{
			switch [0] {
			case '<':
				if  := htmlTagSize();  > 0 {
					 = [:]
					continue
				}
			case '{':
				if  := markdownAnchorSize();  > 0 {
					 = [:]
					continue
				}
Assume HTML entity is punctuation.
				if  := htmlEntitySize();  > 0 {
					if [:] == "&copy;" {
						 = int32(len() - len())
						 =  + int32()
						 = copyright
						 = [:]
						goto 
					}
					 = [:]
					continue
				}
			}
			if len() >= 2 && [0] == ']' && [1] == '(' {
				if  := markdownLinkSize();  > 0 {
					 = [:]
					continue
				}
			}

			,  := utf8.DecodeRuneInString()
			if !isWordStart() {
				 = [:]
				continue
			}
			 = appendFoldRune([:0], )
Scan whole word (except © which is already a word by itself, even when it appears next to other text, like ©1996).
			 = int32(len() - len())
			if  != '©' {
				for  < len() {
					,  := utf8.DecodeRuneInString([:])
					if !isWordContinue() {
						break
					}
					 += 
					 = appendFoldRune(, )
				}
Read "notice(s)" as "notices" and let spell-check accept "notice" too.
					 = append(, 's')
					 += 3
				}
			}
			 =  + int32()
			 = [:]

			 = 
Special case rewrites suggested by SPDX.
			switch {
"https" -> "http".
				 = [:4]

			case string() == "c" &&  > 0 && [-1] == '(' && int() < len() && [] == ')':
				 = copyright
				--
				++

			case string() == "©":
				 = copyright
			}
More of our own.
			for ,  := range canonicalRewrites {
				if string() == .y {
					 = append([:0], .x...)
				}
			}
		}

	:
		,  := .dict[string()]
		if  {
Treat "Copyright ©" as a single "copyright" instead of two.
				continue
			}
			 = append(, Word{, , })
			continue
		}

		if  {
			 = append(, Word{.Insert(string()), , })
			continue
		}
Unknown word
		 = append(, Word{BadWord, , })
	}

	return 
}
// foldRune returns the folded rune r.
// It returns -1 if the rune r should be omitted entirely.
//
// Folding can be any canonicalizing transformation we want.
// For now folding means:
//   - fold to consistent case (unicode.SimpleFold, but moving ASCII
//     to lower-case afterward)
//   - return -1 for (drop) combining grave and acute U+0300, U+0301
//   - strip pre-combined graves and acutes on vowels:
//     é to e, etc. (for Canadian or European licenses mentioning Québec
//     or Commissariat à l'Energie Atomique)
//   - return -1 for (drop) parentheses, as in (c) or notice(s)
//
// If necessary we could do a full Unicode-based conversion,
// but that will require more thought about exactly what to do
// and doing it efficiently. For now, the accents are enough.
func foldRune(r rune) rune {
	// Follow SimpleFold for as long as it yields a smaller rune.
	// For the letters we care about this ends at the upper case rune
	// (or leaves a lower case ASCII letter untouched, which is fine
	// because ASCII is lowered below anyway).
	for {
		r1 := unicode.SimpleFold(r)
		if r1 >= r {
			break
		}
		r = r1
	}

	switch r {
	case '\u0300', '\u0301':
		// Combining grave and acute: drop, so that a decomposed
		// accented vowel folds the same way as the pre-combined form.
		return -1
	case 'Á', 'À':
		return 'a'
	case 'É', 'È':
		return 'e'
	case 'Í', 'Ì':
		return 'i'
	case 'Ó', 'Ò':
		return 'o'
	case 'Ú', 'Ù':
		return 'u'
	}

	if 'A' <= r && r <= 'Z' {
		r += 'a' - 'A'
	}

	// delete ( ) in (c) or notice(s)
	if r == '(' || r == ')' {
		return -1
	}

	return r
}

// toFold converts s to folded form.
func toFold(s string) string {
	buf := make([]byte, 0, len(s))
	for _, r := range s {
		buf = appendFoldRune(buf, r)
	}
	return string(buf)
}

// appendFoldRune appends foldRune(r) to buf and returns the updated buffer.
// If foldRune reports that r should be omitted, buf is returned unchanged.
func appendFoldRune(buf []byte, r rune) []byte {
	r = foldRune(r)
	if r < 0 {
		return buf
	}
	if r < utf8.RuneSelf {
		// Fast path: ASCII is a single byte.
		return append(buf, byte(r))
	}

	var enc [utf8.UTFMax]byte
	n := utf8.EncodeRune(enc[:], r)
	return append(buf, enc[:n]...)
}
// isWordStart reports whether r can appear at the start of a word:
// a letter, a digit, or the copyright sign.
func isWordStart(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsDigit(r) || r == '©'
}
// isWordContinue reports whether r can appear in a word, after the start:
// a letter, a digit, or a nonspacing mark (Unicode category Mn),
// such as a combining accent.
func isWordContinue(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsDigit(r) || unicode.Is(unicode.Mn, r)
}
// htmlTagSize returns the length of the HTML tag at the start of t, or else 0.
//
// A tag is '<', an optional '/', an ASCII letter, and then everything up to
// and including the next '>'. To avoid swallowing text that merely looks
// like a tag, the following are rejected:
//   - an '@' before the first space, as in <me@example.com>
//   - a ":/" before the first space, as in <http://example.com>
//   - a nested '<'
//   - more than two carriage return or line feed bytes inside the tag
func htmlTagSize(t string) int {
	if len(t) < 3 || t[0] != '<' {
		return 0
	}
	i := 1
	if t[i] == '/' {
		i++
	}
	if !('A' <= t[i] && t[i] <= 'Z' || 'a' <= t[i] && t[i] <= 'z') {
		return 0
	}
	space := false // whether a space has been seen inside the tag
	nl := 0        // number of \r and \n bytes seen inside the tag
	for ; i < len(t); i++ {
		switch t[i] {
		case '@':
			// Keep <me@example.com>
			if !space {
				return 0
			}
		case ':':
			// Keep <http://example.com>
			if !space && i+1 < len(t) && t[i+1] == '/' {
				return 0
			}
		case '\r', '\n':
			if nl++; nl > 2 {
				return 0
			}
		case '<':
			return 0
		case '>':
			return i + 1
		case ' ':
			space = true
		}
	}
	return 0
}
// htmlEntitySize returns the length of the HTML entity expression
// at the start of t, or else 0.
//
// Three forms are recognized, each requiring at least one character
// between the prefix and the terminating ';':
//   - &#xHEX; (hexadecimal digits; the marker may be 'x' or 'X')
//   - &#DECIMAL;
//   - &name; (ASCII letters only)
func htmlEntitySize(t string) int {
	if len(t) < 3 || t[0] != '&' {
		return 0
	}
	if t[1] == '#' {
		// NOTE(review): this condition was missing from the damaged source.
		// HTML allows both "x" and "X" as the hexadecimal marker, so both
		// are accepted here; confirm against version control.
		if t[2] == 'x' || t[2] == 'X' {
			// &#xHEX;
			i := 3
			for i < len(t) && ('0' <= t[i] && t[i] <= '9' || 'A' <= t[i] && t[i] <= 'F' || 'a' <= t[i] && t[i] <= 'f') {
				i++
			}
			if i > 3 && i < len(t) && t[i] == ';' {
				return i + 1
			}
			return 0
		}
		// &#DECIMAL;
		i := 2
		for i < len(t) && '0' <= t[i] && t[i] <= '9' {
			i++
		}
		if i > 2 && i < len(t) && t[i] == ';' {
			return i + 1
		}
		return 0
	}

	// &name;
	i := 1
	for i < len(t) && ('A' <= t[i] && t[i] <= 'Z' || 'a' <= t[i] && t[i] <= 'z') {
		i++
	}
	if i > 1 && i < len(t) && t[i] == ';' {
		return i + 1
	}
	return 0
}
// markdownAnchorSize returns the length of the Markdown anchor
// at the start of t, or else 0. (like {#head})
//
// The anchor must start with "{#" and reach its closing '}' before
// any space, carriage return, or line feed.
func markdownAnchorSize(t string) int {
	if len(t) < 4 || t[0] != '{' || t[1] != '#' {
		return 0
	}
	for i := 2; i < len(t); i++ {
		switch t[i] {
		case '}':
			return i + 1
		case ' ', '\r', '\n':
			return 0
		}
	}
	return 0
}

// markdownLinkPrefixes lists the prefixes a Markdown link target
// must start with for markdownLinkSize to recognize it.
var markdownLinkPrefixes = []string{
	"http://",
	"https://",
	"mailto:",
	"file:",
	"#",
}

// markdownLinkSize returns the length of the Markdown link target
// at the start of t, or else 0.
// Instead of fully parsing Markdown, this looks for "](" followed by
// one of markdownLinkPrefixes (such as ](http:// or ](https://) and then
// a closing ')' with no space, tab, carriage return, or line feed before it.
func markdownLinkSize(t string) int {
	if len(t) < 2 || t[0] != ']' || t[1] != '(' {
		return 0
	}
	ok := false
	for _, prefix := range markdownLinkPrefixes {
		if strings.HasPrefix(t[2:], prefix) {
			ok = true
			break
		}
	}
	if !ok {
		return 0
	}

	for i := 2; i < len(t); i++ {
		c := t[i]
		if c == ' ' || c == '\t' || c == '\r' || c == '\n' {
			return 0
		}
		if c == ')' {
			return i + 1
		}
	}
	return 0
}
// canonicalRewrites is a list of pairs that are canonicalized during word splitting.
// The words on the right are parsed as if they were the words on the left.
// This happens during dictionary splitting, so canMisspell will never see
// any of the words on the right.
var canonicalRewrites = []struct {
	x, y string
}{
	{"is", "are"},
	{"it", "them"},
	{"it", "they"},
	{"the", "these"},
	{"the", "this"},
	{"the", "those"},
	{"copy", "copies"}, // most plurals are handled as 1-letter typos