// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package old

import (
	"strings"
	"unicode"
	"unicode/utf8"
)

// blankID is the word ID recorded for a "___" fill-in-the-blank wildcard
// found in the text being normalized.
const blankID = -1

// unknownWordID is the word ID recorded for a word that is absent from the
// dictionary when the dictionary is not being updated.
const unknownWordID = -2
// htmlesc unescapes HTML escapes that we've observed,
// especially in Markdown-formatted licenses.
// The replacements must have the same length as the original strings
// to preserve byte offsets, so each replacement is padded with spaces
// to the exact byte length of the entity it stands in for.
var htmlesc = strings.NewReplacer(
	"&ldquo;", "   \"   ", // 7 bytes -> 7 bytes
	"&rdquo;", "   \"   ", // 7 bytes -> 7 bytes
	"&amp;", "  &  ", // 5 bytes -> 5 bytes
)
normalize turns the input byte slice into a slice of normalized words as a document, including the indexes required to recover the original. Normalized text is all lower case, stripped of punctuation and space. The slice of normalized words is a slice of indexes into c.words, which is updated to add new words as needed. Using integer indexes makes the comparison against input texts faster.
func ( *Checker) ( []byte,  bool) *document {
	var  rune
	var  int
	 := 0
	 := toLower()
	 = htmlesc.Replace()
	 := func() {
		,  = utf8.DecodeRuneInString([:])
		 += 
	}
	 := make([]int32, 0, 100)
Each iteration adds a word.
	for  < len() {
		 := 
		const  = "___" // fill in the blank wildcard
		if strings.HasPrefix([:], ) {
			 = append(, blankID)
			 = append(, int32())
			 += len()
			continue
		}
Skip spaces, punctuation, etc. and keep only word characters.
		if !isWordChar() {
			continue
Now at start of word.
		for  < len() {
			()
			if !isWordChar() {
				 -=  // Will skip r next time around.
				break
			}
		}
Is it a list marker? Longest one is maxListMarkerLength bytes: "viii".
			if - > maxListMarkerLength || !isListMarker([:], ) { // If at EOF, r will not be valid punctuation
				 := [:]
				,  := .dict[]
				if ! {
					if  {
						 = int32(len(.words))
						.words = append(.words, )
						.dict[] = 
					} else {
						 = unknownWordID
					}
				}
				 = append(, )
				 = append(, int32())
			}
		}
	}
	return &document{
		text:    ,
		words:   ,
		byteOff: ,
	}
}
// toLower returns a lowercased version of the input, guaranteeing that the
// size remains the same so byte offsets between the slice and the string
// created from it, which will be used to locate words, will line up.
// Invalid UTF-8 bytes are each replaced by '?', and a rune whose lower-case
// form has a different encoded length is left unchanged.
//
// TODO: There is a proposal in Go to provide a UTF-8 handler that would make
// this nicer. Use it if it arrives.
// https://github.com/golang/go/issues/25805
func toLower(b []byte) string {
	var s strings.Builder
	s.Grow(len(b)) // Output is exactly as long as the input.
	for i, wid := 0, 0; i < len(b); i += wid {
		var r rune
		r, wid = utf8.DecodeRune(b[i:])
		if r == utf8.RuneError && wid == 1 {
			// Trouble. Just copy one byte and make it ASCII.
			s.WriteByte('?')
			continue
		}
		l := unicode.ToLower(r)
		if utf8.RuneLen(l) != wid {
			// More trouble. Just use the original.
			l = r
		}
		s.WriteRune(l)
	}
	return s.String()
}
// isWordChar reports whether r is valid in a word. That means it must be a
// letter, although that definition may change. The rune has already been
// case lowered, although that doesn't matter here.
func isWordChar(r rune) bool {
	return unicode.IsLetter(r)
}

// maxListMarkerLength is the byte length of the longest entry in listMarker
// (for example "viii").
const maxListMarkerLength = 4

// listMarker is the set of words that may serve as list markers: the single
// letters a through r and a selection of multi-letter lower-case Roman
// numerals. It is built once at package initialization, which panics if any
// entry is longer than maxListMarkerLength.
var listMarker = func() map[string]bool {
	const allListMarkers = "a b c d e f g h i j k l m n o p q r ii iii iv vi vii viii ix xi xii xiii xiv xv"
	markers := map[string]bool{}
	for _, marker := range strings.Split(allListMarkers, " ") {
		if len(marker) > maxListMarkerLength {
			panic("marker too long")
		}
		markers[marker] = true
	}
	return markers
}()
// isListMarker reports whether s, followed immediately by nextRune, is a
// potential list marker such as "i." or "a)".
func ( string,  rune) bool {
	if !listMarker[] {
		return false
	}
	switch  {
	case '.', ':', ')':
		return true
	}
	return false