package chroma

import (
	"fmt"
	"strings"
)
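
// The import block below is used only by the illustrative sketches
// added further down in this file; it is not part of the original
// source (Go permits multiple import declarations).
import (
	"regexp"
	"sort"
)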
    
var (
	defaultOptions = &TokeniseOptions{
		State:    "root",
		EnsureLF: true,
	}
)

// Config for a lexer.
type Config struct {
	// Name of the lexer.
	Name string `xml:"name,omitempty"`

	// Shortcuts for the lexer
	Aliases []string `xml:"alias,omitempty"`

	// File name globs
	Filenames []string `xml:"filename,omitempty"`

	// Secondary file name globs
	AliasFilenames []string `xml:"alias_filename,omitempty"`

	// MIME types
	MimeTypes []string `xml:"mime_type,omitempty"`

	// Regex matching is case-insensitive.
	CaseInsensitive bool `xml:"case_insensitive,omitempty"`

	// Regex matches all characters.
	DotAll bool `xml:"dot_all,omitempty"`

	// Regex does not match across lines ($ matches EOL).
	//
	// Defaults to multiline.
	NotMultiline bool `xml:"not_multiline,omitempty"`

	// Don't strip leading and trailing newlines from the input.
	// DontStripNL bool

	// Strip all leading and trailing whitespace from the input
	// StripAll bool

	// Make sure that the input ends with a newline. This
	// is required for some lexers that consume input linewise.
	EnsureNL bool `xml:"ensure_nl,omitempty"`

	// If given and greater than 0, expand tabs in the input.
	// TabSize int

	// Priority of lexer.
	//
	// If this is 0 it will be treated as a default of 1.
	Priority float32 `xml:"priority,omitempty"`
    
	// Analyse holds a list of regexes to match against the input.
	//
	// If a match is found, its score is returned immediately if the First
	// attribute is set to true; otherwise the scores of all matching
	// patterns are summed to produce the final score.
	Analyse *AnalyseConfig `xml:"analyse,omitempty"`
}
    
// AnalyseConfig defines a list of regex analysers for a lexer.
type AnalyseConfig struct {
	Regexes []RegexConfig `xml:"regex,omitempty"`
	// If true, the score of the first matching regex is returned.
	First bool `xml:"first,attr"`
}
    
// RegexConfig defines a single regex pattern and its score in case of a match.
type RegexConfig struct {
	Pattern string  `xml:"pattern,attr"`
	Score   float32 `xml:"score,attr"`
}
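
// scoreAnalyse is an illustrative sketch, not part of the original API:
// it shows one way the scoring semantics described on Config.Analyse
// could be evaluated. If First is set, the score of the first matching
// pattern is returned; otherwise the scores of all matching patterns
// are summed. The function name is hypothetical, and the real lexers
// may compile and match patterns differently.
func scoreAnalyse(config *AnalyseConfig, text string) float32 {
	var total float32
	for _, rc := range config.Regexes {
		re, err := regexp.Compile(rc.Pattern)
		if err != nil {
			continue // skip invalid patterns in this sketch
		}
		if re.MatchString(text) {
			if config.First {
				return rc.Score
			}
			total += rc.Score
		}
	}
	return total
}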
    
// Token output to formatter.
type Token struct {
	Type  TokenType `json:"type"`
	Value string    `json:"value"`
}

func (t *Token) String() string   { return t.Value }
func (t *Token) GoString() string { return fmt.Sprintf("&Token{%s, %q}", t.Type, t.Value) }

// Clone returns a clone of the Token.
func (t *Token) Clone() Token {
	return *t
}

// EOF is returned by lexers at the end of input.
var EOF Token

// TokeniseOptions contains options for tokenisers.
type TokeniseOptions struct {
	// State to start tokenisation in. Defaults to "root".
	State string
	// Nested tokenisation.
	Nested bool

	// If true, all EOLs are converted into LF by replacing CRLF and CR.
	EnsureLF bool
}
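
// normaliseEOLs is an illustrative sketch, not part of the original
// API, of the normalisation implied by TokeniseOptions.EnsureLF: CRLF
// and bare CR line endings are both rewritten to LF. The helper name
// is hypothetical.
func normaliseEOLs(text string) string {
	text = strings.ReplaceAll(text, "\r\n", "\n")
	return strings.ReplaceAll(text, "\r", "\n")
}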
    
// A Lexer for tokenising source code.
type Lexer interface {
	// Config describing the features of the Lexer.
	Config() *Config
	// Tokenise returns an Iterator over tokens in text.
	Tokenise(options *TokeniseOptions, text string) (Iterator, error)
	// SetRegistry sets the registry this Lexer is associated with.
	//
	// The registry should be used by the Lexer if it needs to look up other
	// lexers.
	SetRegistry(registry *LexerRegistry) Lexer
	// SetAnalyser sets a function the Lexer should use for scoring how
	// likely a fragment of text is to match this lexer, between 0.0 and 1.0.
	// A value of 1 indicates high confidence.
	//
	// Lexers may ignore this if they implement their own analysers.
	SetAnalyser(analyser func(text string) float32) Lexer
	// AnalyseText scores how likely a fragment of text is to match
	// this lexer, between 0.0 and 1.0. A value of 1 indicates high confidence.
	AnalyseText(text string) float32
}
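
// printTokens is an illustrative usage sketch, not part of the original
// API: it drains a Lexer's Iterator and prints each token. Passing nil
// options is assumed to fall back to defaultOptions, and iteration is
// assumed to stop at EOF, the zero Token returned once input is
// exhausted.
func printTokens(lexer Lexer, source string) error {
	it, err := lexer.Tokenise(nil, source)
	if err != nil {
		return err
	}
	for token := it(); token != EOF; token = it() {
		fmt.Printf("%s %q\n", token.Type, token.Value)
	}
	return nil
}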
    
// Lexers is a slice of lexers sortable by name.
type Lexers []Lexer

func (l Lexers) Len() int      { return len(l) }
func (l Lexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l Lexers) Less(i, j int) bool {
	return strings.ToLower(l[i].Config().Name) < strings.ToLower(l[j].Config().Name)
}

// PrioritisedLexers is a slice of lexers sortable by priority.
type PrioritisedLexers []Lexer

func (l PrioritisedLexers) Len() int      { return len(l) }
func (l PrioritisedLexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l PrioritisedLexers) Less(i, j int) bool {
	ip := l[i].Config().Priority
	if ip == 0 {
		ip = 1
	}
	jp := l[j].Config().Priority
	if jp == 0 {
		jp = 1
	}
	return ip > jp
}
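
// bestLexer is an illustrative sketch, not part of the original API:
// because PrioritisedLexers.Less compares with > and treats a zero
// Priority as 1, sort.Sort orders the slice highest-priority first,
// leaving the best candidate at index 0.
func bestLexer(candidates PrioritisedLexers) Lexer {
	if len(candidates) == 0 {
		return nil
	}
	sort.Sort(candidates)
	return candidates[0]
}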
    
// Analyser determines how appropriate this lexer is for the given text.
type Analyser interface {
	AnalyseText(text string) float32
}
  162. }