package chroma

import (
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/dlclark/regexp2"
)

// A Rule is the fundamental matching unit of the Regex lexer state machine.
type Rule struct {
	Pattern string
	Type    Emitter
	Mutator Mutator
}

// Words creates a regex that matches any of the given literal words.
func Words(prefix, suffix string, words ...string) string {
	// Sort longest first so the generated alternation prefers the longest match.
	sort.Slice(words, func(i, j int) bool {
		return len(words[j]) < len(words[i])
	})
	for i, word := range words {
		words[i] = regexp.QuoteMeta(word)
	}
	return prefix + `(` + strings.Join(words, `|`) + `)` + suffix
}
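
// An illustrative sketch (the inputs are hypothetical, not from this
// package); note that "into" sorts before "in" so the longer word wins:
//
//	Words(`\b`, `\b`, "in", "into") // => `\b(into|in)\b`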
    
// Tokenise text using lexer, returning tokens as a slice.
func Tokenise(lexer Lexer, options *TokeniseOptions, text string) ([]Token, error) {
	var out []Token
	it, err := lexer.Tokenise(options, text)
	if err != nil {
		return nil, err
	}
	for t := it(); t != EOF; t = it() {
		out = append(out, t)
	}
	return out, nil
}
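
// A minimal usage sketch (the lexer variable is assumed to come from
// elsewhere, e.g. a registry lookup):
//
//	tokens, err := Tokenise(lexer, nil, "x = 1\n")
//	if err != nil {
//		// handle error
//	}
//	for _, t := range tokens {
//		fmt.Println(t.Type, t.Value)
//	}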
    
// Rules maps from state to a sequence of Rules.
type Rules map[string][]Rule

// Rename returns a clone of the Rules with state oldRule renamed to newRule.
func (r Rules) Rename(oldRule, newRule string) Rules {
	r = r.Clone()
	r[newRule] = r[oldRule]
	delete(r, oldRule)
	return r
}

// Clone returns a clone of the Rules.
func (r Rules) Clone() Rules {
	out := map[string][]Rule{}
	for key, rules := range r {
		out[key] = make([]Rule, len(rules))
		copy(out[key], rules)
	}
	return out
}

// Merge creates a clone of "r" then merges "rules" into the clone.
func (r Rules) Merge(rules Rules) Rules {
	out := r.Clone()
	for k, v := range rules.Clone() {
		out[k] = v
	}
	return out
}
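
// Note that Merge replaces whole states rather than appending to them. An
// illustrative sketch (the rules shown are hypothetical):
//
//	base := Rules{"root": {{`\d+`, Number, nil}}}
//	out := base.Merge(Rules{"root": {{`\w+`, Name, nil}}})
//	// out["root"] now contains only the `\w+` rule.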
    
// MustNewLexer creates a new Lexer with deferred rules generation or panics.
func MustNewLexer(config *Config, rules func() Rules) *RegexLexer {
	lexer, err := NewLexer(config, rules)
	if err != nil {
		panic(err)
	}
	return lexer
}
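
// A minimal construction sketch (the config and rules are illustrative,
// not a real lexer definition from this project):
//
//	var ini = MustNewLexer(&Config{Name: "INI"}, func() Rules {
//		return Rules{
//			"root": {
//				{`\s+`, Whitespace, nil},
//				{`;.*`, Comment, nil},
//				{`\[.*?\]`, Keyword, nil},
//				{`(\w+)(\s*)(=)`, ByGroups(Name, Whitespace, Operator), nil},
//				{`.`, Text, nil},
//			},
//		}
//	})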
    
// NewLexer creates a new regex-based Lexer.
//
// "rulesFunc" returns a state machine transition map. Each key is a state; its value is
// the sequence of rules that match input, optionally mutate the lexer state, and emit tokens.
func NewLexer(config *Config, rulesFunc func() Rules) (*RegexLexer, error) {
	if config == nil {
		config = &Config{}
	}
	for _, glob := range append(config.Filenames, config.AliasFilenames...) {
		_, err := filepath.Match(glob, "")
		if err != nil {
			return nil, fmt.Errorf("%s: %q is not a valid glob: %w", config.Name, glob, err)
		}
	}
	r := &RegexLexer{
		config:         config,
		fetchRulesFunc: func() (Rules, error) { return rulesFunc(), nil },
	}
	// One-off code to generate XML lexers in the Chroma source tree.
	// var nameCleanRe = regexp.MustCompile(`[^-+A-Za-z0-9_]`)
	// name := strings.ToLower(nameCleanRe.ReplaceAllString(config.Name, "_"))
	// data, err := Marshal(r)
	// if err != nil {
	// 	if errors.Is(err, ErrNotSerialisable) {
	// 		fmt.Fprintf(os.Stderr, "warning: %q: %s\n", name, err)
	// 		return r, nil
	// 	}
	// 	return nil, err
	// }
	// _, file, _, ok := runtime.Caller(2)
	// if !ok {
	// 	panic("??")
	// }
	// fmt.Println(file)
	// if strings.Contains(file, "/lexers/") {
	// 	dir := filepath.Join(filepath.Dir(file), "embedded")
	// 	err = os.MkdirAll(dir, 0700)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// 	filename := filepath.Join(dir, name) + ".xml"
	// 	fmt.Println(filename)
	// 	err = ioutil.WriteFile(filename, data, 0600)
	// 	if err != nil {
	// 		return nil, err
	// 	}
	// }
	return r, nil
}

// Trace enables debug tracing.
func (r *RegexLexer) Trace(trace bool) *RegexLexer {
	r.trace = trace
	return r
}

// A CompiledRule is a Rule with a pre-compiled regex.
//
// Note that regular expressions are lazily compiled on first use of the lexer.
type CompiledRule struct {
	Rule
	Regexp *regexp2.Regexp
	flags  string
}

// CompiledRules maps each state name to the sequence of compiled rules for that state.
type CompiledRules map[string][]*CompiledRule

// LexerState contains the state for a single lex.
type LexerState struct {
	Lexer    *RegexLexer
	Registry *LexerRegistry
	Text     []rune
	Pos      int
	Rules    CompiledRules
	Stack    []string
	State    string
	Rule     int
	// Group matches.
	Groups []string
	// Named group matches.
	NamedGroups map[string]string
	// Custom context for mutators.
	MutatorContext map[interface{}]interface{}
	iteratorStack  []Iterator
	options        *TokeniseOptions
	newlineAdded   bool
}

// Set mutator context.
func (l *LexerState) Set(key interface{}, value interface{}) {
	l.MutatorContext[key] = value
}

// Get mutator context.
func (l *LexerState) Get(key interface{}) interface{} {
	return l.MutatorContext[key]
}
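
// A sketch of how a Mutator might use this context (the key and the
// depth-counting logic are hypothetical):
//
//	var depthKey = "depth" // hypothetical context key
//	push := MutatorFunc(func(l *LexerState) error {
//		d, _ := l.Get(depthKey).(int)
//		l.Set(depthKey, d+1)
//		return nil
//	})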
    
// Iterator returns the next Token from the lexer.
func (l *LexerState) Iterator() Token { // nolint: gocognit
	end := len(l.Text)
	if l.newlineAdded {
		end--
	}
	for l.Pos < end && len(l.Stack) > 0 {
		// Exhaust the iterator stack, if any.
		for len(l.iteratorStack) > 0 {
			n := len(l.iteratorStack) - 1
			t := l.iteratorStack[n]()
			if t == EOF {
				l.iteratorStack = l.iteratorStack[:n]
				continue
			}
			return t
		}

		l.State = l.Stack[len(l.Stack)-1]
		if l.Lexer.trace {
			fmt.Fprintf(os.Stderr, "%s: pos=%d, text=%q\n", l.State, l.Pos, string(l.Text[l.Pos:]))
		}
		selectedRule, ok := l.Rules[l.State]
		if !ok {
			panic("unknown state " + l.State)
		}
		ruleIndex, rule, groups, namedGroups := matchRules(l.Text, l.Pos, selectedRule)
		// No match.
		if groups == nil {
			// From Pygments :\
			//
			// If the RegexLexer encounters a newline that is flagged as an error token, the stack is
			// emptied and the lexer continues scanning in the 'root' state. This can help produce
			// error-tolerant highlighting for erroneous input, e.g. when a single-line string is not
			// closed.
			if l.Text[l.Pos] == '\n' && l.State != l.options.State {
				l.Stack = []string{l.options.State}
				continue
			}
			l.Pos++
			return Token{Error, string(l.Text[l.Pos-1 : l.Pos])}
		}
		l.Rule = ruleIndex
		l.Groups = groups
		l.NamedGroups = namedGroups
		l.Pos += utf8.RuneCountInString(groups[0])
		if rule.Mutator != nil {
			if err := rule.Mutator.Mutate(l); err != nil {
				panic(err)
			}
		}
		if rule.Type != nil {
			l.iteratorStack = append(l.iteratorStack, rule.Type.Emit(l.Groups, l))
		}
	}
	// Exhaust the iterator stack, if any.
	// Duplicate code, but eh.
	for len(l.iteratorStack) > 0 {
		n := len(l.iteratorStack) - 1
		t := l.iteratorStack[n]()
		if t == EOF {
			l.iteratorStack = l.iteratorStack[:n]
			continue
		}
		return t
	}

	// If we get here and still have text, return it as an error.
	if l.Pos != len(l.Text) && len(l.Stack) == 0 {
		value := string(l.Text[l.Pos:])
		l.Pos = len(l.Text)
		return Token{Type: Error, Value: value}
	}
	return EOF
}

// RegexLexer is the default lexer implementation used in Chroma.
type RegexLexer struct {
	registry *LexerRegistry // The LexerRegistry this Lexer is associated with, if any.
	config   *Config
	analyser func(text string) float32
	trace    bool

	mu             sync.Mutex
	compiled       bool
	rawRules       Rules
	rules          map[string][]*CompiledRule
	fetchRulesFunc func() (Rules, error)
	compileOnce    sync.Once
}

func (r *RegexLexer) String() string {
	return r.config.Name
}

// Rules in the Lexer.
func (r *RegexLexer) Rules() (Rules, error) {
	if err := r.needRules(); err != nil {
		return nil, err
	}
	return r.rawRules, nil
}

// SetRegistry sets the registry this lexer will use to look up other lexers if necessary.
func (r *RegexLexer) SetRegistry(registry *LexerRegistry) Lexer {
	r.registry = registry
	return r
}

// SetAnalyser sets the analyser function used to perform content inspection.
func (r *RegexLexer) SetAnalyser(analyser func(text string) float32) Lexer {
	r.analyser = analyser
	return r
}

// AnalyseText scores how likely a fragment of text is to match this lexer, between 0.0 and 1.0.
func (r *RegexLexer) AnalyseText(text string) float32 {
	if r.analyser != nil {
		return r.analyser(text)
	}
	return 0
}
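
// An illustrative analyser (the heuristic is hypothetical, not one shipped
// with this package):
//
//	lexer.SetAnalyser(func(text string) float32 {
//		if strings.HasPrefix(text, "#!/bin/sh") {
//			return 1.0 // near-certain match
//		}
//		return 0.0
//	})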
    
// SetConfig replaces the Config for this Lexer.
func (r *RegexLexer) SetConfig(config *Config) *RegexLexer {
	r.config = config
	return r
}

// Config returns the Config for this Lexer.
func (r *RegexLexer) Config() *Config {
	return r.config
}

// Regex compilation is deferred until the lexer is used. This avoids significant init() time costs.
func (r *RegexLexer) maybeCompile() (err error) {
	r.mu.Lock()
	defer r.mu.Unlock()
	if r.compiled {
		return nil
	}
	for state, rules := range r.rules {
		for i, rule := range rules {
			if rule.Regexp == nil {
				pattern := "(?:" + rule.Pattern + ")"
				if rule.flags != "" {
					pattern = "(?" + rule.flags + ")" + pattern
				}
				// \G anchors the match to the current scan position.
				pattern = `\G` + pattern
				rule.Regexp, err = regexp2.Compile(pattern, 0)
				if err != nil {
					return fmt.Errorf("failed to compile rule %s.%d: %s", state, i, err)
				}
				rule.Regexp.MatchTimeout = time.Millisecond * 250
			}
		}
	}
restart:
	seen := map[LexerMutator]bool{}
	for state := range r.rules {
		for i := 0; i < len(r.rules[state]); i++ {
			rule := r.rules[state][i]
			if compile, ok := rule.Mutator.(LexerMutator); ok {
				if seen[compile] {
					return fmt.Errorf("saw mutator %T twice; this should not happen", compile)
				}
				seen[compile] = true
				if err := compile.MutateLexer(r.rules, state, i); err != nil {
					return err
				}
				// Process the rules again in case the mutator added/removed rules.
				//
				// This sounds bad, but shouldn't be significant in practice.
				goto restart
			}
		}
	}
	r.compiled = true
	return nil
}

func (r *RegexLexer) fetchRules() error {
	rules, err := r.fetchRulesFunc()
	if err != nil {
		return fmt.Errorf("%s: failed to compile rules: %w", r.config.Name, err)
	}
	if _, ok := rules["root"]; !ok {
		return fmt.Errorf("no \"root\" state")
	}
	compiledRules := map[string][]*CompiledRule{}
	for state, rules := range rules {
		compiledRules[state] = nil
		for _, rule := range rules {
			flags := ""
			if !r.config.NotMultiline {
				flags += "m"
			}
			if r.config.CaseInsensitive {
				flags += "i"
			}
			if r.config.DotAll {
				flags += "s"
			}
			compiledRules[state] = append(compiledRules[state], &CompiledRule{Rule: rule, flags: flags})
		}
	}

	r.rawRules = rules
	r.rules = compiledRules
	return nil
}

func (r *RegexLexer) needRules() error {
	var err error
	if r.fetchRulesFunc != nil {
		r.compileOnce.Do(func() {
			err = r.fetchRules()
		})
	}
	if err := r.maybeCompile(); err != nil {
		return err
	}
	return err
}

// Tokenise text using lexer, returning an iterator.
func (r *RegexLexer) Tokenise(options *TokeniseOptions, text string) (Iterator, error) {
	err := r.needRules()
	if err != nil {
		return nil, err
	}
	if options == nil {
		options = defaultOptions
	}
	if options.EnsureLF {
		text = ensureLF(text)
	}
	newlineAdded := false
	if !options.Nested && r.config.EnsureNL && !strings.HasSuffix(text, "\n") {
		text += "\n"
		newlineAdded = true
	}
	state := &LexerState{
		Registry:       r.registry,
		newlineAdded:   newlineAdded,
		options:        options,
		Lexer:          r,
		Text:           []rune(text),
		Stack:          []string{options.State},
		Rules:          r.rules,
		MutatorContext: map[interface{}]interface{}{},
	}
	return state.Iterator, nil
}
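
// Callers typically drain the returned iterator until EOF. A sketch (the
// options shown are illustrative; nil selects the defaults):
//
//	it, err := lexer.Tokenise(&TokeniseOptions{State: "root", EnsureLF: true}, source)
//	if err != nil {
//		// handle error
//	}
//	for t := it(); t != EOF; t = it() {
//		// consume t
//	}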
    
// MustRules is like Rules() but will panic on error.
func (r *RegexLexer) MustRules() Rules {
	rules, err := r.Rules()
	if err != nil {
		panic(err)
	}
	return rules
}

// matchRules returns the index and rule of the first rule whose regex matches text at pos,
// along with its group and named-group captures. If no rule matches, groups is nil.
func matchRules(text []rune, pos int, rules []*CompiledRule) (int, *CompiledRule, []string, map[string]string) {
	for i, rule := range rules {
		match, err := rule.Regexp.FindRunesMatchStartingAt(text, pos)
		if match != nil && err == nil && match.Index == pos {
			groups := []string{}
			namedGroups := make(map[string]string)
			for _, g := range match.Groups() {
				namedGroups[g.Name] = g.String()
				groups = append(groups, g.String())
			}
			return i, rule, groups, namedGroups
		}
	}
	return 0, &CompiledRule{}, nil, nil
}
    
// ensureLF replaces \r and \r\n with \n.
// It is equivalent to successive strings.ReplaceAll calls, but more efficient.
// For example, "a\r\nb\rc" becomes "a\nb\nc".
func ensureLF(text string) string {
	buf := make([]byte, len(text))
	var j int
	for i := 0; i < len(text); i++ {
		c := text[i]
		if c == '\r' {
			if i < len(text)-1 && text[i+1] == '\n' {
				continue
			}
			c = '\n'
		}
		buf[j] = c
		j++
	}
	return string(buf[:j])
}