#!/usr/bin/env python3
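"""Convert a Pygments RegexLexer class into an XML lexer definition.

The lexer class named by a dotted path on the command line is imported and its
token rules are rendered through TEMPLATE below (the element names match the
XML lexer format used by the Chroma highlighter, which is assumed to be the
consumer).
"""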
    
import functools
import importlib
import json
import os
import re
import sys
import types
import html

import pystache
from pygments import lexer as pygments_lexer
from pygments.token import _TokenType

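# Mustache template for the generated XML. translate_rules() pre-renders each
# <rule> element as a string, so rules are interpolated unescaped via {{{.}}}.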
    
TEMPLATE = r'''
<lexer>
  <config>
    <name>{{name}}</name>
    {{#aliases}}
    <alias>{{alias}}</alias>
    {{/aliases}}
    {{#filenames}}
    <filename>{{filename}}</filename>
    {{/filenames}}
    {{#mimetypes}}
    <mime_type>{{mimetype}}</mime_type>
    {{/mimetypes}}
    {{#re_ignorecase}}
    <case_insensitive>true</case_insensitive>
    {{/re_ignorecase}}
    {{#re_dotall}}
    <dot_all>true</dot_all>
    {{/re_dotall}}
    {{#re_not_multiline}}
    <not_multiline>true</not_multiline>
    {{/re_not_multiline}}
  </config>
  <rules>
    {{#tokens}}
    <state name="{{state}}">
      {{#rules}}
      {{{.}}}
      {{/rules}}
    </state>
    {{/tokens}}
  </rules>
</lexer>
'''


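# Regex patterns and literal strings become double-quoted, HTML-escaped XML
# attribute values.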
    
def xml_regex(s):
    return xml_string(s)

def xml_string(s):
    s = html.escape(s)
    return '"' + s + '"'


def to_camel_case(snake_str):
    components = snake_str.split('_')
    return ''.join(x.title() for x in components)


def warning(message):
    print('warning: ' + message, file=sys.stderr)


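# Translate a Pygments emitter (a token type, or a bygroups()/using() callback)
# into the matching XML element. Callbacks are recognised by their repr(), and
# their original arguments are dug out of the closure cells.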
    
def resolve_emitter(emitter):
    if isinstance(emitter, types.FunctionType):
        # bygroups() and using() return closures; recover their arguments from
        # the closure cells.
        if repr(emitter).startswith('<function bygroups.'):
            args = emitter.__closure__[0].cell_contents
            emitter = '<bygroups>%s</bygroups>' % ''.join(resolve_emitter(e) for e in args)
        elif repr(emitter).startswith('<function using.'):
            args = emitter.__closure__[0].cell_contents
            if isinstance(args, dict):
                state = 'root'
                if 'stack' in args:
                    state = args['stack'][1]
                    args.pop('stack')
                assert args == {}, args
                emitter = '<usingself state="%s"/>' % state
            elif issubclass(args, pygments_lexer.Lexer):
                name = args.__name__
                if name.endswith('Lexer'):
                    name = name[:-5]
                emitter = '<using lexer="%s"/>' % name
            else:
                raise ValueError('only support "using" with lexer classes, not %r' % args)
        else:
            warning('unsupported emitter function %r' % emitter)
            emitter = '?? %r ??' % emitter
    elif isinstance(emitter, _TokenType):
        # str(emitter) looks like "Token.Name.Function"; drop the dots and the
        # leading "Token" to get the XML token type.
        emitter = '<token type="%s"/>' % str(emitter).replace('.', '')[5:]
    elif emitter is None:
        return 'None'
    else:
        raise ValueError('unsupported emitter type %r' % emitter)
    assert isinstance(emitter, str)
    return emitter


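# Translate a Pygments state action ('#pop', '#pop:<n>', '#push', '#push:<state>'
# or a plain state name) into <pop>/<push> elements, always returned as a tuple
# so that tuples of actions can be flattened recursively.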
    
def process_state_action(action):
    if isinstance(action, tuple):
        return functools.reduce(lambda a, b: a + b, (process_state_action(a) for a in action))
    if action.startswith('#'):
        action = action[1:]
        if action == 'pop':
            action = '<pop depth="1"/>'
        elif action.startswith('pop:'):
            action = '<pop depth="%s"/>' % action[4:]
        elif action == 'push':
            action = '<push/>'
        elif action.startswith('push:'):
            action = '<push state="%s"/>' % action[5:]
        else:
            raise ValueError('unsupported action %r' % (action,))
    else:
        action = '<push state="%s"/>' % action
    return (action,)


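# Translate one state's rule list into pre-rendered <rule> XML fragments,
# handling plain (regex, token[, action]) tuples as well as include() and
# default() entries.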
    
def translate_rules(rules):
    out = []
    for rule in rules:
        if isinstance(rule, tuple):
            regex = rule[0]
            if isinstance(regex, str):
                regex = xml_regex(regex)
            elif isinstance(regex, pygments_lexer.words):
                regex = xml_string('%s(%s)%s' % (regex.prefix,
                                                 '|'.join(w for w in regex.words),
                                                 regex.suffix))
            else:
                raise ValueError('expected regex string but got %r' % regex)
            emitter = resolve_emitter(rule[1])
            if len(rule) == 2:
                modifier = ''
            elif type(rule[2]) is str:
                modifier = process_state_action(rule[2])[0]
            elif isinstance(rule[2], pygments_lexer.combined):
                modifier = '<combined state="%s"/>' % '" state="'.join(rule[2])
            elif type(rule[2]) is tuple:
                modifier = '<push state="%s"/>' % '" state="'.join(rule[2])
            else:
                raise ValueError('unsupported modifier %r' % (rule[2],))
            out.append('<rule pattern={}>{}{}</rule>'.format(regex, emitter, modifier))
        elif isinstance(rule, pygments_lexer.include):
            out.append('<rule><include state="{}"/></rule>'.format(rule))
        elif isinstance(rule, pygments_lexer.default):
            out.append('<rule>{}</rule>'.format(''.join(process_state_action(rule.state))))
        else:
            raise ValueError('unsupported rule %r' % (rule,))
    return out


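# Pystache view over the lexer metadata; the re_* methods expose the lexer's
# regex flags as booleans for the conditional sections in TEMPLATE.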
    
class TemplateView(object):
    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)

    def re_not_multiline(self):
        return not (self.regex_flags & re.MULTILINE)

    def re_dotall(self):
        return self.regex_flags & re.DOTALL

    def re_ignorecase(self):
        return self.regex_flags & re.IGNORECASE


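# Entry point: sys.argv[1] is the dotted path of a Pygments RegexLexer
# subclass, e.g. pygments.lexers.python.PythonLexer.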
    
def main():
    package_name, symbol_name = sys.argv[1].rsplit(sep=".", maxsplit=1)

    package = importlib.import_module(package_name)

    lexer_cls = getattr(package, symbol_name)

    assert issubclass(lexer_cls, pygments_lexer.RegexLexer), 'can only translate from RegexLexer'

    print(pystache.render(TEMPLATE, TemplateView(
        name=lexer_cls.name,
        regex_flags=lexer_cls.flags,
        aliases=[{'alias': alias} for alias in lexer_cls.aliases],
        filenames=[{'filename': filename} for filename in lexer_cls.filenames],
        mimetypes=[{'mimetype': mimetype} for mimetype in lexer_cls.mimetypes],
        tokens=[{'state': state, 'rules': translate_rules(rules)}
                for (state, rules) in lexer_cls.get_tokendefs().items()],
    )))


if __name__ == '__main__':
    main()
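

# Example invocation (the script filename here is illustrative; adjust it to
# wherever this file lives):
#
#   python3 pygments2xml.py pygments.lexers.python.PythonLexer > python.xml
#
# The generated XML lexer definition is written to stdout.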