SQLLexer.py

import ply.lex as lex
from ply.lex import TOKEN


class SQLLexer(object):
    # Reserved words, listed in both cases so SELECT and select
    # resolve to the same token type.
    reserved = {
        'SELECT': 'SELECT',
        'select': 'SELECT',
        'FROM': 'FROM',
        'from': 'FROM',
        'WHERE': 'WHERE',
        'where': 'WHERE',
        'AND': 'AND',
        'and': 'AND',
        'OR': 'OR',
        'or': 'OR',
    }

    # List of token names.
    tokens = [
        'COMMENT',
        'NUMBER', 'ID', 'LITERAL',
        'LPAREN', 'RPAREN',
        'EQ', 'NEQ', 'GT', 'LT', 'LTE', 'GTE', 'SEMI', 'COMMA', 'WILD',
    ] + list(set(reserved.values()))
  22. # REGULAR EXPRESSION RULES FOR TOKENS
  23. number = r''
  24. identifier = r''
  25. literal = r''
  26. comment = r''
    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_LTE = r'<='
    t_GTE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_EQ = r'='
    t_NEQ = r'!='
    t_SEMI = r';'
    t_COMMA = r','
    t_WILD = r'\*'

    # Ignore whitespace characters
    t_ignore = ' \t\n'
    # Action rules next.

    # Only integer numbers for this example.
    @TOKEN(number)
    def t_NUMBER(self, t):
        t.value = int(t.value)
        return t

    # Extract a literal's content by stripping the surrounding quotes.
    @TOKEN(literal)
    def t_LITERAL(self, t):
        if t.value == '""':
            t.value = ""
        else:
            t.value = t.value[1:-1]
        return t

    # Identifiers; reserved words are re-typed via the lookup table.
    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.reserved.get(t.value, 'ID')
        return t

    # Ignore comments: returning nothing discards the token.
    @TOKEN(comment)
    def t_COMMENT(self, t):
        pass
    def t_error(self, t):
        # The original printed the message and then used a bare raise,
        # which fails outside an except block; raise a real exception.
        raise SyntaxError("Illegal character '%s'" % t.value[0])
        # Alternatively, skip the offending character and keep lexing:
        # t.lexer.skip(1)
    # Build the lexer.
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def input(self, data):
        self.lexer.input(data)

    def token(self):
        return self.lexer.token()

    def tokenize(self, data):
        tokens = []
        self.lexer.input(data)
        while True:
            tok = self.lexer.token()
            if not tok:
                break
            tokens.append(tok)
        return tokens

    def getTokensHash(self):
        return {k: 0 for k in self.tokens}

if __name__ == '__main__':
    sqllex = SQLLexer()
    sqllex.build()
    tok_counter = sqllex.getTokensHash()
    while True:
        try:
            s = input("sql> ")  # raw_input in the original Python 2 code
        except EOFError:
            break
        for tok in sqllex.tokenize(s):
            print(tok)
            tok_counter[tok.type] += 1
        print(tok_counter)
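
A quick sanity check, assuming the filled-in patterns above (the original left them blank): each lexeme becomes a PLY LexToken, reserved words are re-typed by t_ID, and the tok_counter dict printed after each line accumulates per-token-type counts. The lexpos offsets depend on the exact spacing of the input, but a session should look roughly like this:

sql> SELECT name FROM users WHERE age >= 21;
LexToken(SELECT,'SELECT',1,0)
LexToken(ID,'name',1,7)
LexToken(FROM,'FROM',1,12)
LexToken(ID,'users',1,17)
LexToken(WHERE,'WHERE',1,23)
LexToken(ID,'age',1,29)
LexToken(GTE,'>=',1,33)
LexToken(NUMBER,21,1,36)
LexToken(SEMI,';',1,38)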