import ply.lex as lex
from ply.lex import TOKEN


class SQLLexer(object):
    # Map of reserved words to their token types (upper- and lower-case forms).
    reserved = {
        'SELECT': 'SELECT',
        'select': 'SELECT',
        'FROM': 'FROM',
        'from': 'FROM',
        'WHERE': 'WHERE',
        'where': 'WHERE',
        'AND': 'AND',
        'and': 'AND',
        'or': 'OR',
        'OR': 'OR',
    }

    # List of token names, plus the reserved-word token types defined above.
    tokens = [
        'COMMENT',
        'NUMBER', 'ID', 'LITERAL',
        'LPAREN', 'RPAREN',
        'EQ', 'NEQ', 'GT', 'LT', 'LTE', 'GTE', 'SEMI', 'COMMA', 'WILD',
    ] + list(set(reserved.values()))

    # Regular expression rules for tokens

    number = r'\d+'
    identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
    literal = r'"([^"\\]|\\.)*"'  # r'\"[a-zA-Z0-9_\-=<>]*\"'
    comment = r'--.*'

    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_LTE = r'<='
    t_GTE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_EQ = r'='
    t_NEQ = r'!='
    t_SEMI = r';'
    t_COMMA = r','
    t_WILD = r'\*'

    # Ignore spaces, tabs, and newlines
    t_ignore = ' \t\n'

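    # Note: PLY sorts the simple string rules above by decreasing regular-expression
    # length, so t_LTE ('<=') is tried before t_LT ('<'); the function rules below
    # are tried in the order they are defined.
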
    # Token rules with actions

    # Only integer numbers for this example
    @TOKEN(number)
    def t_NUMBER(self, t):
        t.value = int(t.value)
        return t

    # Strip the surrounding quotes from string literals
    @TOKEN(literal)
    def t_LITERAL(self, t):
        t.value = t.value[1:-1]
        return t

    # Identifiers
    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.reserved.get(t.value, 'ID')
        return t

    # Ignore comments (no return value, so the token is discarded)
    @TOKEN(comment)
    def t_COMMENT(self, t):
        pass

    # Error handling: report the offending character and stop
    def t_error(self, t):
        raise SyntaxError("Illegal character '%s'" % t.value[0])
        # t.lexer.skip(1)  # alternative: skip the character and keep lexing

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def input(self, data):
        self.lexer.input(data)

    def token(self):
        return self.lexer.token()

    # Tokenize a whole string and return the resulting tokens as a list
    def tokenize(self, data):
        tokens = []
        self.lexer.input(data)
        while True:
            tok = self.lexer.token()
            if not tok:
                break
            tokens.append(tok)
        return tokens

    # Dictionary with every token type mapped to an initial count of zero
    def getTokensHash(self):
        return dict((k, 0) for k in self.tokens)


if __name__ == '__main__':
    sqllex = SQLLexer()
    sqllex.build()
    tok_counter = sqllex.getTokensHash()
    while True:
        try:
            s = input("sql> ")
        except EOFError:
            break
        for tok in sqllex.tokenize(s):
            print(tok)
            tok_counter[tok.type] += 1

    print(tok_counter)
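
# Illustrative session (a sketch of the expected output; the lexpos values are
# computed for this particular query, and the token repr comes from PLY's LexToken):
#
#   sql> SELECT name FROM users WHERE age >= 21;
#   LexToken(SELECT,'SELECT',1,0)
#   LexToken(ID,'name',1,7)
#   LexToken(FROM,'FROM',1,12)
#   LexToken(ID,'users',1,17)
#   LexToken(WHERE,'WHERE',1,23)
#   LexToken(ID,'age',1,29)
#   LexToken(GTE,'>=',1,33)
#   LexToken(NUMBER,21,1,36)
#   LexToken(SEMI,';',1,38)
#
# On EOF (Ctrl-D), the final print(tok_counter) reports how many tokens of each
# type were seen over the whole session.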