import ply.lex as lex
from ply.lex import TOKEN


class SQLLexer(object):
    # Reserved words. SQL keywords are case-insensitive, so identifiers
    # are lowercased before the lookup in t_ID.
    reserved = {
        'select': 'SELECT',
        'from':   'FROM',
        'where':  'WHERE',
        'and':    'AND',
        'or':     'OR',
    }

    # List of token names.
    tokens = [
        'COMMENT', 'NUMBER', 'ID', 'LITERAL',
        'LPAREN', 'RPAREN',
        'EQ', 'NEQ', 'GT', 'LT', 'LTE', 'GTE',
        'SEMI', 'COMMA', 'WILD',
    ] + list(set(reserved.values()))

    # Regular expression rules for tokens
    number     = r'\d+'                       # integer numbers only
    identifier = r'[a-zA-Z_][a-zA-Z_0-9]*'    # bare names and keywords
    literal    = r'"[^"]*"'                   # double-quoted strings
    comment    = r'--[^\n]*'                  # single-line SQL comments

    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_LTE    = r'<='
    t_GTE    = r'>='
    t_LT     = r'<'
    t_GT     = r'>'
    t_EQ     = r'='
    t_NEQ    = r'!='
    t_SEMI   = r';'
    t_COMMA  = r','
    t_WILD   = r'\*'

    # Ignored characters (spaces, tabs and newlines)
    t_ignore = ' \t\n'

    # Action rules

    # Only integer numbers for this example
    @TOKEN(number)
    def t_NUMBER(self, t):
        t.value = int(t.value)
        return t

    # Extract the literal's content by stripping the surrounding quotes
    @TOKEN(literal)
    def t_LITERAL(self, t):
        t.value = t.value[1:-1]
        return t

    # Identifiers; reserved words are re-tagged with their own token type
    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.reserved.get(t.value.lower(), 'ID')
        return t

    # Ignore comments (returning nothing discards the match)
    @TOKEN(comment)
    def t_COMMENT(self, t):
        pass

    def t_error(self, t):
        print("Illegal character '%s'" % t.value[0])
        t.lexer.skip(1)

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def input(self, data):
        self.lexer.input(data)

    def token(self):
        return self.lexer.token()

    def tokenize(self, data):
        tokens = []
        self.lexer.input(data)
        while True:
            tok = self.lexer.token()
            if not tok:
                break
            tokens.append(tok)
        return tokens

    def getTokensHash(self):
        # A fresh {token_name: 0} dict, used below to count token types
        return dict((k, 0) for k in self.tokens)


if __name__ == '__main__':
    sqllex = SQLLexer()
    sqllex.build()
    tok_counter = sqllex.getTokensHash()
    while True:
        try:
            s = input("sql> ")
        except EOFError:
            break
        for tok in sqllex.tokenize(s):
            print(tok)
            tok_counter[tok.type] += 1
        print(tok_counter)
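
# A sample session, assuming the token patterns above. PLY prints tokens
# with its LexToken repr: LexToken(type,value,lineno,lexpos). The lexpos
# values below correspond to this particular input; lineno stays 1
# because newlines are ignored rather than tracked.
#
#   sql> SELECT name FROM users WHERE id = 1;
#   LexToken(SELECT,'SELECT',1,0)
#   LexToken(ID,'name',1,7)
#   LexToken(FROM,'FROM',1,12)
#   LexToken(ID,'users',1,17)
#   LexToken(WHERE,'WHERE',1,23)
#   LexToken(ID,'id',1,29)
#   LexToken(EQ,'=',1,32)
#   LexToken(NUMBER,1,1,34)
#   LexToken(SEMI,';',1,35)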