import ply.lex as lex
from ply.lex import TOKEN


class SQLLexer(object):
    # Map of reserved words to their token types (upper- and lower-case forms).
    reserved = {
        'SELECT': 'SELECT',
        'select': 'SELECT',
        'FROM': 'FROM',
        'from': 'FROM',
        'WHERE': 'WHERE',
        'where': 'WHERE',
        'AND': 'AND',
        'and': 'AND',
        'or': 'OR',
        'OR': 'OR',
    }

    # List of token names, plus the reserved-word token types defined above.
    tokens = [
        'COMMENT',
        'NUMBER', 'ID', 'LITERAL',
        'LPAREN', 'RPAREN',
        'EQ', 'NEQ', 'GT', 'LT', 'LTE', 'GTE', 'SEMI', 'COMMA', 'WILD',
    ] + list(set(reserved.values()))

    # Regular expression rules for tokens

    number = r'\d+'
    identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
    literal = r'"([^"\\]|\\.)*"'  # r'\"[a-zA-Z0-9_\-=<>]*\"'
    comment = r'--.*'

    t_LPAREN = r'\('
    t_RPAREN = r'\)'
    t_LTE = r'<='
    t_GTE = r'>='
    t_LT = r'<'
    t_GT = r'>'
    t_EQ = r'='
    t_NEQ = r'!='
    t_SEMI = r';'
    t_COMMA = r','
    t_WILD = r'\*'

    # Ignore spaces, tabs, and newlines
    t_ignore = ' \t\n'

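    # Note: PLY sorts the simple string rules above by decreasing regular-expression
    # length, so t_LTE ('<=') is tried before t_LT ('<'); the function rules below
    # are tried in the order they are defined.
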
    # Token rules with actions

    # Only integer numbers for this example
    @TOKEN(number)
    def t_NUMBER(self, t):
        t.value = int(t.value)
        return t

    # Strip the surrounding quotes from string literals
    @TOKEN(literal)
    def t_LITERAL(self, t):
        t.value = t.value[1:-1]
        return t

    # Identifiers
    @TOKEN(identifier)
    def t_ID(self, t):
        t.type = self.reserved.get(t.value, 'ID')
        return t

    # Ignore comments (no return value, so the token is discarded)
    @TOKEN(comment)
    def t_COMMENT(self, t):
        pass

    # Error handling: report the offending character and stop
    def t_error(self, t):
        raise SyntaxError("Illegal character '%s'" % t.value[0])
        # t.lexer.skip(1)  # alternative: skip the character and keep lexing

    # Build the lexer
    def build(self, **kwargs):
        self.lexer = lex.lex(module=self, **kwargs)

    def input(self, data):
        self.lexer.input(data)

    def token(self):
        return self.lexer.token()

    # Tokenize a whole string and return the resulting tokens as a list
    def tokenize(self, data):
        tokens = []
        self.lexer.input(data)
        while True:
            tok = self.lexer.token()
            if not tok:
                break
            tokens.append(tok)
        return tokens

    # Dictionary with every token type mapped to an initial count of zero
    def getTokensHash(self):
        return dict((k, 0) for k in self.tokens)


if __name__ == '__main__':
    sqllex = SQLLexer()
    sqllex.build()
    tok_counter = sqllex.getTokensHash()
    while True:
        try:
            s = input("sql> ")
        except EOFError:
            break
        for tok in sqllex.tokenize(s):
            print(tok)
            tok_counter[tok.type] += 1

    print(tok_counter)
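
# Illustrative session (a sketch of the expected output; the lexpos values are
# computed for this particular query, and the token repr comes from PLY's LexToken):
#
#   sql> SELECT name FROM users WHERE age >= 21;
#   LexToken(SELECT,'SELECT',1,0)
#   LexToken(ID,'name',1,7)
#   LexToken(FROM,'FROM',1,12)
#   LexToken(ID,'users',1,17)
#   LexToken(WHERE,'WHERE',1,23)
#   LexToken(ID,'age',1,29)
#   LexToken(GTE,'>=',1,33)
#   LexToken(NUMBER,21,1,36)
#   LexToken(SEMI,';',1,38)
#
# On EOF (Ctrl-D), the final print(tok_counter) reports how many tokens of each
# type were seen over the whole session.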