import re class Tokenizer: def __init__(self): self.i = 0 self.file = '' self.symbols = ('(', ')', '[', ']', '}', '{', '>', '<', '=', '*', '+', '-', '/', '.', ';', ',', '&', '|', '~') self.key_word = ( 'class', 'method', 'function', 'constructor', 'int', 'boolean', 'char', 'void', 'var', 'static', 'field', 'let', 'do', 'if', 'else', 'while', 'return', 'true', 'false', 'null', 'this') self.token = '' def token_type(self, token=None): if token is None: token = self.token if token is None or token == '': return None if token in self.key_word: return 'keyword' elif token[0] == '"': return 'stringConstant' elif re.match(r"\d+", token): return 'integerConstant' elif token in self.symbols: return 'symbol' else: return 'identifier' def advance(self): token = '' i = self.i while i < len(self.file): if re.match(r'\s', self.file[i]): i = i + 1 continue else: if self.file[i] in self.symbols: self.token = self.file[i] self.i = i + 1 return elif self.file[i] == '"': i += 1 while self.file[i] != '"': token += self.file[i] i += 1 self.i = i + 1 self.token = '"' + token + '"' return else: while re.match(r'\w', self.file[i]): token += self.file[i] if i + 1 > len(self.file) - 1: break i += 1 self.i = i self.token = token return def clear_file(self, directory): with open(directory, "r") as my_file: txt = my_file.read() txt = re.sub(r"//.*", "", txt) txt = re.sub(r"/[*][*].*[*]/", "", txt) i = 0 # TODO this should be a regex while i < len(txt): if txt[i] == '/' and txt[i + 1] == '*' and txt[i + 2] == '*': start = i while txt[i] != '*' or txt[i + 1] != '/': i += 1 stop = i + 2 txt = txt[:start] + txt[stop:len(txt)] i = start - 1 i += 1 self.file = txt