"""Syntax analyzer for Jack source files.

Walks the current working directory, tokenizes every ``*.jack`` file and
writes its parse tree next to it as ``<name>.xml``.
"""
import os
import re
from pathlib import Path
from xml.sax.saxutils import escape


class Tokenizer:
    """Split Jack source text into tokens, one per ``advance()`` call.

    Attributes:
        file: source text to tokenize (comments removed by ``clear_file``).
        i: index in ``file`` of the next unread character.
        token: the current token; ``''`` before the first ``advance()`` and
            once the input is exhausted.
        symbols: the single-character symbol tokens.
        key_word: the reserved words.
    """

    # Scanned left to right in a single pass.  String constants are matched
    # (and kept) first so that comment markers inside them survive; block
    # comments are matched lazily so each one ends at its *own* ``*/``.
    _STRING_OR_COMMENT = re.compile(r'"[^"\n]*"|/\*.*?\*/|//[^\n]*', re.DOTALL)
    _WORD = re.compile(r'\w+')

    def __init__(self):
        self.i = 0
        self.file = ''
        self.symbols = ('(', ')', '[', ']', '}', '{', '>', '<', '=', '*', '+',
                        '-', '/', '.', ';', ',', '&', '|', '~')
        self.key_word = (
            'class', 'method', 'function', 'constructor', 'int', 'boolean',
            'char', 'void', 'var', 'static', 'field', 'let', 'do', 'if',
            'else', 'while', 'return', 'true', 'false', 'null', 'this')
        self.token = ''

    def token_type(self):
        """Classify the current token.

        Returns:
            ``'keyword'``, ``'stringConstant'``, ``'integerConstant'``,
            ``'symbol'`` or ``'identifier'``; ``None`` when there is no
            current token.
        """
        if not self.token:
            return None
        if self.token in self.key_word:
            return 'keyword'
        if self.token[0] == '"':
            return 'stringConstant'
        if re.match(r"\d+", self.token):
            return 'integerConstant'
        if self.token in self.symbols:
            return 'symbol'
        return 'identifier'

    def advance(self):
        """Read the next token from ``file`` into ``token``.

        String constants keep their surrounding double quotes.  When only
        whitespace (or nothing) is left, ``token`` becomes ``''`` so callers
        can detect the end of input instead of seeing a stale token.

        Raises:
            ValueError: on an unterminated string constant or a character
                that cannot start any token.
        """
        text = self.file
        end = len(text)
        pos = self.i
        while pos < end and text[pos].isspace():
            pos += 1

        if pos >= end:
            self.i = pos
            self.token = ''
            return

        char = text[pos]
        if char in self.symbols:
            self.token = char
            self.i = pos + 1
            return

        if char == '"':
            closing = text.find('"', pos + 1)
            if closing == -1:
                raise ValueError('unterminated string constant')
            self.token = text[pos:closing + 1]
            self.i = closing + 1
            return

        word = self._WORD.match(text, pos)
        if word is None:
            # Without this the tokenizer would return '' forever without
            # consuming the offending character.
            raise ValueError('unexpected character {!r}'.format(char))
        self.token = word.group()
        # Move past the whole word, including when it ends the input.
        self.i = word.end()

    def clear_file(self, directory):
        """Load a source file, strip its comments and rewind the tokenizer.

        Removes ``// ...`` line comments and ``/* ... */`` / ``/** ... */``
        block comments (single- or multi-line).  Each comment is replaced by
        one space so the tokens on either side never fuse.  Text inside
        string constants is left untouched.

        Args:
            directory: path of the source file to read (despite the name,
                this is a file path, not a directory).
        """
        with open(directory, "r") as my_file:
            txt = my_file.read()
        self.file = self._STRING_OR_COMMENT.sub(
            lambda found: found.group() if found.group()[0] == '"' else ' ',
            txt)
        self.i = 0
        self.token = ''


class CompilationEngine:
    """Recursive-descent parser that renders a Jack class as indented XML.

    Every ``compile_*`` method expects the tokenizer's current token to be the
    first token of the construct it handles, and leaves the tokenizer on the
    first token after it.  Element names follow the Jack grammar
    non-terminals (nand2tetris project 10 output format).

    Attributes:
        string: the XML text produced so far.
        tab: current indentation, in spaces.
    """

    _OPERATORS = ('+', '-', '*', '/', '&', '|', '<', '>', '=')

    def __init__(self):
        self.string = ''
        self.tab = 0

    # -- output helpers ----------------------------------------------------

    def write_token(self, tokenizer):
        """Append the current token as ``<type> text </type>``.

        String constants are written without their quotes, and ``&``, ``<``
        and ``>`` are escaped so the output stays well-formed XML.

        Raises:
            ValueError: if the tokenizer has no current token (end of input).
        """
        kind = tokenizer.token_type()
        if kind is None:
            raise ValueError('unexpected end of input')
        text = tokenizer.token
        if kind == 'stringConstant':
            text = text.strip('"')
        self.string += '{0}<{1}> {2} </{1}>\n'.format(
            ' ' * self.tab, kind, escape(text))

    def _open(self, tag):
        """Write an opening structural tag and indent what follows."""
        self.string += ' ' * self.tab + '<' + tag + '>\n'
        self.tab += 2

    def _close(self, tag):
        """Dedent and write the matching closing structural tag."""
        self.tab -= 2
        self.string += ' ' * self.tab + '</' + tag + '>\n'

    def _eat(self, tokenizer, count=1):
        """Write and consume the next ``count`` tokens."""
        for _ in range(count):
            self.write_token(tokenizer)
            tokenizer.advance()

    # -- program structure -------------------------------------------------

    def compile_class(self, tokenizer):
        """Compile a whole class; this is the entry point for a fresh tokenizer.

        Raises:
            ValueError: if the class body contains something other than a
                class variable declaration or a subroutine.
        """
        tokenizer.advance()
        self._open('class')
        self._eat(tokenizer, 3)  # 'class' className '{'
        while tokenizer.token != '}':
            if tokenizer.token in ('static', 'field'):
                self.compile_class_var_dec(tokenizer)
            elif tokenizer.token in ('constructor', 'function', 'method'):
                self.compile_subroutine(tokenizer)
            else:
                # Previously this spun forever on the unrecognised token.
                raise ValueError('unexpected token {!r} in class body'
                                 .format(tokenizer.token))
        self._eat(tokenizer)  # '}'
        self._close('class')

    def compile_class_var_dec(self, tokenizer):
        """Compile ``(static | field) type name (, name)* ;``."""
        self._open('classVarDec')
        self._eat(tokenizer, 3)  # kind, type, first name
        while tokenizer.token == ',':
            self._eat(tokenizer, 2)  # ',' name
        self._eat(tokenizer)  # ';'
        self._close('classVarDec')

    def compile_subroutine(self, tokenizer):
        """Compile a constructor, function or method, including its body.

        Raises:
            ValueError: if the body is not closed by ``}`` after its local
                declarations and statements.
        """
        self._open('subroutineDec')
        self._eat(tokenizer, 4)  # kind, return type, name, '('
        self.compile_parameter_list(tokenizer)
        self._open('subroutineBody')
        self._eat(tokenizer)  # '{'
        while tokenizer.token == 'var':
            self.compile_var_dec(tokenizer)
        # Always emitted, so an empty body yields an empty statements
        # element just like empty if/while blocks do.
        self.compile_statements(tokenizer)
        if tokenizer.token != '}':
            raise ValueError("expected '}}' but found {!r}"
                             .format(tokenizer.token))
        self._eat(tokenizer)  # '}'
        self._close('subroutineBody')
        self._close('subroutineDec')

    def compile_parameter_list(self, tokenizer):
        """Compile a possibly empty parameter list and the closing ``)``.

        The ``)`` is written *after* the parameterList element is closed.
        """
        self._open('parameterList')
        if tokenizer.token != ')':
            self._eat(tokenizer, 2)  # type name
            while tokenizer.token == ',':
                # Consume the name too; stopping after the type broke every
                # list with an even number of parameters.
                self._eat(tokenizer, 3)  # ',' type name
        self._close('parameterList')
        self._eat(tokenizer)  # ')'

    def compile_var_dec(self, tokenizer):
        """Compile ``var type name (, name)* ;``."""
        self._open('varDec')
        self._eat(tokenizer, 3)  # 'var', type, first name
        while tokenizer.token == ',':
            self._eat(tokenizer, 2)  # ',' name
        self._eat(tokenizer)  # ';'
        self._close('varDec')

    # -- statements --------------------------------------------------------

    def compile_statements(self, tokenizer):
        """Compile consecutive statements until a non-statement token."""
        handlers = {
            'let': self.compile_let,
            'if': self.compile_if,
            'while': self.compile_while,
            'do': self.compile_do,
            'return': self.compile_return,
        }
        self._open('statements')
        while tokenizer.token in handlers:
            handlers[tokenizer.token](tokenizer)
        self._close('statements')

    def compile_do(self, tokenizer):
        """Compile ``do name(...) ;`` or ``do name.name(...) ;``."""
        self._open('doStatement')
        self._eat(tokenizer, 2)  # 'do', subroutine/class/variable name
        if tokenizer.token in ('(', '.'):
            self._compile_call_tail(tokenizer)
        self._eat(tokenizer)  # ';'
        self._close('doStatement')

    def compile_let(self, tokenizer):
        """Compile ``let name ([expr])? = expr ;``."""
        self._open('letStatement')
        self._eat(tokenizer, 2)  # 'let', variable name
        if tokenizer.token == '[':
            self._eat(tokenizer)  # '['
            self.compile_expression(tokenizer)
            self._eat(tokenizer)  # ']'
        self._eat(tokenizer)  # '='
        self.compile_expression(tokenizer)
        self._eat(tokenizer)  # ';'
        self._close('letStatement')

    def compile_while(self, tokenizer):
        """Compile ``while (expr) { statements }``."""
        self._open('whileStatement')
        self._eat(tokenizer, 2)  # 'while' '('
        self.compile_expression(tokenizer)
        self._eat(tokenizer, 2)  # ')' '{'
        self.compile_statements(tokenizer)
        self._eat(tokenizer)  # '}'
        self._close('whileStatement')

    def compile_return(self, tokenizer):
        """Compile ``return expr? ;``."""
        self._open('returnStatement')
        self._eat(tokenizer)  # 'return'
        if tokenizer.token != ';':
            self.compile_expression(tokenizer)
        self._eat(tokenizer)  # ';'
        self._close('returnStatement')

    def compile_if(self, tokenizer):
        """Compile ``if (expr) { statements }`` with optional ``else`` block."""
        self._open('ifStatement')
        self._eat(tokenizer, 2)  # 'if' '('
        self.compile_expression(tokenizer)
        self._eat(tokenizer, 2)  # ')' '{'
        self.compile_statements(tokenizer)
        self._eat(tokenizer)  # '}'
        # A loop rather than a single check: repeated else-blocks are
        # accepted exactly as before.
        while tokenizer.token == 'else':
            self._eat(tokenizer, 2)  # 'else' '{'
            self.compile_statements(tokenizer)
            self._eat(tokenizer)  # '}'
        self._close('ifStatement')

    # -- expressions -------------------------------------------------------

    def compile_expression(self, tokenizer):
        """Compile ``term (op term)*``.

        Writes nothing when the current token cannot start a term (for
        example ``)``); ``compile_expression_list`` relies on that to
        produce an empty list.
        """
        starts_term = (tokenizer.token in ('(', '~', '-')
                       or tokenizer.token_type() != 'symbol')
        if not starts_term:
            return
        self._open('expression')
        self.compile_term(tokenizer)
        while tokenizer.token in self._OPERATORS:
            self._eat(tokenizer)  # binary operator
            self.compile_term(tokenizer)
        self._close('expression')

    def compile_term(self, tokenizer):
        """Compile one term.

        Handles a parenthesised expression, a unary ``~``/``-`` applied to a
        term, or a constant/name optionally followed by an array index or a
        subroutine call.  Writes nothing if the current token is any other
        symbol.
        """
        if tokenizer.token == '(':
            self._open('term')
            self._eat(tokenizer)  # '('
            self.compile_expression(tokenizer)
            self._eat(tokenizer)  # ')'
            self._close('term')
        elif tokenizer.token in ('~', '-'):
            self._open('term')
            self._eat(tokenizer)  # unary operator
            self.compile_term(tokenizer)
            self._close('term')
        elif tokenizer.token_type() != 'symbol':
            self._open('term')
            self._eat(tokenizer)  # constant, keyword constant or name
            if tokenizer.token == '[':
                self._eat(tokenizer)  # '['
                self.compile_expression(tokenizer)
                self._eat(tokenizer)  # ']'
            elif tokenizer.token in ('(', '.'):
                self._compile_call_tail(tokenizer)
            self._close('term')

    def _compile_call_tail(self, tokenizer):
        """Compile the part of a call after its first name: ``(.name)? ( args )``."""
        if tokenizer.token == '.':
            self._eat(tokenizer, 2)  # '.' subroutine name
        self._eat(tokenizer)  # '('
        self.compile_expression_list(tokenizer)
        self._eat(tokenizer)  # ')'

    def compile_expression_list(self, tokenizer):
        """Compile a possibly empty, comma-separated list of expressions."""
        self._open('expressionList')
        self.compile_expression(tokenizer)
        while tokenizer.token == ',':
            self._eat(tokenizer)  # ','
            self.compile_expression(tokenizer)
        self._close('expressionList')


# Analyze every Jack source file found under the current working directory
# and write the resulting XML beside it.
path = os.getcwd()
for root, dirs, files in os.walk(path, topdown=False):
    for name in files:
        # Require the real extension; a bare "jack" suffix also matched names
        # such as "lumberjack".
        if not name.endswith('.jack'):
            continue
        tok = Tokenizer()
        tok.clear_file(Path(root, name))
        ce = CompilationEngine()
        ce.compile_class(tok)
        with open(Path(root, name[:-4] + 'xml'), "w") as my_xml:
            my_xml.write(ce.string)