Assignment 3 Solutions: Tokenization and Parsing
The Tokenizer
module SmLispTokenize where
import Data.Char
data Token = Comment [Char] |
NumToken Int |
AlphaNumToken [Char] |
SpecialToken [Char] |
Lparen | Rparen | Lbrak | Rbrak | LBrace | RBrace |
Equal | Semicolon | Arrow | Quote | Colon |
BadToken Char deriving Show
tokenize [] = []
tokenize ('\n': ';' : ';' : ';' : more_chars) =
let (rest_of_line, more_lines) = span (/='\n') more_chars
in (Comment rest_of_line): tokenize more_lines
tokenize (' ':more) = tokenize more -- skip white space
tokenize ('\n':more) = tokenize more -- skip white space
tokenize ('(':more) = Lparen: (tokenize more)
tokenize (')':more) = Rparen: (tokenize more)
tokenize ('[':more) = Lbrak: (tokenize more)
tokenize (']':more) = Rbrak: (tokenize more)
tokenize ('{':more) = LBrace: (tokenize more)
tokenize ('}':more) = RBrace: (tokenize more)
tokenize (';':more) = Semicolon: (tokenize more)
tokenize ('"':more) = Quote: (tokenize more)
tokenize t@(c:more)
| isAlpha c = getAlphaNum [c] (span isAlphaNum more)
| isDigit c = getNumToken (digitToInt c) more
| otherwise = getSpecial (span (\c -> elem c "+-*/<>=&|!@#$%?:") t)
getSpecial ("", (c : more)) = BadToken c : (tokenize more)
-- Note: the following tokens may also be used as symbolic atoms;
-- the parser must recognize this situation
getSpecial ("=", more) = Equal : (tokenize more)
getSpecial ("-->", more) = Arrow : (tokenize more)
getSpecial (":", more) = Colon : (tokenize more)
getSpecial ("-", c:more)
| isDigit c = let (NumToken n:tokens) = getNumToken (digitToInt c) more
in (NumToken (-n)): tokens
| otherwise = (SpecialToken "-") : (tokenize (c:more))
getSpecial (specials, more) = (SpecialToken specials) : (tokenize more)
getNumToken accum [] = [NumToken accum]
getNumToken accum (c:more)
| isDigit c = getNumToken (accum * 10 + (digitToInt c)) more
| otherwise = NumToken accum : (tokenize (c:more))
getAlphaNum pfx (alphanum, s@('-':c:more))
| isAlphaNum c = getAlphaNum (pfx ++ alphanum ++ ['-',c]) (span isAlphaNum more)
| otherwise = AlphaNumToken (pfx ++ alphanum) : (tokenize s)
getAlphaNum pfx (alphanum, more) = AlphaNumToken (pfx ++ alphanum) : (tokenize more)
The Parser
module SmLispParse where
import SmLispTokenize
import SExpression
import System.IO (stdout,stderr,hPutStr,hPutStrLn)
-- Parsing Small Lisp values: S-expressions
parseSExpression :: [Token] -> Maybe (SExpression, [Token])
parseSExpression (NumToken n: tokens) = Just (NumAtom n, tokens)
parseSExpression (AlphaNumToken s: tokens) = Just (SymAtom s, tokens)
parseSExpression (SpecialToken s: tokens) = Just (SymAtom s, tokens)
parseSExpression (Arrow: tokens) = Just (SymAtom "-->", tokens)
parseSExpression (Equal: tokens) = Just (SymAtom "=", tokens)
parseSExpression (Colon: tokens) = Just (SymAtom ":", tokens)
parseSExpression (Lparen: tokens) = parseSExpressionList [] tokens
parseSExpression _ = Nothing
parseSExpressionList exprs (Rparen : more) = Just (List exprs, more)
parseSExpressionList exprs tokens =
case parseSExpression tokens of
Just (e, more) -> parseSExpressionList (exprs ++ [e]) more
_ -> Nothing
-- Parsing Small Lisp Expressions
type Identifier = [Char]
data SmLispExpr = SExpr SExpression |
Variable Identifier |
FnCall Identifier [SmLispExpr] |
CondExpr [CondClause] |
LetExpr [LocalDef] SmLispExpr deriving Show
data CondClause = Clause SmLispExpr SmLispExpr deriving Show
data LocalDef = Binding Identifier SmLispExpr deriving Show
parseSmLispExpr :: [Token] -> Maybe (SmLispExpr, [Token])
parseSmLispExpr (Lbrak : tokens) = parseCondExpr [] tokens
parseSmLispExpr (LBrace : tokens) = parseLetExpr [] tokens
parseSmLispExpr (AlphaNumToken s: Lbrak : tokens) = parseFnCall s [] tokens
parseSmLispExpr (AlphaNumToken s: tokens) = Just (Variable s, tokens)
parseSmLispExpr (Quote : t : Quote : tokens) =
case parseSExpression [t] of
Just (SymAtom s, []) -> Just (SExpr (SymAtom s), tokens)
_ -> Nothing
parseSmLispExpr tokens =
case parseSExpression tokens of
Just (List s, more) -> Just (SExpr (List s), more)
Just (NumAtom n, more) -> Just (SExpr (NumAtom n), more)
_ -> Nothing
parseCondExpr clauses tokens =
case parseSmLispExpr tokens of
Just (e, Arrow: more) ->
case parseSmLispExpr more of
Just (rslt, Semicolon:yet_more) ->
parseCondExpr (clauses ++ [Clause e rslt]) yet_more
Just (rslt, Rbrak:yet_more) ->
Just (CondExpr (clauses ++ [Clause e rslt]), yet_more)
_ -> Nothing
_ -> Nothing
parseFnCall s args tokens =
case parseSmLispExpr tokens of
Just (e, (Semicolon:more)) -> parseFnCall s (args ++ [e]) more
Just (e, (Rbrak:more)) -> Just (FnCall s (args ++ [e]), more)
_ -> Nothing
parseLetExpr defs (AlphaNumToken s: Equal: tokens) =
case parseSmLispExpr tokens of
Just (e, Semicolon:more) ->
parseLetExpr (defs ++ [Binding s e]) more
Just (e, Colon:more) ->
case parseSmLispExpr more of
Just (e2, RBrace:yet_more) ->
Just (LetExpr (defs ++ [Binding s e]) e2, yet_more)
_ -> Nothing
_ -> Nothing
parseLetExpr defs tokens = Nothing
-- Parsing Small Lisp Programs (Lists of Definitions)
data Definition = ConstantDef Identifier SmLispExpr |
FunctionDef Identifier [Identifier] SmLispExpr deriving Show
parseSmLispProgram [] = Just []
parseSmLispProgram (Comment t: more) = parseSmLispProgram more
parseSmLispProgram (AlphaNumToken name: Equal : more) =
case parseSmLispExpr more of
Just (e, yet_more) ->
case parseSmLispProgram yet_more of
Just p -> Just ((ConstantDef name e): p)
_ -> Nothing
_ -> Nothing
parseSmLispProgram (AlphaNumToken name: Lbrak : AlphaNumToken arg : more) =
case parseArguments [arg] more of
Just (args, Equal: yet_more) ->
case parseSmLispExpr yet_more of
Just (e, further_more) ->
case parseSmLispProgram further_more of
Just p -> Just ((FunctionDef name args e): p)
_ -> Nothing
_ -> Nothing
_ -> Nothing
parseSmLispProgram _ = Nothing
parseArguments args (Rbrak : more) = Just (args, more)
parseArguments args (Semicolon: AlphaNumToken a: more) =
parseArguments (args ++ [a]) more
parseArguments _ _ = Nothing
parseExpressionText srctxt =
case parseSmLispExpr (tokenize srctxt) of
Just (e, []) -> Just e
_ -> Nothing
parseProgramText srctxt = parseSmLispProgram (tokenize ('\n':srctxt))
Updated Mon March 26 2018, 18:03 by cameron.