From 34a129d1018f884fce9ed0f736fff81f8b600f69 Mon Sep 17 00:00:00 2001 From: Michael Sippel Date: Mon, 13 May 2024 22:55:24 +0200 Subject: [PATCH] lexer: add input region for each token --- src/lexer.rs | 108 ++++++++++++++++++++++++++++++++++++++------------ src/parser.rs | 34 ++++++++-------- 2 files changed, 99 insertions(+), 43 deletions(-) diff --git a/src/lexer.rs b/src/lexer.rs index 41a00f6..fe0677d 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -52,6 +52,7 @@ where It: std::iter::Iterator, { chars: std::iter::Peekable, + position: usize } impl LTIRLexer @@ -70,6 +71,23 @@ where fn from(chars: It) -> Self { LTIRLexer { chars: chars.peekable(), + position: 0, + } + } +} + + +#[derive(Clone, Debug)] +pub struct InputRegionTag { + begin: usize, + end: usize +} + +impl InputRegionTag { + pub fn max( a: InputRegionTag, b: InputRegionTag ) -> InputRegionTag { + InputRegionTag { + begin: usize::min( a.begin, b.begin ), + end: usize::max( a.end, b.end ) } } } @@ -78,10 +96,14 @@ impl Iterator for LTIRLexer where It: Iterator, { - type Item = Result; + type Item = (InputRegionTag, Result); fn next(&mut self) -> Option { let mut state = LexerState::Any; + let mut region = InputRegionTag{ + begin: self.position, + end: self.position + }; while let Some(c) = self.chars.peek() { match &mut state { @@ -89,47 +111,70 @@ where LexerState::Any => match c { 'λ' => { self.chars.next(); - return Some(Ok(LTIRToken::Lambda)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::Lambda))); } - '.' => { + '.' | '↦' => { self.chars.next(); - return Some(Ok(LTIRToken::LambdaBody)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::LambdaBody))); } '(' => { self.chars.next(); - return Some(Ok(LTIRToken::ExprOpen)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::ExprOpen))); } ')' => { self.chars.next(); - return Some(Ok(LTIRToken::ExprClose)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::ExprClose))); } '{' => { self.chars.next(); - return Some(Ok(LTIRToken::BlockOpen)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::BlockOpen))); } '}' => { self.chars.next(); - return Some(Ok(LTIRToken::BlockClose)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::BlockClose))); } ':' => { self.chars.next(); + self.position += 1; + region.end += 1; state = LexerState::TypeTerm(String::new()); } '=' => { self.chars.next(); - return Some(Ok(LTIRToken::AssignValue)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::AssignValue))); } ';' => { self.chars.next(); - return Some(Ok(LTIRToken::StatementSep)); + self.position += 1; + region.end += 1; + return Some((region, Ok(LTIRToken::StatementSep))); } '\'' => { self.chars.next(); + self.position += 1; + region.end += 1; state = LexerState::Char(None); } c => { if c.is_whitespace() { self.chars.next(); + self.position += 1; + region.begin += 1; + region.end += 1; } else if c.is_digit(10) { state = LexerState::Num(0); } else { @@ -139,30 +184,36 @@ where }, LexerState::Char(val) => { + self.position += 2; + region.end += 2; *val = Some(match self.chars.next() { - Some('\\') => match self.chars.next() { - Some('0') => '\0', - Some('n') => '\n', - Some('t') => '\t', - Some(c) => c, - None => { - return Some(Err(LexError::InvalidChar)); + Some('\\') => { + self.position += 1; + region.end += 1; + match self.chars.next() { + Some('0') => '\0', + Some('n') => '\n', + Some('t') => '\t', + Some(c) => c, + None => { + return Some((region, Err(LexError::InvalidChar))); + } } }, Some(c) => c, None => { - return Some(Err(LexError::InvalidChar)); + return Some((region, Err(LexError::InvalidChar))); } }); match self.chars.next() { Some('\'') => { if let Some(token) = state.clone().into_token() { - return Some(Ok(token)); + return Some((region, Ok(token))); } } _ => { - return Some(Err(LexError::InvalidChar)); + return Some((region, Err(LexError::InvalidChar))); } } } @@ -170,10 +221,12 @@ where LexerState::TypeTerm(s) => { if *c == '=' || *c == '.' { if let Some(token) = state.clone().into_token() { - return Some(Ok(token)); + return Some((region, Ok(token))); } } else { if let Some(c) = self.chars.next() { + self.position += 1; + region.end += 1; s.push(c); } } @@ -189,16 +242,19 @@ where || *c == '=' || *c == ':' || *c == '.' + || *c == '↦' { // finish the current token if let Some(token) = state.clone().into_token() { - return Some(Ok(token)); + return Some((region, Ok(token))); } } else { // append to the current token let c = self.chars.next().unwrap(); + self.position += 1; + region.end += 1; match &mut state { LexerState::Sym(s) => { @@ -209,7 +265,7 @@ where if let Some(d) = c.to_digit(10) { *n = (*n) * 10 + d as i64; } else { - return Some(Err(LexError::InvalidDigit)); + return Some((region, Err(LexError::InvalidDigit))); } } @@ -221,7 +277,7 @@ where } if let Some(token) = state.into_token() { - Some(Ok(token)) + Some((region, Ok(token))) } else { None } @@ -244,8 +300,8 @@ mod tests { .chars(), ); - for token in lexer { - eprintln!("token = {:?}", token); + for (range, token) in lexer { + eprintln!("[{:?}] {:?}", range, token); } } } diff --git a/src/parser.rs b/src/parser.rs index eb0daab..57ebf31 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -25,14 +25,14 @@ where It: Iterator, { match tokens.next() { - Some(Ok(t)) => { + Some((region, Ok(t))) => { if t == expected_token { Ok(()) } else { Err(ParseError::UnexpectedToken) } } - Some(Err(err)) => Err(ParseError::LexError(err)), + Some((region, Err(err))) => Err(ParseError::LexError(err)), None => Err(ParseError::UnexpectedEnd), } } @@ -42,9 +42,9 @@ where It: Iterator, { match tokens.next() { - Some(Ok(LTIRToken::Symbol(name))) => Ok(name), - Some(Ok(_)) => Err(ParseError::UnexpectedToken), - Some(Err(err)) => Err(ParseError::LexError(err)), + Some((region, Ok(LTIRToken::Symbol(name)))) => Ok(name), + Some((region, Ok(_))) => Err(ParseError::UnexpectedToken), + Some((region, Err(err))) => Err(ParseError::LexError(err)), None => Err(ParseError::UnexpectedEnd), } } @@ -56,7 +56,7 @@ pub fn parse_type_tag( where It: Iterator, { - if let Some(peektok) = tokens.peek().clone() { + if let Some((region, peektok)) = tokens.peek().clone() { match peektok.clone() { Ok(LTIRToken::AssignType(typeterm_str)) => { tokens.next(); @@ -79,7 +79,7 @@ pub fn parse_statement( where It: Iterator, { - if let Some(peektok) = tokens.peek() { + if let Some((region, peektok)) = tokens.peek() { match peektok { Ok(LTIRToken::Symbol(sym)) => { match sym.as_str() { @@ -154,7 +154,7 @@ where let _ = parse_expect(tokens, LTIRToken::BlockOpen)?; let mut statements = Vec::new(); - while let Some(peektok) = tokens.peek() { + while let Some((region, peektok)) = tokens.peek() { match peektok { Ok(LTIRToken::BlockClose) => { tokens.next(); @@ -179,11 +179,11 @@ where It: Iterator, { match tokens.next() { - Some(Ok(LTIRToken::Symbol(sym))) => Ok(LTExpr::symbol(sym.as_str())), - Some(Ok(LTIRToken::Char(c))) => Ok(LTExpr::lit_uint(c as u64)), - Some(Ok(LTIRToken::Num(n))) => Ok(LTExpr::lit_uint(n as u64)), - Some(Ok(_)) => Err(ParseError::UnexpectedToken), - Some(Err(err)) => Err(ParseError::LexError(err)), + Some((region, Ok(LTIRToken::Symbol(sym)))) => Ok(LTExpr::symbol(sym.as_str())), + Some((region, Ok(LTIRToken::Char(c)))) => Ok(LTExpr::lit_uint(c as u64)), + Some((region, Ok(LTIRToken::Num(n)))) => Ok(LTExpr::lit_uint(n as u64)), + Some((region, Ok(_))) => Err(ParseError::UnexpectedToken), + Some((region, Err(err))) => Err(ParseError::LexError(err)), None => Err(ParseError::UnexpectedEnd), } } @@ -197,14 +197,14 @@ where { let mut children = Vec::new(); - while let Some(tok) = tokens.peek() { + while let Some((region, tok)) = tokens.peek() { match tok { Ok(LTIRToken::Lambda) => { if children.len() == 0 { tokens.next(); let mut args = Vec::new(); - while let Some(Ok(LTIRToken::Symbol(_))) = tokens.peek() { + while let Some((region, Ok(LTIRToken::Symbol(_)))) = tokens.peek() { args.push((parse_symbol(tokens)?, parse_type_tag(typectx, tokens))); } @@ -221,7 +221,7 @@ where } Ok(LTIRToken::ExprOpen) => { tokens.next(); - while let Some(peektok) = tokens.peek() { + while let Some((region, peektok)) = tokens.peek() { match peektok { Ok(LTIRToken::ExprClose) => { tokens.next(); @@ -253,7 +253,7 @@ where let if_expr = LTExpr::block(parse_block(typectx, tokens)?); let mut else_expr = LTExpr::block(vec![]); - if let Some(peektok) = tokens.peek() { + if let Some((region, peektok)) = tokens.peek() { if let Ok(LTIRToken::Symbol(name)) = peektok { if name == "else" { tokens.next();