lexer: add input region for each token

This commit is contained in:
Michael Sippel 2024-05-13 22:55:24 +02:00
parent c910265531
commit 34a129d101
Signed by: senvas
GPG key ID: F96CF119C34B64A6
2 changed files with 99 additions and 43 deletions

View file

@ -52,6 +52,7 @@ where
It: std::iter::Iterator<Item = char>, It: std::iter::Iterator<Item = char>,
{ {
chars: std::iter::Peekable<It>, chars: std::iter::Peekable<It>,
position: usize
} }
impl<It> LTIRLexer<It> impl<It> LTIRLexer<It>
@ -70,6 +71,23 @@ where
fn from(chars: It) -> Self { fn from(chars: It) -> Self {
LTIRLexer { LTIRLexer {
chars: chars.peekable(), chars: chars.peekable(),
position: 0,
}
}
}
/// Span of lexer input attached to each item the lexer yields.
///
/// The lexer creates a tag with `begin` and `end` both set to its current
/// `position`, then advances `end` as it consumes input for the token;
/// skipped leading whitespace advances `begin` and `end` together.
///
/// NOTE(review): `position` is bumped per `char` taken from the iterator, so
/// these look like character offsets rather than byte offsets — confirm
/// before using them to slice a `&str`.
#[derive(Clone, Debug)]
pub struct InputRegionTag {
// Offset at which the region starts.
begin: usize,
// Offset at which the region ends; presumably exclusive (one past the last
// consumed char), since a single-char token yields `end == begin + 1`.
end: usize
}
impl InputRegionTag {
/// Returns the smallest region that covers both `a` and `b`.
///
/// The result starts at the smaller of the two `begin` values and ends at
/// the larger of the two `end` values, so any gap between the two inputs is
/// included in the result. Both arguments are taken by value.
pub fn max( a: InputRegionTag, b: InputRegionTag ) -> InputRegionTag {
InputRegionTag {
// Earliest start of the two regions.
begin: usize::min( a.begin, b.begin ),
// Latest end of the two regions.
end: usize::max( a.end, b.end )
} }
} }
} }
@ -78,10 +96,14 @@ impl<It> Iterator for LTIRLexer<It>
where where
It: Iterator<Item = char>, It: Iterator<Item = char>,
{ {
type Item = Result<LTIRToken, LexError>; type Item = (InputRegionTag, Result<LTIRToken, LexError>);
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let mut state = LexerState::Any; let mut state = LexerState::Any;
let mut region = InputRegionTag{
begin: self.position,
end: self.position
};
while let Some(c) = self.chars.peek() { while let Some(c) = self.chars.peek() {
match &mut state { match &mut state {
@ -89,47 +111,70 @@ where
LexerState::Any => match c { LexerState::Any => match c {
'λ' => { 'λ' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::Lambda)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::Lambda)));
} }
'.' => { '.' | '↦' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::LambdaBody)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::LambdaBody)));
} }
'(' => { '(' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::ExprOpen)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::ExprOpen)));
} }
')' => { ')' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::ExprClose)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::ExprClose)));
} }
'{' => { '{' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::BlockOpen)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::BlockOpen)));
} }
'}' => { '}' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::BlockClose)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::BlockClose)));
} }
':' => { ':' => {
self.chars.next(); self.chars.next();
self.position += 1;
region.end += 1;
state = LexerState::TypeTerm(String::new()); state = LexerState::TypeTerm(String::new());
} }
'=' => { '=' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::AssignValue)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::AssignValue)));
} }
';' => { ';' => {
self.chars.next(); self.chars.next();
return Some(Ok(LTIRToken::StatementSep)); self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::StatementSep)));
} }
'\'' => { '\'' => {
self.chars.next(); self.chars.next();
self.position += 1;
region.end += 1;
state = LexerState::Char(None); state = LexerState::Char(None);
} }
c => { c => {
if c.is_whitespace() { if c.is_whitespace() {
self.chars.next(); self.chars.next();
self.position += 1;
region.begin += 1;
region.end += 1;
} else if c.is_digit(10) { } else if c.is_digit(10) {
state = LexerState::Num(0); state = LexerState::Num(0);
} else { } else {
@ -139,30 +184,36 @@ where
}, },
LexerState::Char(val) => { LexerState::Char(val) => {
self.position += 2;
region.end += 2;
*val = Some(match self.chars.next() { *val = Some(match self.chars.next() {
Some('\\') => match self.chars.next() { Some('\\') => {
Some('0') => '\0', self.position += 1;
Some('n') => '\n', region.end += 1;
Some('t') => '\t', match self.chars.next() {
Some(c) => c, Some('0') => '\0',
None => { Some('n') => '\n',
return Some(Err(LexError::InvalidChar)); Some('t') => '\t',
Some(c) => c,
None => {
return Some((region, Err(LexError::InvalidChar)));
}
} }
}, },
Some(c) => c, Some(c) => c,
None => { None => {
return Some(Err(LexError::InvalidChar)); return Some((region, Err(LexError::InvalidChar)));
} }
}); });
match self.chars.next() { match self.chars.next() {
Some('\'') => { Some('\'') => {
if let Some(token) = state.clone().into_token() { if let Some(token) = state.clone().into_token() {
return Some(Ok(token)); return Some((region, Ok(token)));
} }
} }
_ => { _ => {
return Some(Err(LexError::InvalidChar)); return Some((region, Err(LexError::InvalidChar)));
} }
} }
} }
@ -170,10 +221,12 @@ where
LexerState::TypeTerm(s) => { LexerState::TypeTerm(s) => {
if *c == '=' || *c == '.' { if *c == '=' || *c == '.' {
if let Some(token) = state.clone().into_token() { if let Some(token) = state.clone().into_token() {
return Some(Ok(token)); return Some((region, Ok(token)));
} }
} else { } else {
if let Some(c) = self.chars.next() { if let Some(c) = self.chars.next() {
self.position += 1;
region.end += 1;
s.push(c); s.push(c);
} }
} }
@ -189,16 +242,19 @@ where
|| *c == '=' || *c == '='
|| *c == ':' || *c == ':'
|| *c == '.' || *c == '.'
|| *c == '↦'
{ {
// finish the current token // finish the current token
if let Some(token) = state.clone().into_token() { if let Some(token) = state.clone().into_token() {
return Some(Ok(token)); return Some((region, Ok(token)));
} }
} else { } else {
// append to the current token // append to the current token
let c = self.chars.next().unwrap(); let c = self.chars.next().unwrap();
self.position += 1;
region.end += 1;
match &mut state { match &mut state {
LexerState::Sym(s) => { LexerState::Sym(s) => {
@ -209,7 +265,7 @@ where
if let Some(d) = c.to_digit(10) { if let Some(d) = c.to_digit(10) {
*n = (*n) * 10 + d as i64; *n = (*n) * 10 + d as i64;
} else { } else {
return Some(Err(LexError::InvalidDigit)); return Some((region, Err(LexError::InvalidDigit)));
} }
} }
@ -221,7 +277,7 @@ where
} }
if let Some(token) = state.into_token() { if let Some(token) = state.into_token() {
Some(Ok(token)) Some((region, Ok(token)))
} else { } else {
None None
} }
@ -244,8 +300,8 @@ mod tests {
.chars(), .chars(),
); );
for token in lexer { for (range, token) in lexer {
eprintln!("token = {:?}", token); eprintln!("[{:?}] {:?}", range, token);
} }
} }
} }

View file

@ -25,14 +25,14 @@ where
It: Iterator<Item = char>, It: Iterator<Item = char>,
{ {
match tokens.next() { match tokens.next() {
Some(Ok(t)) => { Some((region, Ok(t))) => {
if t == expected_token { if t == expected_token {
Ok(()) Ok(())
} else { } else {
Err(ParseError::UnexpectedToken) Err(ParseError::UnexpectedToken)
} }
} }
Some(Err(err)) => Err(ParseError::LexError(err)), Some((region, Err(err))) => Err(ParseError::LexError(err)),
None => Err(ParseError::UnexpectedEnd), None => Err(ParseError::UnexpectedEnd),
} }
} }
@ -42,9 +42,9 @@ where
It: Iterator<Item = char>, It: Iterator<Item = char>,
{ {
match tokens.next() { match tokens.next() {
Some(Ok(LTIRToken::Symbol(name))) => Ok(name), Some((region, Ok(LTIRToken::Symbol(name)))) => Ok(name),
Some(Ok(_)) => Err(ParseError::UnexpectedToken), Some((region, Ok(_))) => Err(ParseError::UnexpectedToken),
Some(Err(err)) => Err(ParseError::LexError(err)), Some((region, Err(err))) => Err(ParseError::LexError(err)),
None => Err(ParseError::UnexpectedEnd), None => Err(ParseError::UnexpectedEnd),
} }
} }
@ -56,7 +56,7 @@ pub fn parse_type_tag<It>(
where where
It: Iterator<Item = char>, It: Iterator<Item = char>,
{ {
if let Some(peektok) = tokens.peek().clone() { if let Some((region, peektok)) = tokens.peek().clone() {
match peektok.clone() { match peektok.clone() {
Ok(LTIRToken::AssignType(typeterm_str)) => { Ok(LTIRToken::AssignType(typeterm_str)) => {
tokens.next(); tokens.next();
@ -79,7 +79,7 @@ pub fn parse_statement<It>(
where where
It: Iterator<Item = char>, It: Iterator<Item = char>,
{ {
if let Some(peektok) = tokens.peek() { if let Some((region, peektok)) = tokens.peek() {
match peektok { match peektok {
Ok(LTIRToken::Symbol(sym)) => { Ok(LTIRToken::Symbol(sym)) => {
match sym.as_str() { match sym.as_str() {
@ -154,7 +154,7 @@ where
let _ = parse_expect(tokens, LTIRToken::BlockOpen)?; let _ = parse_expect(tokens, LTIRToken::BlockOpen)?;
let mut statements = Vec::new(); let mut statements = Vec::new();
while let Some(peektok) = tokens.peek() { while let Some((region, peektok)) = tokens.peek() {
match peektok { match peektok {
Ok(LTIRToken::BlockClose) => { Ok(LTIRToken::BlockClose) => {
tokens.next(); tokens.next();
@ -179,11 +179,11 @@ where
It: Iterator<Item = char>, It: Iterator<Item = char>,
{ {
match tokens.next() { match tokens.next() {
Some(Ok(LTIRToken::Symbol(sym))) => Ok(LTExpr::symbol(sym.as_str())), Some((region, Ok(LTIRToken::Symbol(sym)))) => Ok(LTExpr::symbol(sym.as_str())),
Some(Ok(LTIRToken::Char(c))) => Ok(LTExpr::lit_uint(c as u64)), Some((region, Ok(LTIRToken::Char(c)))) => Ok(LTExpr::lit_uint(c as u64)),
Some(Ok(LTIRToken::Num(n))) => Ok(LTExpr::lit_uint(n as u64)), Some((region, Ok(LTIRToken::Num(n)))) => Ok(LTExpr::lit_uint(n as u64)),
Some(Ok(_)) => Err(ParseError::UnexpectedToken), Some((region, Ok(_))) => Err(ParseError::UnexpectedToken),
Some(Err(err)) => Err(ParseError::LexError(err)), Some((region, Err(err))) => Err(ParseError::LexError(err)),
None => Err(ParseError::UnexpectedEnd), None => Err(ParseError::UnexpectedEnd),
} }
} }
@ -197,14 +197,14 @@ where
{ {
let mut children = Vec::new(); let mut children = Vec::new();
while let Some(tok) = tokens.peek() { while let Some((region, tok)) = tokens.peek() {
match tok { match tok {
Ok(LTIRToken::Lambda) => { Ok(LTIRToken::Lambda) => {
if children.len() == 0 { if children.len() == 0 {
tokens.next(); tokens.next();
let mut args = Vec::new(); let mut args = Vec::new();
while let Some(Ok(LTIRToken::Symbol(_))) = tokens.peek() { while let Some((region, Ok(LTIRToken::Symbol(_)))) = tokens.peek() {
args.push((parse_symbol(tokens)?, parse_type_tag(typectx, tokens))); args.push((parse_symbol(tokens)?, parse_type_tag(typectx, tokens)));
} }
@ -221,7 +221,7 @@ where
} }
Ok(LTIRToken::ExprOpen) => { Ok(LTIRToken::ExprOpen) => {
tokens.next(); tokens.next();
while let Some(peektok) = tokens.peek() { while let Some((region, peektok)) = tokens.peek() {
match peektok { match peektok {
Ok(LTIRToken::ExprClose) => { Ok(LTIRToken::ExprClose) => {
tokens.next(); tokens.next();
@ -253,7 +253,7 @@ where
let if_expr = LTExpr::block(parse_block(typectx, tokens)?); let if_expr = LTExpr::block(parse_block(typectx, tokens)?);
let mut else_expr = LTExpr::block(vec![]); let mut else_expr = LTExpr::block(vec![]);
if let Some(peektok) = tokens.peek() { if let Some((region, peektok)) = tokens.peek() {
if let Ok(LTIRToken::Symbol(name)) = peektok { if let Ok(LTIRToken::Symbol(name)) = peektok {
if name == "else" { if name == "else" {
tokens.next(); tokens.next();