lexer: add input region for each token

This commit is contained in:
Michael Sippel 2024-05-13 22:55:24 +02:00
parent c910265531
commit 34a129d101
Signed by: senvas
GPG key ID: F96CF119C34B64A6
2 changed files with 99 additions and 43 deletions

View file

@ -52,6 +52,7 @@ where
It: std::iter::Iterator<Item = char>,
{
chars: std::iter::Peekable<It>,
position: usize
}
impl<It> LTIRLexer<It>
@ -70,6 +71,23 @@ where
fn from(chars: It) -> Self {
LTIRLexer {
chars: chars.peekable(),
position: 0,
}
}
}
/// Half-open region `[begin, end)` of the lexer input that a token was
/// read from. Offsets are advanced once per `char` consumed (see
/// `LTIRLexer::position`), so they count characters, not bytes —
/// NOTE(review): confirm downstream consumers expect char offsets.
///
/// Two machine words of plain data, so it is cheap to `Copy`; `PartialEq`/
/// `Eq` allow regions to be compared in tests, and `Default` gives the
/// empty region `[0, 0)`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
pub struct InputRegionTag {
    begin: usize,
    end: usize
}

impl InputRegionTag {
    /// Returns the smallest region covering both `a` and `b`
    /// (minimum of the begins, maximum of the ends).
    ///
    /// NOTE(review): despite the name, this is a union/hull of two
    /// regions rather than a maximum; the name is kept because callers
    /// already depend on it.
    pub fn max( a: InputRegionTag, b: InputRegionTag ) -> InputRegionTag {
        InputRegionTag {
            begin: usize::min( a.begin, b.begin ),
            end: usize::max( a.end, b.end )
        }
    }
}
@ -78,10 +96,14 @@ impl<It> Iterator for LTIRLexer<It>
where
It: Iterator<Item = char>,
{
type Item = Result<LTIRToken, LexError>;
type Item = (InputRegionTag, Result<LTIRToken, LexError>);
fn next(&mut self) -> Option<Self::Item> {
let mut state = LexerState::Any;
let mut region = InputRegionTag{
begin: self.position,
end: self.position
};
while let Some(c) = self.chars.peek() {
match &mut state {
@ -89,47 +111,70 @@ where
LexerState::Any => match c {
'λ' => {
self.chars.next();
return Some(Ok(LTIRToken::Lambda));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::Lambda)));
}
'.' => {
'.' | '↦' => {
self.chars.next();
return Some(Ok(LTIRToken::LambdaBody));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::LambdaBody)));
}
'(' => {
self.chars.next();
return Some(Ok(LTIRToken::ExprOpen));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::ExprOpen)));
}
')' => {
self.chars.next();
return Some(Ok(LTIRToken::ExprClose));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::ExprClose)));
}
'{' => {
self.chars.next();
return Some(Ok(LTIRToken::BlockOpen));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::BlockOpen)));
}
'}' => {
self.chars.next();
return Some(Ok(LTIRToken::BlockClose));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::BlockClose)));
}
':' => {
self.chars.next();
self.position += 1;
region.end += 1;
state = LexerState::TypeTerm(String::new());
}
'=' => {
self.chars.next();
return Some(Ok(LTIRToken::AssignValue));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::AssignValue)));
}
';' => {
self.chars.next();
return Some(Ok(LTIRToken::StatementSep));
self.position += 1;
region.end += 1;
return Some((region, Ok(LTIRToken::StatementSep)));
}
'\'' => {
self.chars.next();
self.position += 1;
region.end += 1;
state = LexerState::Char(None);
}
c => {
if c.is_whitespace() {
self.chars.next();
self.position += 1;
region.begin += 1;
region.end += 1;
} else if c.is_digit(10) {
state = LexerState::Num(0);
} else {
@ -139,30 +184,36 @@ where
},
LexerState::Char(val) => {
self.position += 2;
region.end += 2;
*val = Some(match self.chars.next() {
Some('\\') => match self.chars.next() {
Some('0') => '\0',
Some('n') => '\n',
Some('t') => '\t',
Some(c) => c,
None => {
return Some(Err(LexError::InvalidChar));
Some('\\') => {
self.position += 1;
region.end += 1;
match self.chars.next() {
Some('0') => '\0',
Some('n') => '\n',
Some('t') => '\t',
Some(c) => c,
None => {
return Some((region, Err(LexError::InvalidChar)));
}
}
},
Some(c) => c,
None => {
return Some(Err(LexError::InvalidChar));
return Some((region, Err(LexError::InvalidChar)));
}
});
match self.chars.next() {
Some('\'') => {
if let Some(token) = state.clone().into_token() {
return Some(Ok(token));
return Some((region, Ok(token)));
}
}
_ => {
return Some(Err(LexError::InvalidChar));
return Some((region, Err(LexError::InvalidChar)));
}
}
}
@ -170,10 +221,12 @@ where
LexerState::TypeTerm(s) => {
if *c == '=' || *c == '.' {
if let Some(token) = state.clone().into_token() {
return Some(Ok(token));
return Some((region, Ok(token)));
}
} else {
if let Some(c) = self.chars.next() {
self.position += 1;
region.end += 1;
s.push(c);
}
}
@ -189,16 +242,19 @@ where
|| *c == '='
|| *c == ':'
|| *c == '.'
|| *c == '↦'
{
// finish the current token
if let Some(token) = state.clone().into_token() {
return Some(Ok(token));
return Some((region, Ok(token)));
}
} else {
// append to the current token
let c = self.chars.next().unwrap();
self.position += 1;
region.end += 1;
match &mut state {
LexerState::Sym(s) => {
@ -209,7 +265,7 @@ where
if let Some(d) = c.to_digit(10) {
*n = (*n) * 10 + d as i64;
} else {
return Some(Err(LexError::InvalidDigit));
return Some((region, Err(LexError::InvalidDigit)));
}
}
@ -221,7 +277,7 @@ where
}
if let Some(token) = state.into_token() {
Some(Ok(token))
Some((region, Ok(token)))
} else {
None
}
@ -244,8 +300,8 @@ mod tests {
.chars(),
);
for token in lexer {
eprintln!("token = {:?}", token);
for (range, token) in lexer {
eprintln!("[{:?}] {:?}", range, token);
}
}
}

View file

@ -25,14 +25,14 @@ where
It: Iterator<Item = char>,
{
match tokens.next() {
Some(Ok(t)) => {
Some((region, Ok(t))) => {
if t == expected_token {
Ok(())
} else {
Err(ParseError::UnexpectedToken)
}
}
Some(Err(err)) => Err(ParseError::LexError(err)),
Some((region, Err(err))) => Err(ParseError::LexError(err)),
None => Err(ParseError::UnexpectedEnd),
}
}
@ -42,9 +42,9 @@ where
It: Iterator<Item = char>,
{
match tokens.next() {
Some(Ok(LTIRToken::Symbol(name))) => Ok(name),
Some(Ok(_)) => Err(ParseError::UnexpectedToken),
Some(Err(err)) => Err(ParseError::LexError(err)),
Some((region, Ok(LTIRToken::Symbol(name)))) => Ok(name),
Some((region, Ok(_))) => Err(ParseError::UnexpectedToken),
Some((region, Err(err))) => Err(ParseError::LexError(err)),
None => Err(ParseError::UnexpectedEnd),
}
}
@ -56,7 +56,7 @@ pub fn parse_type_tag<It>(
where
It: Iterator<Item = char>,
{
if let Some(peektok) = tokens.peek().clone() {
if let Some((region, peektok)) = tokens.peek().clone() {
match peektok.clone() {
Ok(LTIRToken::AssignType(typeterm_str)) => {
tokens.next();
@ -79,7 +79,7 @@ pub fn parse_statement<It>(
where
It: Iterator<Item = char>,
{
if let Some(peektok) = tokens.peek() {
if let Some((region, peektok)) = tokens.peek() {
match peektok {
Ok(LTIRToken::Symbol(sym)) => {
match sym.as_str() {
@ -154,7 +154,7 @@ where
let _ = parse_expect(tokens, LTIRToken::BlockOpen)?;
let mut statements = Vec::new();
while let Some(peektok) = tokens.peek() {
while let Some((region, peektok)) = tokens.peek() {
match peektok {
Ok(LTIRToken::BlockClose) => {
tokens.next();
@ -179,11 +179,11 @@ where
It: Iterator<Item = char>,
{
match tokens.next() {
Some(Ok(LTIRToken::Symbol(sym))) => Ok(LTExpr::symbol(sym.as_str())),
Some(Ok(LTIRToken::Char(c))) => Ok(LTExpr::lit_uint(c as u64)),
Some(Ok(LTIRToken::Num(n))) => Ok(LTExpr::lit_uint(n as u64)),
Some(Ok(_)) => Err(ParseError::UnexpectedToken),
Some(Err(err)) => Err(ParseError::LexError(err)),
Some((region, Ok(LTIRToken::Symbol(sym)))) => Ok(LTExpr::symbol(sym.as_str())),
Some((region, Ok(LTIRToken::Char(c)))) => Ok(LTExpr::lit_uint(c as u64)),
Some((region, Ok(LTIRToken::Num(n)))) => Ok(LTExpr::lit_uint(n as u64)),
Some((region, Ok(_))) => Err(ParseError::UnexpectedToken),
Some((region, Err(err))) => Err(ParseError::LexError(err)),
None => Err(ParseError::UnexpectedEnd),
}
}
@ -197,14 +197,14 @@ where
{
let mut children = Vec::new();
while let Some(tok) = tokens.peek() {
while let Some((region, tok)) = tokens.peek() {
match tok {
Ok(LTIRToken::Lambda) => {
if children.len() == 0 {
tokens.next();
let mut args = Vec::new();
while let Some(Ok(LTIRToken::Symbol(_))) = tokens.peek() {
while let Some((region, Ok(LTIRToken::Symbol(_)))) = tokens.peek() {
args.push((parse_symbol(tokens)?, parse_type_tag(typectx, tokens)));
}
@ -221,7 +221,7 @@ where
}
Ok(LTIRToken::ExprOpen) => {
tokens.next();
while let Some(peektok) = tokens.peek() {
while let Some((region, peektok)) = tokens.peek() {
match peektok {
Ok(LTIRToken::ExprClose) => {
tokens.next();
@ -253,7 +253,7 @@ where
let if_expr = LTExpr::block(parse_block(typectx, tokens)?);
let mut else_expr = LTExpr::block(vec![]);
if let Some(peektok) = tokens.peek() {
if let Some((region, peektok)) = tokens.peek() {
if let Ok(LTIRToken::Symbol(name)) = peektok {
if name == "else" {
tokens.next();