lt-core/src/lexer.rs

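/// Tokens produced by [`LTIRLexer`].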
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LTIRToken {
    Symbol(String),
    Char(char),
    Num(i64),
    // SingleQuote(String),
    // DoubleQuote(String),
    // TripleQuote(String),
    Lambda,
    LambdaBody,
    AssignType(String),
    AssignValue,
    ExprOpen,
    ExprClose,
    BlockOpen,
    BlockClose,
    StatementSep,
}
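/// Errors that can occur during lexing.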
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LexError {
    InvalidDigit,
    InvalidChar,
}
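/// Intermediate lexer state while a multi-character token is being accumulated.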
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LexerState {
    Any,
    TypeTerm(String),
    Sym(String),
    Num(i64),
    Char(Option<char>),
}
impl LexerState {
    /// Turn an accumulated state into its finished token, if any.
    fn into_token(self) -> Option<LTIRToken> {
        match self {
            LexerState::Any => None,
            LexerState::TypeTerm(s) => Some(LTIRToken::AssignType(s)),
            LexerState::Sym(s) => Some(LTIRToken::Symbol(s)),
            LexerState::Num(n) => Some(LTIRToken::Num(n)),
            LexerState::Char(c) => Some(LTIRToken::Char(c?)),
        }
    }
}
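/// Lexer over a peekable character stream, yielding `LTIRToken`s through the
/// `Iterator` implementation below.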
pub struct LTIRLexer<It>
where
    It: std::iter::Iterator<Item = char>,
{
    chars: std::iter::Peekable<It>,
}
impl<It> LTIRLexer<It>
where
    It: Iterator<Item = char>,
{
    /// Consume the lexer and return the underlying character iterator.
    pub fn into_inner(self) -> std::iter::Peekable<It> {
        self.chars
    }
}
impl<It> From<It> for LTIRLexer<It>
where
    It: Iterator<Item = char>,
{
    fn from(chars: It) -> Self {
        LTIRLexer {
            chars: chars.peekable(),
        }
    }
}
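// `next()` is a small state machine: single-character tokens are emitted
// directly from `LexerState::Any`, while symbols, numbers, character literals
// and type terms are accumulated in the other states until a terminating
// character finishes them.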
impl<It> Iterator for LTIRLexer<It>
where
    It: Iterator<Item = char>,
{
    type Item = Result<LTIRToken, LexError>;

    fn next(&mut self) -> Option<Self::Item> {
        let mut state = LexerState::Any;

        while let Some(c) = self.chars.peek() {
            match &mut state {
                // determine token type
                LexerState::Any => match c {
                    'λ' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::Lambda));
                    }
                    '.' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::LambdaBody));
                    }
                    '(' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::ExprOpen));
                    }
                    ')' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::ExprClose));
                    }
                    '{' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::BlockOpen));
                    }
                    '}' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::BlockClose));
                    }
                    ':' => {
                        self.chars.next();
                        state = LexerState::TypeTerm(String::new());
                    }
                    '=' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::AssignValue));
                    }
                    ';' => {
                        self.chars.next();
                        return Some(Ok(LTIRToken::StatementSep));
                    }
                    '\'' => {
                        self.chars.next();
                        state = LexerState::Char(None);
                    }
                    c => {
                        if c.is_whitespace() {
                            self.chars.next();
                        } else if c.is_digit(10) {
                            state = LexerState::Num(0);
                        } else {
                            state = LexerState::Sym(String::new());
                        }
                    }
                },

                LexerState::Char(val) => {
                    // read the character literal, resolving escape sequences
                    *val = Some(match self.chars.next() {
                        Some('\\') => match self.chars.next() {
                            Some('0') => '\0',
                            Some('n') => '\n',
                            Some('t') => '\t',
                            Some(c) => c,
                            None => {
                                return Some(Err(LexError::InvalidChar));
                            }
                        },
                        Some(c) => c,
                        None => {
                            return Some(Err(LexError::InvalidChar));
                        }
                    });

                    // a closing single quote must follow
                    match self.chars.next() {
                        Some('\'') => {
                            if let Some(token) = state.clone().into_token() {
                                return Some(Ok(token));
                            }
                        }
                        _ => {
                            return Some(Err(LexError::InvalidChar));
                        }
                    }
                }

                LexerState::TypeTerm(s) => {
                    if *c == '=' || *c == '.' {
                        if let Some(token) = state.clone().into_token() {
                            return Some(Ok(token));
                        }
                    } else if let Some(c) = self.chars.next() {
                        s.push(c);
                    }
                }

                _ => {
                    if c.is_whitespace()
                        || *c == '('
                        || *c == ')'
                        || *c == '{'
                        || *c == '}'
                        || *c == ';'
                        || *c == '='
                        || *c == ':'
                        || *c == '.'
                    {
                        // finish the current token
                        if let Some(token) = state.clone().into_token() {
                            return Some(Ok(token));
                        }
                    } else {
                        // append to the current token
                        let c = self.chars.next().unwrap();
                        match &mut state {
                            LexerState::Sym(s) => {
                                s.push(c);
                            }
                            LexerState::Num(n) => {
                                if let Some(d) = c.to_digit(10) {
                                    *n = (*n) * 10 + d as i64;
                                } else {
                                    return Some(Err(LexError::InvalidDigit));
                                }
                            }
                            _ => {}
                        }
                    }
                }
            }
        }

        // end of input: finish whatever token was still in progress
        state.into_token().map(Ok)
    }
}
#[cfg(test)]
mod tests {
    #[test]
    fn test_lexer() {
        let lexer = crate::lexer::LTIRLexer::from(
            "let var1:=123;
let square =λx.* x x;
let sqrt = λx:~machine::Float64~machine::Word.(f64-sqrt x);
let magnitude =
λx:
.λy:
.sqrt (+ (* x x) (* y y));
"
            .chars(),
        );

        for token in lexer {
            eprintln!("token = {:?}", token);
        }
    }
}