lt-core/lib-ltcore/src/lexer.rs

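//! Lexer for LTIR source text.
//!
//! [`LTIRLexer`] wraps a peekable `char` iterator and is itself an
//! `Iterator`, yielding `(InputRegionTag, Result<LTIRToken, LexError>)`
//! pairs. The [`InputRegionTag`] records the character offsets (begin/end)
//! that each token was read from.
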
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LTIRToken {
    Comment(String),
    Symbol(String),
    Char(char),
    Num(i64),

    // SingleQuote(String),
    // DoubleQuote(String),
    // TripleQuote(String),

    Lambda,
    MapsTo,
    AssignType(String),
    AssignValue,

    ExprOpen,
    ExprClose,
    BlockOpen,
    BlockClose,
    StatementSep,
}

#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LexError {
    InvalidDigit,
    InvalidChar,
}

#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LexerState {
    Any,
    Comment(String),
    TypeTerm(String),
    Sym(String),
    Num(i64),
    Char(Option<char>),
}

impl LexerState {
    /// Convert a finished lexer state into the token it produced (if any).
    fn into_token(self) -> Option<LTIRToken> {
        match self {
            LexerState::Any => None,
            LexerState::Comment(s) => Some(LTIRToken::Comment(s)),
            LexerState::TypeTerm(s) => Some(LTIRToken::AssignType(s)),
            LexerState::Sym(s) => Some(LTIRToken::Symbol(s)),
            LexerState::Num(n) => Some(LTIRToken::Num(n)),
            LexerState::Char(c) => Some(LTIRToken::Char(c?)),
        }
    }
}

pub struct LTIRLexer<It>
where
    It: std::iter::Iterator<Item = char>,
{
    chars: std::iter::Peekable<It>,
    position: usize,
}

impl<It> LTIRLexer<It>
where
    It: Iterator<Item = char>,
{
    pub fn into_inner(self) -> std::iter::Peekable<It> {
        self.chars
    }
}

impl<It> From<It> for LTIRLexer<It>
where
    It: Iterator<Item = char>,
{
    fn from(chars: It) -> Self {
        LTIRLexer {
            chars: chars.peekable(),
            position: 0,
        }
    }
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct InputRegionTag {
    pub begin: usize,
    pub end: usize,
}

impl Default for InputRegionTag {
    fn default() -> Self {
        InputRegionTag { begin: 0, end: 0 }
    }
}

impl InputRegionTag {
    /// Smallest region covering both `a` and `b`.
    pub fn max(a: InputRegionTag, b: InputRegionTag) -> InputRegionTag {
        InputRegionTag {
            begin: usize::min(a.begin, b.begin),
            end: usize::max(a.end, b.end),
        }
    }
}
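
// The lexer is a small state machine: each call to `next()` below starts in
// `LexerState::Any`, skips leading whitespace, and either returns a
// single-character token (λ, ↦, parentheses, braces, '=', ';') immediately,
// or switches into an accumulating state (Comment, TypeTerm, Sym, Num, Char)
// that keeps consuming characters until a delimiter or closing sequence
// completes the token.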
impl<It> Iterator for LTIRLexer<It>
where
    It: Iterator<Item = char>,
{
    type Item = (InputRegionTag, Result<LTIRToken, LexError>);

    fn next(&mut self) -> Option<Self::Item> {
        let mut state = LexerState::Any;
        let mut region = InputRegionTag {
            begin: self.position,
            end: self.position,
        };

        while let Some(c) = self.chars.peek() {
            match &mut state {
                // determine token type
                LexerState::Any => match c {
                    'λ' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::Lambda)));
                    }
                    '↦' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::MapsTo)));
                    }
                    '(' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::ExprOpen)));
                    }
                    ')' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::ExprClose)));
                    }
                    '{' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::BlockOpen)));
                    }
                    '}' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::BlockClose)));
                    }
                    ':' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        state = LexerState::TypeTerm(String::new());
                    }
                    '=' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::AssignValue)));
                    }
                    ';' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::StatementSep)));
                    }
                    '\'' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        state = LexerState::Char(None);
                    }
                    '/' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        match self.chars.next() {
                            Some('*') => {
                                self.position += 1;
                                region.end += 1;
                                state = LexerState::Comment(String::new());
                            }
                            _ => {
                                return Some((region, Err(LexError::InvalidChar)));
                            }
                        }
                    }
                    c => {
                        if c.is_whitespace() {
                            self.chars.next();
                            self.position += 1;
                            region.begin += 1;
                            region.end += 1;
                        } else if c.is_digit(10) {
                            state = LexerState::Num(0);
                        } else {
                            state = LexerState::Sym(String::new());
                        }
                    }
                },

                // block comment: accumulate characters until the closing `*/`
                LexerState::Comment(s) => {
                    match self.chars.next() {
                        Some('*') => {
                            match self.chars.peek() {
                                Some('/') => {
                                    self.chars.next();
                                    self.position += 2;
                                    region.end += 2;
                                    if let Some(token) = state.clone().into_token() {
                                        return Some((region, Ok(token)));
                                    }
                                }
                                _ => {
                                    s.push('*');
                                    self.position += 1;
                                    region.end += 1;
                                }
                            }
                        }
                        Some(c) => {
                            s.push(c);
                            self.position += 1;
                            region.end += 1;
                        }
                        None => {
                            return Some((region, Err(LexError::InvalidChar)));
                        }
                    }
                }

                // character literal: one (possibly escaped) char, then a closing quote
                LexerState::Char(val) => {
                    self.position += 2;
                    region.end += 2;

                    *val = Some(match self.chars.next() {
                        Some('\\') => {
                            self.position += 1;
                            region.end += 1;
                            match self.chars.next() {
                                Some('0') => '\0',
                                Some('n') => '\n',
                                Some('t') => '\t',
                                Some(c) => c,
                                None => {
                                    return Some((region, Err(LexError::InvalidChar)));
                                }
                            }
                        }
                        Some(c) => c,
                        None => {
                            return Some((region, Err(LexError::InvalidChar)));
                        }
                    });

                    match self.chars.next() {
                        Some('\'') => {
                            if let Some(token) = state.clone().into_token() {
                                return Some((region, Ok(token)));
                            }
                        }
                        _ => {
                            return Some((region, Err(LexError::InvalidChar)));
                        }
                    }
                }

                // type annotation: accumulate until '=', '↦' or ';'
                LexerState::TypeTerm(s) => {
                    if *c == '=' || *c == '↦' || *c == ';' {
                        let token = state.clone().into_token().unwrap();
                        return Some((region, Ok(token)));
                    } else if let Some(c) = self.chars.next() {
                        self.position += 1;
                        region.end += 1;
                        s.push(c);
                    }
                }

                // symbol or number: append characters until a delimiter
                _ => {
                    if c.is_whitespace()
                        || *c == '('
                        || *c == ')'
                        || *c == '{'
                        || *c == '}'
                        || *c == ';'
                        || *c == '='
                        || *c == ':'
                        || *c == '↦'
                    {
                        // finish the current token
                        if let Some(token) = state.clone().into_token() {
                            return Some((region, Ok(token)));
                        }
                    } else {
                        // append to the current token
                        let c = self.chars.next().unwrap();
                        self.position += 1;
                        region.end += 1;

                        match &mut state {
                            LexerState::Sym(s) => {
                                s.push(c);
                            }
                            LexerState::Num(n) => {
                                if let Some(d) = c.to_digit(10) {
                                    *n = (*n) * 10 + d as i64;
                                } else {
                                    return Some((region, Err(LexError::InvalidDigit)));
                                }
                            }
                            _ => {}
                        }
                    }
                }
            }
        }

        if let Some(token) = state.into_token() {
            Some((region, Ok(token)))
        } else {
            None
        }
    }
}

#[cfg(test)]
mod tests {
    #[test]
    fn test_lexer() {
        let lexer = crate::lexer::LTIRLexer::from(
            "let var1:=123;
            /* comment */
            let square =λx.* x x;
            let sqrt = λx:~machine::Float64~machine::Word.(f64-sqrt x);
            let magnitude =
                λx:
                .λy:
                .sqrt (+ (* x x) (* y y));
            "
            .chars(),
        );

        for (range, token) in lexer {
            eprintln!("[{:?}] {:?}", range, token);
        }
    }
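
    /// Not part of the original smoke test above: a minimal additional
    /// sketch asserting the exact token sequence for a tiny input
    /// (region tags are ignored here, since leading whitespace shifts them).
    #[test]
    fn test_lexer_token_sequence() {
        use crate::lexer::{LexError, LTIRLexer, LTIRToken};

        // Collect only the tokens, dropping the input regions.
        let tokens: Vec<Result<LTIRToken, LexError>> = LTIRLexer::from("x = 123;".chars())
            .map(|(_region, token)| token)
            .collect();

        let expected: Vec<Result<LTIRToken, LexError>> = vec![
            Ok(LTIRToken::Symbol("x".to_string())),
            Ok(LTIRToken::AssignValue),
            Ok(LTIRToken::Num(123)),
            Ok(LTIRToken::StatementSep),
        ];

        assert_eq!(tokens, expected);
    }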
}