lt-core/lib-ltcore/src/lexer.rs

365 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/// A single lexeme produced by [`LTIRLexer`].
///
/// The mapping from input characters to variants is defined by the
/// `Iterator` impl for `LTIRLexer` below.
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LTIRToken {
    /// Text between `/*` and `*/` (delimiters stripped).
    Comment(String),
    /// Any run of characters that is not a delimiter, digit-initial, or whitespace.
    Symbol(String),
    /// A character literal `'x'`, with `\0`, `\n`, `\t` escapes supported.
    Char(char),
    /// A non-negative decimal integer literal.
    Num(i64),
    // SingleQuote(String),
    // DoubleQuote(String),
    // TripleQuote(String),
    /// The `λ` character.
    Lambda,
    /// The `↦` character.
    MapsTo,
    /// `:` followed by the type expression text up to `=`, `↦` or `;`.
    AssignType(String),
    /// The `=` character.
    AssignValue,
    /// `(`
    ExprOpen,
    /// `)`
    ExprClose,
    /// `{`
    BlockOpen,
    /// `}`
    BlockClose,
    /// `;`
    StatementSep,
}
/// Errors the lexer can report in place of a token.
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LexError {
    /// A non-digit character appeared while scanning a number.
    InvalidDigit,
    /// Malformed input: unterminated comment or char literal,
    /// or `/` not followed by `*`.
    InvalidChar,
}
/// Internal scanning state: which kind of token is currently being
/// accumulated, together with the partial value built so far.
#[derive(PartialEq, Eq, Clone, Debug)]
pub enum LexerState {
    /// No token started yet.
    Any,
    /// Inside a `/* ... */` comment; the text collected so far.
    Comment(String),
    /// After a `:`; the type expression text collected so far.
    TypeTerm(String),
    /// Accumulating a symbol.
    Sym(String),
    /// Accumulating a decimal number (value built digit by digit).
    Num(i64),
    /// Inside a char literal; `None` until the character has been read.
    Char(Option<char>),
}
impl LexerState {
    /// Finish the current state, converting the accumulated value into a
    /// token. Yields `None` for `Any` (nothing was scanned) and for a
    /// `Char` state whose character was never filled in.
    fn into_token(self) -> Option<LTIRToken> {
        Some(match self {
            LexerState::Any => return None,
            LexerState::Comment(text) => LTIRToken::Comment(text),
            LexerState::TypeTerm(text) => LTIRToken::AssignType(text),
            LexerState::Sym(name) => LTIRToken::Symbol(name),
            LexerState::Num(value) => LTIRToken::Num(value),
            // `?` propagates the `None` of an unfilled char literal.
            LexerState::Char(ch) => LTIRToken::Char(ch?),
        })
    }
}
/// A lexer over an arbitrary `char` iterator.
pub struct LTIRLexer<It>
where
    It: std::iter::Iterator<Item = char>,
{
    /// The input stream; one char of lookahead is needed to decide
    /// where multi-char tokens end.
    chars: std::iter::Peekable<It>,
    /// Number of chars consumed so far (a char index, not a byte offset —
    /// it is incremented once per consumed char).
    position: usize
}
impl<It> LTIRLexer<It>
where
It: Iterator<Item = char>,
{
pub fn into_inner(self) -> std::iter::Peekable<It> {
self.chars
}
}
impl<It> From<It> for LTIRLexer<It>
where
It: Iterator<Item = char>,
{
fn from(chars: It) -> Self {
LTIRLexer {
chars: chars.peekable(),
position: 0,
}
}
}
/// A half-open region `begin..end` of the input, measured in chars.
/// (For a single-char token at position `p` the lexer emits
/// `begin = p, end = p + 1`.)
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct InputRegionTag {
    /// Index of the first char of the region (inclusive).
    pub begin: usize,
    /// Index one past the last char of the region (exclusive).
    pub end: usize
}
impl Default for InputRegionTag {
fn default() -> Self {
InputRegionTag {
begin: 0,
end: 0
}
}
}
impl InputRegionTag {
    /// Merge two regions into the smallest region covering both:
    /// the earlier `begin` paired with the later `end`.
    pub fn max(a: InputRegionTag, b: InputRegionTag) -> InputRegionTag {
        let begin = a.begin.min(b.begin);
        let end = a.end.max(b.end);
        InputRegionTag { begin, end }
    }
}
impl<It> Iterator for LTIRLexer<It>
where
    It: Iterator<Item = char>,
{
    /// Each lexeme (or error) is paired with the char-index region it
    /// was read from.
    type Item = (InputRegionTag, Result<LTIRToken, LexError>);

    /// Scan and return the next token, or `None` at end of input.
    ///
    /// Runs a small state machine: `state` starts at `Any` (no token
    /// begun) and the first significant character either emits a
    /// single-char token immediately or promotes `state` to
    /// `Comment` / `TypeTerm` / `Char` / `Num` / `Sym`, which then
    /// accumulates until its terminator. `region.end` and
    /// `self.position` are advanced once per consumed char.
    fn next(&mut self) -> Option<Self::Item> {
        let mut state = LexerState::Any;
        let mut region = InputRegionTag{
            begin: self.position,
            end: self.position
        };

        while let Some(c) = self.chars.peek() {
            match &mut state {
                // determine token type
                LexerState::Any => match c {
                    // Single-char tokens: consume, advance, emit immediately.
                    'λ' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::Lambda)));
                    }
                    '↦' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::MapsTo)));
                    }
                    '(' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::ExprOpen)));
                    }
                    ')' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::ExprClose)));
                    }
                    '{' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::BlockOpen)));
                    }
                    '}' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::BlockClose)));
                    }
                    // `:` starts a type-term token; its text is gathered below.
                    ':' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        state = LexerState::TypeTerm(String::new());
                    }
                    '=' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::AssignValue)));
                    }
                    ';' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        return Some((region, Ok(LTIRToken::StatementSep)));
                    }
                    // `'` starts a char literal.
                    '\'' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        state = LexerState::Char(None);
                    }
                    // `/` must be followed by `*` (comment start);
                    // anything else is an error.
                    '/' => {
                        self.chars.next();
                        self.position += 1;
                        region.end += 1;
                        match self.chars.next() {
                            Some('*') => {
                                self.position += 1;
                                region.end += 1;
                                state = LexerState::Comment(String::new());
                            }
                            _ => {
                                // NOTE(review): the char consumed after `/`
                                // is not counted into position/region here,
                                // so subsequent regions may lag by one.
                                return Some((region, Err(LexError::InvalidChar)));
                            }
                        }
                    }
                    c => {
                        if c.is_whitespace() {
                            // Skip leading whitespace; begin advances too,
                            // so blanks are not included in the region.
                            self.chars.next();
                            self.position += 1;
                            region.begin += 1;
                            region.end += 1;
                        } else if c.is_digit(10) {
                            // Digit starts a number (char is consumed by
                            // the Num continuation arm below).
                            state = LexerState::Num(0);
                        } else {
                            // Everything else starts a symbol.
                            state = LexerState::Sym(String::new());
                        }
                    }
                },
                // Inside /* ... */: collect text until the closing `*/`.
                LexerState::Comment(s) => {
                    match self.chars.next() {
                        Some('*') => {
                            match self.chars.peek() {
                                Some('/') => {
                                    // `*/` found: consume `/` and emit the
                                    // collected comment text.
                                    self.chars.next();
                                    self.position += 2;
                                    region.end += 2;
                                    if let Some(token) = state.clone().into_token() {
                                        return Some((region, Ok(token)));
                                    }
                                }
                                _ => {
                                    // Lone `*` is part of the comment text.
                                    s.push('*');
                                    self.position += 1;
                                    region.end += 1;
                                }
                            }
                        }
                        Some(c) => {
                            s.push(c);
                            self.position += 1;
                            region.end += 1;
                        }
                        None => {
                            // Input ended inside a comment.
                            return Some((region, Err(LexError::InvalidChar)));
                        }
                    }
                }
                // Char literal: read one (possibly escaped) char, then
                // require the closing quote.
                LexerState::Char(val) => {
                    // NOTE(review): position/region advance by 2 up front
                    // (the char plus the expected closing quote) before
                    // either has been read; on malformed input the reported
                    // region overshoots what was actually consumed.
                    self.position += 2;
                    region.end += 2;
                    *val = Some(match self.chars.next() {
                        Some('\\') => {
                            self.position += 1;
                            region.end += 1;
                            match self.chars.next() {
                                Some('0') => '\0',
                                Some('n') => '\n',
                                Some('t') => '\t',
                                // Unknown escapes pass the char through.
                                Some(c) => c,
                                None => {
                                    return Some((region, Err(LexError::InvalidChar)));
                                }
                            }
                        },
                        Some(c) => c,
                        None => {
                            return Some((region, Err(LexError::InvalidChar)));
                        }
                    });
                    match self.chars.next() {
                        Some('\'') => {
                            if let Some(token) = state.clone().into_token() {
                                return Some((region, Ok(token)));
                            }
                        }
                        _ => {
                            // Missing closing quote.
                            return Some((region, Err(LexError::InvalidChar)));
                        }
                    }
                }
                // Type term: accumulate every char (whitespace included)
                // until `=`, `↦` or `;`; the terminator is left in the
                // stream for the next call.
                LexerState::TypeTerm(s) => {
                    if *c == '=' || *c == '↦' || *c == ';' {
                        let token = state.clone().into_token().unwrap();
                        return Some((region, Ok(token)));
                    } else {
                        if let Some(c) = self.chars.next() {
                            self.position += 1;
                            region.end += 1;
                            s.push(c);
                        }
                    }
                }
                // Sym / Num continuation.
                _ => {
                    if c.is_whitespace()
                        || *c == '('
                        || *c == ')'
                        || *c == '{'
                        || *c == '}'
                        || *c == ';'
                        || *c == '='
                        || *c == ':'
                        || *c == '↦'
                    {
                        // finish the current token
                        // (the delimiter stays in the stream).
                        if let Some(token) = state.clone().into_token() {
                            return Some((region, Ok(token)));
                        }
                    } else {
                        // append to the current token
                        let c = self.chars.next().unwrap();
                        self.position += 1;
                        region.end += 1;
                        match &mut state {
                            LexerState::Sym(s) => {
                                s.push(c);
                            }
                            LexerState::Num(n) => {
                                if let Some(d) = c.to_digit(10) {
                                    // Build the decimal value digit by digit.
                                    *n = (*n) * 10 + d as i64;
                                } else {
                                    return Some((region, Err(LexError::InvalidDigit)));
                                }
                            }
                            _ => {}
                        }
                    }
                }
            }
        }

        // End of input: flush whatever token was in progress.
        if let Some(token) = state.into_token() {
            Some((region, Ok(token)))
        } else {
            None
        }
    }
}
// Fix: the test module was missing `#[cfg(test)]`, so it was compiled into
// non-test builds; also `lexer` does not need `mut` — it is moved into the
// `for` loop, never mutated by name.
#[cfg(test)]
mod tests {
    /// Smoke test: lex a small sample program and print every
    /// `(region, token)` pair to stderr. Only checks that lexing
    /// terminates without panicking; output is inspected manually.
    #[test]
    fn test_lexer() {
        let lexer = crate::lexer::LTIRLexer::from(
            "let var1:=123;
/* comment */
let square =λx.* x x;
let sqrt = λx:~machine::Float64~machine::Word.(f64-sqrt x);
let magnitude =
λx:
.λy:
.sqrt (+ (* x x) (* y y));
"
            .chars(),
        );
        for (range, token) in lexer {
            eprintln!("[{:?}] {:?}", range, token);
        }
    }
}