From 695cbb24f1d27c347fe21bbb98e61d5028551ac8 Mon Sep 17 00:00:00 2001
From: Michael Sippel <micha@fragmental.art>
Date: Thu, 26 Oct 2023 20:25:56 +0200
Subject: [PATCH] basic parser

---
 src/ast.rs   |  68 ++++----
 src/main.rs  |   8 +-
 src/parse.rs | 429 ++++++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 392 insertions(+), 113 deletions(-)

diff --git a/src/ast.rs b/src/ast.rs
index 5a20bbc..87e7aae 100644
--- a/src/ast.rs
+++ b/src/ast.rs
@@ -1,30 +1,32 @@
+use std::boxed::Box;
+
 //<<<<>>>><<>><><<>><<<*>>><<>><><<>><<<<>>>>\\
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum Command {
     Simple {
-        assignments: Vec<(String, Word)>,
+        assignments: Vec<Assignment>,
         command_word: Word,
         redirections: Vec<Redirection>
     },
     Pipeline(Vec<Command>),
     Sequence(Vec<Command>),
-    ShortCircuitConjection(Vec<Command>),
+    ShortCircuitConjunction(Vec<Command>),
     ShortCircuitDisjunction(Vec<Command>),
-    Negation(Command),
+    Negation(Box<Command>),
     While {
-        condition: Command,
-        loop_body: Command
+        condition: Box<Command>,
+        loop_body: Box<Command>
     },
     For {
         varname: String,
         sequence: Word,
-        loop_body: Command
-    }
+        loop_body: Box<Command>
+    },
     If {
-        condition: Command,
-        then_branch: Command,
-        else_branch: Command
+        condition: Box<Command>,
+        then_branch: Box<Command>,
+        else_branch: Box<Command>
     },
     Case {
         expr: Word,
@@ -32,35 +34,25 @@ pub enum Command {
     },
     Function {
         name: String,
-        body: Command
+        body: Box<Command>
     }
 }
 
-/*
- * We are all luminous beings.
- * Why then, do we not appear before each
- * other radiant in our illumination ?
- */
-
-/*
- * Bewteen the idea
- * And the reality
- * Between the motion
- * And the act
- * Falls the Shadow
- * (T.S. Eliot)
- */
-
 //<<<<>>>><<>><><<>><<<*>>><<>><><<>><<<<>>>>\\
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
+pub struct Assignment {
+    pub name: String,
+    pub value: Word
+}
+
+#[derive(Debug, PartialEq)]
 pub struct Word {
     pub segments: Vec<WordSegment>
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum WordSegment {
-    FieldSeparator,
     Tilde(String),
     Literal(String),
     Parameter(String, ParameterFormat),
@@ -68,7 +60,7 @@ pub enum WordSegment {
     DoubleQuote(Word),
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum ParameterFormat {
     Normal,
     Length,
@@ -79,42 +71,42 @@ pub enum ParameterFormat {
     Sub(ParamSubSide, ParamSubMode, Word),
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum ParamSubMode {
     Shortest, Longest
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum ParamSubSide {
     Prefix, Suffix
 }
 
 //<<<<>>>><<>><><<>><<<*>>><<>><><<>><<<<>>>>\\
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub struct Redirection {
     redirection_type: RedirectionType,
     fd: u64,
     target: Word
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum RedirectionType {
     File(FileRedirectionType),
     Dup(DupRedirectionType),
     Heredoc // '<<'
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum FileRedirectionType {
     In,         // '<'
     InOut,      // '<>'
     Out,        // '>'
     OutReplace, // '>|'
-    OutAppend,  // '>|'
+    OutAppend,  // '>>'
 }
 
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum DupRedirectionType {
     In,  // '<&'
     Out  // '>&'
diff --git a/src/main.rs b/src/main.rs
index d6025bd..3752690 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,5 @@
+#![feature(iterator_try_collect)]
+
 use {
     laddertypes::*,
     std::io::BufRead,
@@ -38,10 +40,14 @@ fn main() {
     let stdin = std::io::stdin();
     for line in std::io::BufReader::new(stdin).lines() {
         if let Ok(line) = line {
+            let cmd = parse::parse_cmd( &mut line.chars().peekable() );
+            eprintln!("parsed cmd: {:?}", cmd);
+            /*
             let mut lex = parse::WordLexer::from( line.chars() );
             for word in lex {
                 eprintln!("word-segment: {:?}", word);
-            }
+        }
+            */
         }
     }
 
diff --git a/src/parse.rs b/src/parse.rs
index 9a54df4..af691f9 100644
--- a/src/parse.rs
+++ b/src/parse.rs
@@ -1,105 +1,368 @@
 use {
     crate::ast::*,
-    std::iter::{Peekable, FromIterator},
+    std::iter::{Peekable},
 };
 
-pub struct WordLexer<It>
-where It: Iterator<Item = char> {
-    chars: Peekable<It>
+
+#[derive(Debug, PartialEq)]
+pub enum LexError {
+    UnexpectedEnd(Vec<Option<char>>),
+    UnexpectedToken(char),
+    InvalidFileRedirectionType
 }
 
-impl<It> From<It> for WordLexer<It>
+
+/// Iterates chars until it finds some char in `delim`.
+pub struct DelimIter<'a, It>
 where It: Iterator<Item = char> {
-    fn from(iter: It) -> Self {
-        WordLexer {
-            chars: iter.into_iter().peekable()
-        }
+    chars: &'a mut Peekable<It>,
+    delim: Vec<(Option<char>, bool)>
+}
+
+impl<'a, It> DelimIter<'a, It>
+where It: Iterator<Item = char> {
+    fn new(chars: &'a mut Peekable<It>, delim: Vec<(Option<char>, bool)>) -> Self {
+        DelimIter { chars, delim }
+    }
+
+    fn new_whitespace(chars: &'a mut Peekable<It>) -> Self {
+        DelimIter::new(chars, vec![
+            (None, true),
+            (Some(' '), true),
+            (Some('\t'), true),
+            (Some('\n'), true)
+        ])
+    }
+
+    fn new_shell_word(chars: &'a mut Peekable<It>) -> Self {
+        DelimIter::new(chars, vec![
+            (None, true),
+            (Some(' '), true),
+            (Some('\t'), true),
+            (Some('\n'), true),
+            (Some('|'), false),
+            (Some('&'), false),
+            (Some(';'), false),
+            (Some('\"'), false),
+            (Some('\''), false)
+        ])
     }
 }
 
-#[derive(Debug)]
-pub enum LexError {
-    UnexpectedEnd(char)
-}
+impl<'a, It> Iterator for DelimIter<'a, It>
+where It: 'a + Iterator<Item = char> {
+    type Item = Result<char, LexError>;
 
-impl<It> WordLexer<It>
-where It: Iterator<Item = char> {
-    fn collect_until(&mut self, close: char) -> Result<String, LexError> {
-        let mut val = String::new();
-        while let Some(c) = self.chars.peek().cloned() {
-            if c == close {
-                return Ok(val)
-            } else {
-                self.chars.next();
-                val.push(c);
+    fn next(&mut self) -> Option<Result<char, LexError>> {
+        for (delim, consume) in self.delim.iter() {
+            if self.chars.peek().cloned() == *delim {
+                if *consume {
+                    self.chars.next();
+                }
+                return None;
             }
         }
 
-        if close.is_whitespace() {
-            Ok(val)
-        } else {
-            Err(LexError::UnexpectedEnd(close))
+        match self.chars.next() {
+            Some(c) => Some(Ok(c)),
+            None => Some(Err(LexError::UnexpectedEnd(self.delim.iter().map(|(c, _)| *c).collect())))
         }
     }
 }
 
-impl<It> Iterator for WordLexer<It>
+
+pub struct WordLexer<'a, It>
+where It: 'a + Iterator<Item = char> {
+    chars: &'a mut Peekable<It>
+}
+
+impl<'a, It> WordLexer<'a, It>
 where It: Iterator<Item = char> {
+    fn collect_until(&mut self, close: Option<char>) -> Result<String, LexError> {
+        DelimIter::new(&mut self.chars, vec![(close, true)])
+            .try_collect::<String>()
+    }
+}
+
+pub fn skip_whitespace<It>(chars: &mut Peekable<It>)
+where It: Iterator<Item = char>
+{
+    while let Some(c) = chars.peek() {
+        if c.is_whitespace() {
+            chars.next();
+        } else {
+            break;
+        }
+    }
+}
+
+pub fn parse_quoted<It>(chars: &mut Peekable<It>) -> Result<WordSegment, LexError>
+where It: Iterator<Item = char>
+{
+    assert_eq!( chars.next(), Some('\''));
+    let quoted = DelimIter::new(chars, vec![(Some('\''), true)]).try_collect::<String>();
+    match quoted {
+        Ok(s) => {
+            Ok(WordSegment::Literal(s))
+        },
+        Err(e) => Err(e)
+    }
+}
+
+pub fn parse_doublequoted<It>(chars: &mut Peekable<It>) -> Result<WordSegment, LexError>
+where It: Iterator<Item = char>
+{
+    assert_eq!( chars.next(), Some('\"'));
+    let quoted = DelimIter::new(chars, vec![(Some('\"'), true)]).try_collect::<String>();
+    match quoted {
+        Ok(s) => {
+            let word = Word {
+                segments: // fixme: handle spaces correctly -> create QuoteLexer
+                WordLexer { chars: &mut s.chars().peekable() }
+                .scan((), |_, x| x.ok())
+                    .collect::<Vec<_>>()
+            };
+
+            Ok(WordSegment::DoubleQuote(word))
+        },
+        Err(e) => Err(e)
+    }    
+}
+
+pub fn parse_word<It>(chars: &mut Peekable<It>) -> Result<Word, LexError>
+where It: Iterator<Item = char>
+{
+    Ok(Word {
+        segments: WordLexer{ chars }.try_collect::<Vec<_>>()?
+    })
+}
+
+pub fn parse_assignment<It>(chars: &mut Peekable<It>) -> Result<Assignment, LexError>
+where It: Iterator<Item = char>
+{
+    let name = DelimIter::new(chars, vec![(Some('='), true)]).try_collect::<String>()?;
+    let value_str = DelimIter::new_whitespace(chars).try_collect::<String>()?;
+    let value = parse_word(&mut value_str.chars().peekable())?;
+    Ok(Assignment{ name, value })
+}
+
+impl std::str::FromStr for FileRedirectionType {
+    type Err = LexError;
+
+    fn from_str(s: &str) -> Result<FileRedirectionType, LexError> {
+        match s {
+            "<" => Ok(FileRedirectionType::In),
+            "<>" => Ok(FileRedirectionType::InOut),
+            ">" => Ok(FileRedirectionType::Out),
+            ">|" => Ok(FileRedirectionType::OutReplace),
+            ">>" => Ok(FileRedirectionType::OutAppend),
+            _ => Err(LexError::InvalidFileRedirectionType)
+        }
+    }
+}
+
+pub fn parse_redirection<It>(chars: &mut Peekable<It>) -> Result<Redirection, LexError>
+where It: Iterator<Item = char>
+{
+    Err(LexError::InvalidFileRedirectionType)
+    //    let name = DelimIter::new(chars, vec!['<', '>']).collect::<String>();
+}
+
+pub fn parse_simple_cmd<It>(chars: &mut Peekable<It>) -> Result<Option<Command>, LexError>
+where It: Iterator<Item = char>
+{
+    let mut assignments = Vec::new();
+    let mut redirections = Vec::new();
+
+    if chars.peek() == None {
+        return Ok(None);
+    }
+
+    let mut first = DelimIter::new_shell_word(chars).try_collect::<String>()?;
+
+    while first.contains('=') {
+        assignments.push( parse_assignment(&mut first.chars().peekable())? );
+        first = DelimIter::new_shell_word(chars).try_collect::<String>()?;
+    }
+
+    let mut cmd_segments = WordLexer{ chars }.try_collect::<Vec<_>>()?;
+    cmd_segments.insert(0, WordSegment::Literal(first));
+
+    Ok(Some(Command::Simple {
+        assignments,
+        command_word: Word { segments: cmd_segments },
+        redirections,
+    }))
+}
+
+pub fn parse_cmd<It>(chars: &mut Peekable<It>) -> Result<Option<Command>, LexError>
+where It: Iterator<Item = char>
+{
+    skip_whitespace(chars);
+    match chars.peek() {
+        Some('!') => {
+            chars.next();
+            if let Some(cmd) = parse_cmd(chars)? {
+                Ok(Some(Command::Negation(Box::new(cmd))))
+            } else {
+                Err(LexError::UnexpectedEnd(vec![]))
+            }
+        }
+        _ => {
+            if let Some(head) = parse_simple_cmd(chars)? {
+                skip_whitespace(chars);
+
+                match chars.peek() {
+                    Some(';') => {
+                        chars.next();
+
+                        let tail = parse_cmd( chars ) ?;
+                        match tail {
+                            Some(Command::Sequence(mut s)) => {
+                                s.insert(0, head);
+                                Ok(Some(Command::Sequence(s)))
+                            }
+                            Some(tail) => {
+                                Ok(Some(Command::Sequence(vec![ head, tail ])))
+                            }
+                            None => {
+                                Ok(Some(head))
+                            }
+                        }
+                    }
+                    Some('|') => {
+                        chars.next();
+                        match chars.peek() {
+                            Some('|') => {
+                                chars.next();
+
+                                let tail = parse_cmd( chars ) ?;
+                                match tail {
+                                    Some(Command::ShortCircuitDisjunction(mut s)) => {
+                                        s.insert(0, head);
+                                        Ok(Some(Command::ShortCircuitDisjunction(s)))
+                                    }
+                                    Some(tail) => {
+                                        Ok(Some(Command::ShortCircuitDisjunction(vec![ head, tail ])))
+                                    }
+                                    None => {
+                                        Err(LexError::UnexpectedEnd(vec![Some('|')]))
+                                    }
+                                }
+                            }
+                            _ => {
+                                let tail = parse_cmd( chars ) ?;
+                                match tail {
+                                    Some(Command::Pipeline(mut s)) => {
+                                        s.insert(0, head);
+                                        Ok(Some(Command::Pipeline(s)))
+                                    }
+                                    Some(tail) => {
+                                        Ok(Some(Command::Pipeline(vec![ head, tail ])))
+                                    }
+                                    None => {
+                                        Err(LexError::UnexpectedEnd(vec![]))
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    Some('&') => {
+                        chars.next();
+                        match chars.peek() {
+                            Some('&') => {
+                                chars.next();
+
+                                let tail = parse_cmd( chars ) ?;
+                                match tail {
+                                    Some(Command::ShortCircuitConjunction(mut s)) => {
+                                        s.insert(0, head);
+                                        Ok(Some(Command::ShortCircuitConjunction(s)))
+                                    }
+                                    Some(tail) => {
+                                        Ok(Some(Command::ShortCircuitConjunction(vec![ head, tail ])))
+                                    }
+                                    None => {
+                                        Err(LexError::UnexpectedEnd(vec![Some('&'), Some('&')]))
+                                    }
+                                }
+                            }
+                            Some(c) => {
+                                Err(LexError::UnexpectedToken(*c))
+                            }
+                            None => {
+                                // todo:
+                                // background job
+                                Ok(Some(head))
+                            }
+                        }
+                    }
+                    Some(c) => {
+                        Err(LexError::UnexpectedToken(*c))
+                    }
+                    None => {
+                        Ok(Some(head))
+                    }
+                }
+            } else {
+                Ok(None)
+            }
+        }
+    }
+}
+
+impl<'a, It> Iterator for WordLexer<'a, It>
+where It: 'a + Iterator<Item = char> {
     type Item = Result<WordSegment, LexError>;
 
     fn next(&mut self) -> Option<Result<WordSegment, LexError>> {
+        skip_whitespace(self.chars);
         match self.chars.peek().cloned() {
+            Some('|') => { None }
+            Some('&') => { None }
+            Some(';') => { None }
             Some('~') => {
                 self.chars.next();
-                match self.collect_until(' ') {
-                    Ok(s) => Some(Ok(WordSegment::Tilde(s))),
+                let user = DelimIter::new_whitespace(self.chars).collect();
+                match user {
+                    Ok(user) => Some(Ok(WordSegment::Tilde(user))),
                     Err(e) => Some(Err(e))
                 }
             }
-            Some('"') => {
-                self.chars.next();
-                match self.collect_until('"') {
-                    Ok(s) => {
-                        self.chars.next();
-
-                        let word = Word {
-                            segments: WordLexer { chars: s.chars().peekable() }
-                            .scan((), |_, x| x.ok())
-                                .collect::<Vec<_>>()
-                        };
-
-                        Some(Ok(WordSegment::DoubleQuote(word)))
-                    },
-                    Err(e) => Some(Err(e))
-                }
-            },
-            Some('\'') => {
-                self.chars.next();
-                match self.collect_until('\'') {
-                    Ok(s) => {
-                        self.chars.next();
-                        Some(Ok(WordSegment::Literal(s)))
-                    },
-                    Err(e) => Some(Err(e))
-                }
-            },
+            Some('"') => { Some(parse_doublequoted(self.chars)) },
+            Some('\'') => { Some(parse_quoted(self.chars)) },
             Some('$') => {
                 self.chars.next();
                 match self.chars.peek() {
                     Some('{') => {
                         self.chars.next();
-                        match self.collect_until('}') {
+                        match DelimIter::new(&mut self.chars, vec![(Some('}'), true)]).try_collect::<String>() {
                             Ok(s) => {
-                                self.chars.next();
-                                Some(Ok(WordSegment::Variable(s)))
+                                Some(Ok(WordSegment::Parameter(s, ParameterFormat::Normal)))
                             }
                             Err(e) => Some(Err(e))
                         }
                     }
+                    Some('(') => {
+                        self.chars.next();
+                        let subcmd_str = DelimIter::new(&mut self.chars, vec![(Some(')'), true)]).try_collect::<String>();
+                        match subcmd_str {
+                            Ok(subcmd_str) => {
+                                match parse_cmd(&mut subcmd_str.chars().peekable()) {
+                                    Ok(Some(subcmd)) => {
+                                        Some(Ok(WordSegment::Subshell(subcmd)))        
+                                    }
+                                    Ok(None) => None,
+                                    Err(err) => Some(Err(err))
+                                }
+                            }
+                            Err(err) => Some(Err(err))
+                        }
+                    }
                     _ => {
-                        match self.collect_until(' ') {
+                        match DelimIter::new_whitespace(self.chars).collect() {
                             Ok(s) => {
-                                Some(Ok(WordSegment::Variable(s)))
+                                Some(Ok(WordSegment::Parameter(s, ParameterFormat::Normal)))
                             }
                             Err(e) => Some(Err(e))
                         }
@@ -107,19 +370,11 @@ where It: Iterator<Item = char> {
                 }
             }
             Some(c) => {
-                while let Some(c) = self.chars.peek() {
-                    if c.is_whitespace() {
-                        self.chars.next();
-                    } else {
-                        return match self.collect_until(' ') {
-                            Ok(s) => {
-                                Some(Ok(WordSegment::Literal(s)))
-                            }
-                            Err(e) => Some(Err(e))
-                        };
-                    }
+                let s : Result<String, LexError> = DelimIter::new_shell_word(self.chars).collect();
+                match s {
+                    Ok(s) => Some(Ok(WordSegment::Literal(s))),
+                    Err(e) => Some(Err(e))
                 }
-                None
             }
             None => {
                 None
@@ -128,3 +383,29 @@ where It: Iterator<Item = char> {
     }
 }
 
+#[cfg(test)]
+mod test {
+    use crate::parse::*;
+
+    #[test]
+    fn test_delim_iter() {
+        let mut cs = "test 1234".chars().peekable();
+        let mut lexer = DelimIter::new_shell_word(&mut cs);
+        assert_eq!(lexer.try_collect::<String>(), Ok(String::from("test")));
+    }
+
+    #[test]
+    fn test_word_lexer() {
+        let mut cs = "test   1234|test".chars().peekable();
+
+        {
+            let mut lexer = WordLexer{ chars: &mut cs };
+            assert_eq!(lexer.next(), Some(Ok(WordSegment::Literal(String::from("test")))));
+            assert_eq!(lexer.next(), Some(Ok(WordSegment::Literal(String::from("1234")))));
+            assert_eq!(lexer.next(), None);
+        }
+
+        assert_eq!(cs.next(), Some('|'));
+    }
+}
+