compose_syntax/lexer.rs

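//! A hand-written lexer that turns source text into `SyntaxKind` tokens with
//! attached `SyntaxNode`s, skipping whitespace, tracking whether a newline
//! preceded each token, and recording `SyntaxError`s for malformed input.
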
use crate::file::FileId;
use crate::kind::SyntaxKind;
use crate::node::{SyntaxError, SyntaxNode};
use crate::span::Span;
use ecow::EcoString;
use std::ops::Range;
use unscanny::Scanner;

#[derive(Debug, Clone)]
pub struct Lexer<'s> {
    /// Scanner over the source text, tracking the current cursor position.
    s: Scanner<'s>,
    /// Whether the whitespace preceding the current token contained a newline.
    newline: bool,
    /// An error encountered while lexing the current token, if any.
    error: Option<SyntaxError>,
    pub(crate) file_id: FileId,
}

impl<'s> Lexer<'s> {
    pub fn new(text: &'s str, file_id: FileId) -> Self {
        Self {
            s: Scanner::new(text),
            newline: false,
            error: None,
            file_id,
        }
    }

    /// The index in the string at which the last token ended and the next token will start.
    pub fn cursor(&self) -> usize {
        self.s.cursor()
    }

    /// Jump to the given index in the string.
    pub fn jump(&mut self, index: usize) {
        self.s.jump(index);
    }

    /// Whether the whitespace preceding the last token contained a newline.
    pub fn newline(&self) -> bool {
        self.newline
    }

    /// The number of characters between the given index and the most recent
    /// preceding newline, i.e. the column of that index.
    pub fn column(&self, index: usize) -> usize {
        let mut s = self.s; // Make a new temporary scanner (inexpensive).
        s.jump(index);
        s.before()
            .chars()
            .rev()
            .take_while(|&c| !is_newline(c))
            .count()
    }
}

/// Whether a character is interpreted as a newline.
#[inline]
pub fn is_newline(character: char) -> bool {
    matches!(
        character,
        // Line Feed, Vertical Tab, Form Feed, Carriage Return.
        '\n' | '\x0B' | '\x0C' | '\r' |
        // Next Line, Line Separator, Paragraph Separator.
        '\u{0085}' | '\u{2028}' | '\u{2029}'
    )
}

impl Lexer<'_> {
    /// Record an error spanning `range` and return `SyntaxKind::Error` so the
    /// caller can emit an error token.
    fn error(&mut self, message: impl Into<EcoString>, range: Range<usize>) -> SyntaxKind {
        self.error = Some(SyntaxError::new(message, Span::new(self.file_id, range)));
        SyntaxKind::Error
    }

    /// Attach a hint to the most recently recorded error, if there is one.
    fn hint(&mut self, message: impl Into<EcoString>) {
        if let Some(error) = &mut self.error {
            error.hints.push(message.into())
        }
    }
}

impl Lexer<'_> {
    /// Lex the next token, returning its kind and its syntax node.
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
        debug_assert!(self.error.is_none());

        self.newline = self.skip_whitespace(self.cursor());
        let start = self.cursor();

        let kind = match self.s.eat() {
            Some(c) => self.kind(start, c),
            None => SyntaxKind::End,
        };

        let text = self.s.from(start);
        let span = Span::new(self.file_id, start..self.s.cursor());
        let node = match self.error.take() {
            Some(error) => SyntaxNode::error(error, text),
            None => SyntaxNode::leaf(kind, text, span),
        };

        (kind, node)
    }

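    /// Lex the rest of a token given its first character `c`, which was
    /// already eaten starting at byte offset `start`.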
    fn kind(&mut self, start: usize, c: char) -> SyntaxKind {
        match c {
            '/' if self.s.eat_if('/') => {
                if self.s.eat_if('/') {
                    self.lex_doc_comment()
                } else {
                    self.lex_line_comment()
                }
            }
            '/' if self.s.eat_if('*') => self.lex_block_comment(start),

            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
            '!' if self.s.eat_if('=') => SyntaxKind::BangEq,
            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
            '-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::MinusEq,
            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,

            '.' if self.s.at('.') && self.s.scout(1) == Some('.') => {
                self.s.eat();
                self.s.eat();
                SyntaxKind::Ellipsis
            }
            '.' if self.s.at('.') && self.s.scout(1) == Some('=') => {
                self.s.eat();
                self.s.eat();
                SyntaxKind::DotsEq
            }
            '.' if self.s.eat_if('.') => SyntaxKind::Dots,

            '<' if self.s.eat_if('<') => SyntaxKind::LtLt,
            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
            '>' if self.s.eat_if('>') => SyntaxKind::GtGt,
            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,

            '|' if self.s.eat_if('|') => SyntaxKind::PipePipe,
            '|' if self.s.eat_if('=') => SyntaxKind::PipeEq,
            '&' if self.s.eat_if('&') => SyntaxKind::AmpAmp,
            '&' if self.s.eat_if('=') => SyntaxKind::AmpersandEq,

            '~' if self.s.eat_if('=') => SyntaxKind::TildeEq,
            '^' if self.s.eat_if('=') => SyntaxKind::HatEq,

            ':' if self.s.eat_if(':') => SyntaxKind::ColonColon,

            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,

            '{' => SyntaxKind::LeftBrace,
            '}' => SyntaxKind::RightBrace,
            '[' => SyntaxKind::LeftBracket,
            ']' => SyntaxKind::RightBracket,
            '(' => SyntaxKind::LeftParen,
            ')' => SyntaxKind::RightParen,

            '.' => SyntaxKind::Dot,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,
            ':' => SyntaxKind::Colon,
            '*' => SyntaxKind::Star,
            '+' => SyntaxKind::Plus,
            '-' => SyntaxKind::Minus,
            '/' => SyntaxKind::Slash,
            '%' => SyntaxKind::Percent,
            '$' => SyntaxKind::Dollar,
            '#' => SyntaxKind::Hash,
            '@' => SyntaxKind::At,
            '^' => SyntaxKind::Hat,
            '`' => SyntaxKind::Backtick,
            '\'' => SyntaxKind::Apostrophe,
            '!' => SyntaxKind::Bang,
            '~' => SyntaxKind::Tilde,
            '|' => SyntaxKind::Pipe,
            '&' => SyntaxKind::Amp,
            '>' => SyntaxKind::Gt,
            '<' => SyntaxKind::Lt,
            '=' => SyntaxKind::Eq,

            '"' => self.lex_string(start),
            '0'..='9' => self.lex_number(start),

            c if is_ident_start(c) => self.lex_ident(start),

            c => self.error(
                format!("unexpected character `{c}`"),
                self.range_from(start),
            ),
        }
    }

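    /// Lex an integer or floating point literal whose first digit was already
    /// eaten. A `.` only counts as a fraction when a digit or exponent follows,
    /// so `1.method()` lexes as an integer followed by a dot.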
    fn lex_number(&mut self, start: usize) -> SyntaxKind {
        // Read the integer part: consume digits until something else appears.
        self.s.eat_while(char::is_ascii_digit);

        // A `.` only makes the number fractional if a digit or an exponent follows.
        let is_fractional = {
            let dot = self.s.at('.');
            let number = matches!(self.s.scout(1), Some('0'..='9' | 'e' | 'E'));
            dot && number
        };

        if is_fractional {
            // Read the fractional part.
            if self.s.eat_if('.') {
                self.s.eat_while(char::is_ascii_digit);
            }

            // Read the exponent.
            if self.s.eat_if('e') || self.s.eat_if('E') {
                self.s.eat_if(['+', '-']);
                self.s.eat_while(char::is_ascii_digit);
            }
        }

        let number = self.s.from(start);

        if number.parse::<i64>().is_ok() {
            SyntaxKind::Int
        } else if number.parse::<f64>().is_ok() {
            SyntaxKind::Float
        } else {
            self.error(format!("invalid number `{number}`"), self.range_from(start))
        }
    }

    fn lex_ident(&mut self, start: usize) -> SyntaxKind {
        self.s.eat_while(is_ident_mid);
        let ident = self.s.from(start);

        // Do not treat the identifier as a keyword when it directly follows
        // a `.` or `..`.
        let prev = self.s.get(0..start);
        if !(prev.ends_with(['.']) || prev.ends_with("..")) {
            if let Some(keyword) = keyword(ident) {
                return keyword;
            }
        }

        if ident == "_" {
            SyntaxKind::Underscore
        } else {
            SyntaxKind::Ident
        }
    }

    /// Create a range from the given start index to the current cursor.
    fn range_from(&self, start: usize) -> Range<usize> {
        start..self.s.cursor()
    }

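    /// Lex a double-quoted string literal whose opening `"` was already eaten,
    /// honoring backslash-escaped quotes and erroring if the string is unclosed.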
    fn lex_string(&mut self, start: usize) -> SyntaxKind {
        let mut escaped = false;
        self.s.eat_until(|c| {
            let stop = c == '"' && !escaped;
            escaped = c == '\\' && !escaped;
            stop
        });

        if !self.s.eat_if('"') {
            return self.error("unclosed string", self.range_from(start));
        }

        SyntaxKind::Str
    }

    fn lex_line_comment(&mut self) -> SyntaxKind {
        self.s.eat_while(|c| !is_newline(c));
        SyntaxKind::Comment
    }

    fn lex_block_comment(&mut self, start: usize) -> SyntaxKind {
        while let Some(c) = self.s.eat() {
            if c == '*' && self.s.eat_if('/') {
                return SyntaxKind::Comment;
            }
        }
        self.error("unterminated block comment", self.range_from(start))
    }

    fn lex_doc_comment(&mut self) -> SyntaxKind {
        self.s.eat_while(|c| !is_newline(c));
        SyntaxKind::DocComment
    }

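    /// Skip whitespace before the next token and return whether the skipped
    /// text contained at least one newline.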
    fn skip_whitespace(&mut self, start: usize) -> bool {
        self.s.eat_while(is_space);

        // Count the newlines in the skipped whitespace.
        let mut newline_count = 0;
        let mut s = Scanner::new(self.s.from(start));
        while let Some(c) = s.eat() {
            if matches!(c, '\n' | '\x0B' | '\x0C' | '\r') {
                // Treat \r\n as a single newline.
                if c == '\r' {
                    s.eat_if('\n');
                }
                newline_count += 1;
            }
        }

        newline_count > 0
    }
}

fn is_space(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n' | '\x0B' | '\x0C' | '\r')
}

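/// The keyword `SyntaxKind` for an identifier, or `None` if it is not a keyword.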
fn keyword(ident: &str) -> Option<SyntaxKind> {
    Some(match ident {
        "as" => SyntaxKind::As,
        "break" => SyntaxKind::Break,
        "continue" => SyntaxKind::Continue,
        "else" => SyntaxKind::Else,
        "enum" => SyntaxKind::Enum,
        "false" => SyntaxKind::Bool,
        "for" => SyntaxKind::For,
        "if" => SyntaxKind::If,
        "import" => SyntaxKind::Import,
        "in" => SyntaxKind::In,
        "let" => SyntaxKind::Let,
        "loop" => SyntaxKind::Loop,
        "mut" => SyntaxKind::Mut,
        "ref" => SyntaxKind::Ref,
        "pub" => SyntaxKind::Pub,
        "return" => SyntaxKind::Return,
        "true" => SyntaxKind::Bool,
        "while" => SyntaxKind::While,
        _ => return None,
    })
}

fn is_ident_mid(c: char) -> bool {
    c.is_alphanumeric() || c == '_'
}

fn is_ident_start(c: char) -> bool {
    c.is_alphabetic() || c == '_'
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::assert_tokens;
    use crate::test_utils::{test_file_id, LexerAssert};

    #[test]
    fn test_int() {
        assert_tokens!("123", Int("123", 0..3));
    }

    #[test]
    fn test_float() {
        assert_tokens!("1.3", Float("1.3", 0..3));
    }

    #[test]
    fn test_integer_floats() {
        assert_tokens!("1.0", Float("1.0", 0..3));
    }

    #[test]
    fn test_float_methods_disambiguation() {
        assert_tokens!(
            "1.3.method()",
            Float("1.3", 0..3)
            Dot(".", 3..4)
            Ident("method", 4..10)
            LeftParen("(", 10..11)
            RightParen(")", 11..12)
        );
    }

    #[test]
    fn test_int_methods_disambiguation() {
        assert_tokens!(
            "1.method()",
            Int("1", 0..1)
            Dot(".", 1..2)
            Ident("method", 2..8)
            LeftParen("(", 8..9)
            RightParen(")", 9..10)
        );
    }

    #[test]
    fn test_float_with_exponent() {
        assert_tokens!("1.0e1", Float("1.0e1", 0..5));
        assert_tokens!("1.0e-1", Float("1.0e-1", 0..6));
    }

    #[test]
    fn test_string() {
        assert_tokens!("\"abc\"", Str("\"abc\"", 0..5));
    }

    #[test]
    fn test_ident() {
        assert_tokens!("abc", Ident("abc", 0..3));
    }

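    // A sketch of an additional case implied by `lex_ident`: an identifier that
    // would otherwise be a keyword is not treated as one directly after a `.`.
    // Hypothetical test; it assumes `assert_tokens!` accepts these kinds the
    // same way as the cases above.
    #[test]
    fn test_keyword_after_dot_stays_ident() {
        assert_tokens!(
            "x.in",
            Ident("x", 0..1)
            Dot(".", 1..2)
            Ident("in", 2..4)
        );
    }
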
    #[test]
    fn test_unterminated_string() {
        assert_tokens!("\"abc", !Error("unclosed string", "\"abc", 0..4));
    }

    #[test]
    fn test_escaped_strings() {
        assert_tokens!("\"abc\\\"def\"", Str("\"abc\\\"def\"", 0..10));
    }

    #[test]
    fn test_newline() {
        let file_id = test_file_id();
        let mut lexer = Lexer::new("a\nb", file_id);

        lexer.assert_next(SyntaxKind::Ident, "a", 0..1);
        assert!(!lexer.newline());
        lexer.assert_next(SyntaxKind::Ident, "b", 2..3);
        assert!(lexer.newline());
    }

    #[test]
    fn test_repeating_next_after_end() {
        let file_id = test_file_id();
        let mut lexer = Lexer::new("", file_id);
        lexer.assert_end(0);
        lexer.assert_end(0);
        lexer.assert_end(0);
        lexer.assert_end(0);
        lexer.assert_end(0);
    }

    #[test]
    fn test_comment_kinds() {
        assert_tokens!("// regular comment", Comment("// regular comment", 0..18));
        assert_tokens!("/// doc comment", DocComment("/// doc comment", 0..15));
        assert_tokens!("/* block\ncomment */", Comment("/* block\ncomment */", 0..19));
438    }
439}