1use crate::file::FileId;
2use crate::kind::SyntaxKind;
3use crate::node::{SyntaxError, SyntaxNode};
4use crate::span::Span;
5use ecow::EcoString;
6use std::ops::Range;
7use unscanny::Scanner;
8
9#[derive(Debug, Clone)]
10pub struct Lexer<'s> {
11 s: Scanner<'s>,
13 newline: bool,
14 error: Option<SyntaxError>,
15 pub(crate) file_id: FileId,
16}
17
18impl<'s> Lexer<'s> {
19 pub fn new(text: &'s str, file_id: FileId) -> Self {
20 Self {
21 s: Scanner::new(text),
22 newline: false,
23 error: None,
24 file_id,
25 }
26 }
27
28 pub fn cursor(&self) -> usize {
30 self.s.cursor()
31 }
32
33 pub fn jump(&mut self, index: usize) {
35 self.s.jump(index);
36 }
37
38 pub fn newline(&self) -> bool {
40 self.newline
41 }
42
43 pub fn column(&self, index: usize) -> usize {
45 let mut s = self.s; s.jump(index);
47 s.before()
48 .chars()
49 .rev()
50 .take_while(|&c| !is_newline(c))
51 .count()
52 }
53}
54
/// Returns `true` if `character` is one of the line-break characters
/// recognised by the lexer: line feed, vertical tab, form feed, carriage
/// return, next line (U+0085), line separator (U+2028) or paragraph
/// separator (U+2029).
#[inline]
pub fn is_newline(character: char) -> bool {
    const LINE_BREAKS: [char; 7] = [
        '\n', '\x0B', '\x0C', '\r', '\u{0085}', '\u{2028}', '\u{2029}',
    ];

    LINE_BREAKS.contains(&character)
}
66
67impl Lexer<'_> {
68 fn error(&mut self, message: impl Into<EcoString>, range: Range<usize>) -> SyntaxKind {
69 self.error = Some(SyntaxError::new(message, Span::new(self.file_id, range)));
70 SyntaxKind::Error
71 }
72
73 fn hint(&mut self, message: impl Into<EcoString>) {
74 if let Some(error) = &mut self.error {
75 error.hints.push(message.into())
76 }
77 }
78}
79
80impl Lexer<'_> {
81 #[allow(clippy::should_implement_trait)]
82 pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
83 debug_assert!(self.error.is_none());
84
85 self.newline = self.skip_whitespace(self.cursor());
86 let start = self.cursor();
87
88 let kind = match self.s.eat() {
89 Some(c) => self.kind(start, c),
90 None => SyntaxKind::End,
91 };
92
93 let text = self.s.from(start);
94 let span = Span::new(self.file_id, start..self.s.cursor());
95 let node = match self.error.take() {
96 Some(error) => SyntaxNode::error(error, text),
97 None => SyntaxNode::leaf(kind, text, span),
98 };
99
100 (kind, node)
101 }
102
103 fn kind(&mut self, start: usize, c: char) -> SyntaxKind {
104 match c {
105 '/' if self.s.eat_if('/') => {
106 if self.s.eat_if('/') {
107 self.lex_doc_comment()
108 } else {
109 self.lex_line_comment()
110 }
111 }
112 '/' if self.s.eat_if('*') => self.lex_block_comment(start),
113
114 '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
115 '!' if self.s.eat_if('=') => SyntaxKind::BangEq,
116 '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
117 '-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::MinusEq,
118 '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
119 '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
120 '!' if self.s.eat_if('=') => SyntaxKind::BangEq,
121
122 '.' if self.s.at('.') && self.s.scout(1) == Some('.') => {
123 self.s.eat();
124 self.s.eat();
125 SyntaxKind::Ellipsis
126 }
127 '.' if self.s.at('.') && self.s.scout(1) == Some('=') => {
128 self.s.eat();
129 self.s.eat();
130 SyntaxKind::DotsEq
131 }
132 '.' if self.s.eat_if('.') => SyntaxKind::Dots,
133
134 '<' if self.s.eat_if('<') => SyntaxKind::LtLt,
135 '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
136 '>' if self.s.eat_if('>') => SyntaxKind::GtGt,
137 '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
138
139 '|' if self.s.eat_if('|') => SyntaxKind::PipePipe,
140 '|' if self.s.eat_if('=') => SyntaxKind::PipeEq,
141 '&' if self.s.eat_if('&') => SyntaxKind::AmpAmp,
142 '&' if self.s.eat_if('=') => SyntaxKind::AmpersandEq,
143
144 '~' if self.s.eat_if('=') => SyntaxKind::TildeEq,
145 '^' if self.s.eat_if('=') => SyntaxKind::HatEq,
146
147 ':' if self.s.eat_if(':') => SyntaxKind::ColonColon,
148
149 '=' if self.s.eat_if('>') => SyntaxKind::Arrow,
150
151 '{' => SyntaxKind::LeftBrace,
152 '}' => SyntaxKind::RightBrace,
153 '[' => SyntaxKind::LeftBracket,
154 ']' => SyntaxKind::RightBracket,
155 '(' => SyntaxKind::LeftParen,
156 ')' => SyntaxKind::RightParen,
157
158 '.' => SyntaxKind::Dot,
159 ',' => SyntaxKind::Comma,
160 ';' => SyntaxKind::Semicolon,
161 ':' => SyntaxKind::Colon,
162 '*' => SyntaxKind::Star,
163 '+' => SyntaxKind::Plus,
164 '-' => SyntaxKind::Minus,
165 '/' => SyntaxKind::Slash,
166 '%' => SyntaxKind::Percent,
167 '$' => SyntaxKind::Dollar,
168 '#' => SyntaxKind::Hash,
169 '@' => SyntaxKind::At,
170 '^' => SyntaxKind::Hat,
171 '`' => SyntaxKind::Backtick,
172 '\'' => SyntaxKind::Apostrophe,
173 '!' => SyntaxKind::Bang,
174 '~' => SyntaxKind::Tilde,
175 '|' => SyntaxKind::Pipe,
176 '&' => SyntaxKind::Amp,
177 '>' => SyntaxKind::Gt,
178 '<' => SyntaxKind::Lt,
179 '=' => SyntaxKind::Eq,
180
181 '"' => self.lex_string(start),
182 '0'..='9' => self.lex_number(start),
183
184 c if is_ident_start(c) => self.lex_ident(start),
185
186 c => self.error(
187 format!("unexpected character `{c}`"),
188 self.range_from(start),
189 ),
190 }
191 }
192
193 fn lex_number(&mut self, start: usize) -> SyntaxKind {
194 self.s.eat_while(char::is_ascii_digit);
196
197 let is_fractional = {
198 let dot = self.s.at('.');
199 let number = matches!(self.s.scout(1), Some('0'..='9' | 'e' | 'E'));
200 dot && number
201 };
202
203 if is_fractional {
204 if self.s.eat_if('.') {
206 self.s.eat_while(char::is_ascii_digit);
207 }
208
209 if self.s.eat_if('e') || self.s.eat_if('E') {
211 self.s.eat_if(['+', '-']);
212 self.s.eat_while(char::is_ascii_digit);
213 }
214 }
215
216 let number = self.s.from(start);
217
218 if number.parse::<i64>().is_ok() {
219 SyntaxKind::Int
220 } else if number.parse::<f64>().is_ok() {
221 SyntaxKind::Float
222 } else {
223 self.error(format!("invalid number `{number}`"), self.range_from(start))
224 }
225 }
226
227 fn lex_ident(&mut self, start: usize) -> SyntaxKind {
228 self.s.eat_while(is_ident_mid);
229 let ident = self.s.from(start);
230
231 let prev = self.s.get(0..start);
232 if !(prev.ends_with(['.']) || prev.ends_with("..")) {
233 if let Some(keyword) = keyword(ident) {
234 return keyword;
235 }
236 }
237
238 if ident == "_" {
239 SyntaxKind::Underscore
240 } else {
241 SyntaxKind::Ident
242 }
243 }
244
245 fn range_from(&self, start: usize) -> Range<usize> {
247 start..self.s.cursor()
248 }
249
250 fn lex_string(&mut self, start: usize) -> SyntaxKind {
251 let mut escaped = false;
252 self.s.eat_until(|c| {
253 let stop = c == '"' && !escaped;
254 escaped = c == '\\' && !escaped;
255 stop
256 });
257
258 if !self.s.eat_if('"') {
259 return self.error("unclosed string", self.range_from(start));
260 }
261
262 SyntaxKind::Str
263 }
264
265 fn lex_line_comment(&mut self) -> SyntaxKind {
266 self.s.eat_while(|c| !is_newline(c));
267 SyntaxKind::Comment
268 }
269
270 fn lex_block_comment(&mut self, start: usize) -> SyntaxKind {
271 while let Some(c) = self.s.eat() {
272 if c == '*' && self.s.eat_if('/') {
273 return SyntaxKind::Comment;
274 }
275 }
276 self.error("unterminated block comment", self.range_from(start))
277 }
278
279 fn lex_doc_comment(&mut self) -> SyntaxKind {
280 self.s.eat_while(|c| !is_newline(c));
281 SyntaxKind::DocComment
282 }
283
284 fn skip_whitespace(&mut self, start: usize) -> bool {
285 self.s.eat_while(is_space);
286
287 let mut newline_count = 0;
289 let mut s = Scanner::new(self.s.from(start));
290 while let Some(c) = s.eat() {
291 if matches!(c, '\n' | '\x0B' | '\x0C' | '\r') {
292 if c == '\r' {
294 s.eat_if('\n');
295 }
296 newline_count += 1;
297 }
298 }
299
300 newline_count > 0
301 }
302}
303
/// Returns `true` if `c` is whitespace that the lexer skips between tokens:
/// space, tab, line feed, vertical tab, form feed or carriage return.
fn is_space(c: char) -> bool {
    const SKIPPED: &str = " \t\n\x0B\x0C\r";
    SKIPPED.contains(c)
}
307
308fn keyword(ident: &str) -> Option<SyntaxKind> {
309 Some(match ident {
310 "as" => SyntaxKind::As,
311 "break" => SyntaxKind::Break,
312 "continue" => SyntaxKind::Continue,
313 "else" => SyntaxKind::Else,
314 "enum" => SyntaxKind::Enum,
315 "false" => SyntaxKind::Bool,
316 "for" => SyntaxKind::For,
317 "if" => SyntaxKind::If,
318 "import" => SyntaxKind::Import,
319 "in" => SyntaxKind::In,
320 "let" => SyntaxKind::Let,
321 "loop" => SyntaxKind::Loop,
322 "mut" => SyntaxKind::Mut,
323 "ref" => SyntaxKind::Ref,
324 "pub" => SyntaxKind::Pub,
325 "return" => SyntaxKind::Return,
326 "true" => SyntaxKind::Bool,
327 "while" => SyntaxKind::While,
328 _ => return None,
329 })
330}
331
/// Returns `true` if `c` may appear after the first character of an
/// identifier: an underscore or any alphanumeric character.
fn is_ident_mid(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphanumeric(),
    }
}
335
/// Returns `true` if `c` may begin an identifier: an underscore or any
/// alphabetic character.
fn is_ident_start(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
339
/// Unit tests for the lexer: literals, identifiers, comments, newline
/// tracking and end-of-input behaviour.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::assert_tokens;
    use crate::test_utils::{test_file_id, LexerAssert};

    /// A run of digits is a single integer token.
    #[test]
    fn test_int() {
        assert_tokens!("123", Int("123", 0..3));
    }

    /// Digits, a dot and more digits form a single float token.
    #[test]
    fn test_float() {
        assert_tokens!("1.3", Float("1.3", 0..3));
    }

    /// A float with a zero fraction is still a float, not an integer.
    #[test]
    fn test_integer_floats() {
        assert_tokens!("1.0", Float("1.0", 0..3));
    }

    /// A method call on a float: only the first dot belongs to the number.
    #[test]
    fn test_float_methods_disambiguation() {
        assert_tokens!(
            "1.3.method()",
            Float("1.3", 0..3)
            Dot(".", 3..4)
            Ident("method", 4..10)
            LeftParen("(", 10..11)
            RightParen(")", 11..12)
        );
    }

    /// A method call on an integer: the dot is not part of the number.
    #[test]
    fn test_int_methods_disambiguation() {
        assert_tokens!(
            "1.method()",
            Int("1", 0..1)
            Dot(".", 1..2)
            Ident("method", 2..8)
            LeftParen("(", 8..9)
            RightParen(")", 9..10)
        );
    }

    /// Exponents, with and without a sign, are part of the float token.
    #[test]
    fn test_float_with_exponent() {
        assert_tokens!("1.0e1", Float("1.0e1", 0..5));
        assert_tokens!("1.0e-1", Float("1.0e-1", 0..6));
    }

    /// A quoted string is a single token that includes both quotes.
    #[test]
    fn test_string() {
        assert_tokens!("\"abc\"", Str("\"abc\"", 0..5));
    }

    /// A plain word is an identifier.
    #[test]
    fn test_ident() {
        assert_tokens!("abc", Ident("abc", 0..3));
    }

    /// A string without a closing quote is reported as an error.
    #[test]
    fn test_unterminated_string() {
        assert_tokens!("\"abc", !Error("unclosed string", "\"abc", 0..4));
    }

    /// An escaped quote does not terminate the string.
    #[test]
    fn test_escaped_strings() {
        assert_tokens!("\"abc\\\"def\"", Str("\"abc\\\"def\"", 0..10));
    }

    /// The newline flag reflects the whitespace before the latest token.
    #[test]
    fn test_newline() {
        let mut lexer = Lexer::new("a\nb", test_file_id());

        lexer.assert_next(SyntaxKind::Ident, "a", 0..1);
        assert!(!lexer.newline());

        lexer.assert_next(SyntaxKind::Ident, "b", 2..3);
        assert!(lexer.newline());
    }

    /// Calling `next` after the end of input keeps yielding the end token.
    #[test]
    fn test_repeating_next_after_end() {
        let mut lexer = Lexer::new("", test_file_id());

        for _ in 0..5 {
            lexer.assert_end(0);
        }
    }

    /// Line, doc and block comments are each lexed as a single token.
    #[test]
    fn test_comment_kinds() {
        assert_tokens!("// regular comment", Comment("// regular comment", 0..18));
        assert_tokens!("/// doc comment", DocComment("/// doc comment", 0..15));
        assert_tokens!("/* block\ncomment */", Comment("/* block\ncomment */", 0..19));
    }
}
439}