sphinx_ultra/
parser.rs

1use anyhow::Result;
2use log::debug;
3use pulldown_cmark::{Event, Parser as MarkdownParser, Tag};
4use regex::Regex;
5use std::collections::HashMap;
6use std::path::Path;
7
8use crate::config::BuildConfig;
9use crate::directives::DirectiveRegistry;
10use crate::document::{
11    CrossReference, Document, DocumentContent, MarkdownContent, MarkdownNode, RstContent,
12    RstDirective, RstNode, TocEntry,
13};
14use crate::roles::RoleRegistry;
15use crate::utils;
16
17pub struct Parser {
18    rst_directive_regex: Regex,
19    cross_ref_regex: Regex,
20    #[allow(dead_code)]
21    directive_registry: DirectiveRegistry,
22    #[allow(dead_code)]
23    role_registry: RoleRegistry,
24}
25
26impl Parser {
27    pub fn new(_config: &BuildConfig) -> Result<Self> {
28        let rst_directive_regex = Regex::new(r"^\s*\.\.\s+(\w+)::\s*(.*?)$")?;
29        let cross_ref_regex = Regex::new(r":(\w+):`([^`]+)`")?;
30        let directive_registry = DirectiveRegistry::new();
31        let role_registry = RoleRegistry::new();
32
33        Ok(Self {
34            rst_directive_regex,
35            cross_ref_regex,
36            directive_registry,
37            role_registry,
38        })
39    }
40
41    pub fn parse(&self, file_path: &Path, content: &str) -> Result<Document> {
42        let output_path = self.get_output_path(file_path)?;
43        let mut document = Document::new(file_path.to_path_buf(), output_path);
44
45        // Set source modification time
46        document.source_mtime = utils::get_file_mtime(file_path)?;
47
48        // Determine file type and parse accordingly
49        let extension = file_path
50            .extension()
51            .and_then(|ext| ext.to_str())
52            .unwrap_or("");
53
54        match extension {
55            "rst" => {
56                document.content = self.parse_rst(content)?;
57            }
58            "md" => {
59                document.content = self.parse_markdown(content)?;
60            }
61            _ => {
62                document.content = DocumentContent::PlainText(content.to_string());
63            }
64        }
65
66        // Extract title from content
67        document.title = self.extract_title(&document.content);
68
69        // Extract table of contents
70        document.toc = self.extract_toc(&document.content);
71
72        // Extract cross-references
73        document.cross_refs = self.extract_cross_refs(content);
74
75        debug!(
76            "Parsed document: {} ({} chars)",
77            file_path.display(),
78            content.len()
79        );
80
81        Ok(document)
82    }
83
84    fn parse_rst(&self, content: &str) -> Result<DocumentContent> {
85        let mut nodes = Vec::new();
86        let mut directives = Vec::new();
87        let lines: Vec<&str> = content.lines().collect();
88
89        let mut i = 0;
90        while i < lines.len() {
91            let line = lines[i];
92            let trimmed = line.trim();
93
94            if trimmed.is_empty() {
95                i += 1;
96                continue;
97            }
98
99            // Check for RST directive
100            if let Some(captures) = self.rst_directive_regex.captures(line) {
101                let directive_name = captures.get(1).unwrap().as_str();
102                let directive_args = captures.get(2).unwrap().as_str();
103
104                let (directive, consumed_lines) =
105                    self.parse_rst_directive(&lines[i..], directive_name, directive_args, i + 1)?;
106
107                directives.push(directive.clone());
108                nodes.push(RstNode::Directive {
109                    name: directive.name,
110                    args: directive.args,
111                    options: directive.options,
112                    content: directive.content,
113                    line: i + 1,
114                });
115
116                i += consumed_lines;
117                continue;
118            }
119
120            // Check for title (underlined with =, -, ~, etc.)
121            if i + 1 < lines.len() {
122                let next_line = lines[i + 1];
123                if !next_line.trim().is_empty()
124                    && next_line.chars().all(|c| "=-~^\"'*+#<>".contains(c))
125                    && next_line.len() >= trimmed.len()
126                {
127                    let level = self.get_rst_title_level(next_line.chars().next().unwrap());
128                    nodes.push(RstNode::Title {
129                        text: trimmed.to_string(),
130                        level,
131                        line: i + 1,
132                    });
133
134                    i += 2;
135                    continue;
136                }
137            }
138
139            // Check for code block (indented text after ::)
140            if line.ends_with("::") {
141                let (code_content, consumed_lines) = self.parse_code_block(&lines[i + 1..]);
142                nodes.push(RstNode::CodeBlock {
143                    language: None,
144                    content: code_content,
145                    line: i + 1,
146                });
147                i += consumed_lines + 1;
148                continue;
149            }
150
151            // Default to paragraph
152            let (paragraph_content, consumed_lines) = self.parse_paragraph(&lines[i..]);
153            nodes.push(RstNode::Paragraph {
154                content: paragraph_content,
155                line: i + 1,
156            });
157            i += consumed_lines;
158        }
159
160        Ok(DocumentContent::RestructuredText(RstContent {
161            raw: content.to_string(),
162            ast: nodes,
163            directives,
164        }))
165    }
166
167    fn parse_markdown(&self, content: &str) -> Result<DocumentContent> {
168        let mut nodes = Vec::new();
169        let parser = MarkdownParser::new(content);
170        let current_line = 1;
171
172        for event in parser {
173            match event {
174                Event::Start(Tag::Heading { .. }) => {
175                    // We'll handle this in the text event
176                }
177                Event::End(_) => {
178                    // Handle end tags generically
179                }
180                Event::Start(Tag::Paragraph) => {
181                    // Start of paragraph
182                }
183                Event::Start(Tag::CodeBlock(_)) => {
184                    // Start of code block
185                }
186                Event::Text(text) => {
187                    // Handle text content based on context
188                    nodes.push(MarkdownNode::Paragraph {
189                        content: text.to_string(),
190                        line: current_line,
191                    });
192                }
193                Event::Code(_code) => {
194                    // Inline code
195                }
196                _ => {
197                    // Handle other events as needed
198                }
199            }
200        }
201
202        Ok(DocumentContent::Markdown(MarkdownContent {
203            raw: content.to_string(),
204            ast: nodes,
205            front_matter: None, // TODO: Parse YAML front matter
206        }))
207    }
208
209    fn parse_rst_directive(
210        &self,
211        lines: &[&str],
212        name: &str,
213        args: &str,
214        start_line: usize,
215    ) -> Result<(RstDirective, usize)> {
216        let mut options = HashMap::new();
217        let mut content = String::new();
218        let mut consumed_lines = 1;
219        let mut i = 1;
220
221        // Parse options (lines starting with :option:)
222        while i < lines.len() {
223            let line = lines[i];
224            if line.trim().is_empty() {
225                i += 1;
226                consumed_lines += 1;
227                continue;
228            }
229
230            if let Some(stripped) = line.strip_prefix("   :") {
231                // This is an option
232                if let Some(colon_pos) = stripped.find(':') {
233                    let option_name = &stripped[..colon_pos];
234                    let option_value = stripped[colon_pos + 1..].trim();
235                    options.insert(option_name.to_string(), option_value.to_string());
236                }
237                i += 1;
238                consumed_lines += 1;
239            } else if line.starts_with("   ") || line.starts_with("\t") {
240                // This is content
241                break;
242            } else {
243                // End of directive
244                break;
245            }
246        }
247
248        // Parse content (indented lines)
249        while i < lines.len() {
250            let line = lines[i];
251            if line.starts_with("   ") || line.starts_with("\t") {
252                content.push_str(&line[3..]); // Remove 3 spaces of indentation
253                content.push('\n');
254                i += 1;
255                consumed_lines += 1;
256            } else if line.trim().is_empty() {
257                content.push('\n');
258                i += 1;
259                consumed_lines += 1;
260            } else {
261                break;
262            }
263        }
264
265        let directive = RstDirective {
266            name: name.to_string(),
267            args: if args.is_empty() {
268                Vec::new()
269            } else {
270                vec![args.to_string()]
271            },
272            options,
273            content: content.trim_end().to_string(),
274            line: start_line,
275        };
276
277        Ok((directive, consumed_lines))
278    }
279
280    fn get_rst_title_level(&self, char: char) -> usize {
281        match char {
282            '#' => 1,
283            '*' => 2,
284            '=' => 3,
285            '-' => 4,
286            '^' => 5,
287            '"' => 6,
288            _ => 7,
289        }
290    }
291
292    fn parse_code_block(&self, lines: &[&str]) -> (String, usize) {
293        let mut content = String::new();
294        let mut consumed_lines = 0;
295
296        for line in lines {
297            if line.starts_with("   ") || line.starts_with("\t") || line.trim().is_empty() {
298                content.push_str(line);
299                content.push('\n');
300                consumed_lines += 1;
301            } else {
302                break;
303            }
304        }
305
306        (content.trim().to_string(), consumed_lines)
307    }
308
309    fn parse_paragraph(&self, lines: &[&str]) -> (String, usize) {
310        let mut content = String::new();
311        let mut consumed_lines = 0;
312
313        for line in lines {
314            let trimmed = line.trim();
315            if trimmed.is_empty() {
316                break;
317            }
318
319            content.push_str(trimmed);
320            content.push(' ');
321            consumed_lines += 1;
322        }
323
324        (content.trim().to_string(), consumed_lines)
325    }
326
327    fn extract_title(&self, content: &DocumentContent) -> String {
328        match content {
329            DocumentContent::RestructuredText(rst) => {
330                for node in &rst.ast {
331                    if let RstNode::Title { text, level: 1, .. } = node {
332                        return text.clone();
333                    }
334                }
335            }
336            DocumentContent::Markdown(md) => {
337                for node in &md.ast {
338                    if let MarkdownNode::Heading { text, level: 1, .. } = node {
339                        return text.clone();
340                    }
341                }
342            }
343            DocumentContent::PlainText(_) => {}
344        }
345
346        "Untitled".to_string()
347    }
348
349    fn extract_toc(&self, content: &DocumentContent) -> Vec<TocEntry> {
350        let mut toc = Vec::new();
351
352        match content {
353            DocumentContent::RestructuredText(rst) => {
354                for node in &rst.ast {
355                    if let RstNode::Title { text, level, line } = node {
356                        let anchor = text.to_lowercase().replace(' ', "-");
357                        toc.push(TocEntry::new(text.clone(), *level, anchor, *line));
358                    }
359                }
360            }
361            DocumentContent::Markdown(md) => {
362                for node in &md.ast {
363                    if let MarkdownNode::Heading { text, level, line } = node {
364                        let anchor = text.to_lowercase().replace(' ', "-");
365                        toc.push(TocEntry::new(text.clone(), *level, anchor, *line));
366                    }
367                }
368            }
369            DocumentContent::PlainText(_) => {}
370        }
371
372        toc
373    }
374
375    fn extract_cross_refs(&self, content: &str) -> Vec<CrossReference> {
376        let mut cross_refs = Vec::new();
377
378        for (line_num, line) in content.lines().enumerate() {
379            for captures in self.cross_ref_regex.captures_iter(line) {
380                let ref_type = captures.get(1).unwrap().as_str();
381                let target = captures.get(2).unwrap().as_str();
382
383                cross_refs.push(CrossReference {
384                    ref_type: ref_type.to_string(),
385                    target: target.to_string(),
386                    text: None,
387                    line_number: line_num + 1,
388                });
389            }
390        }
391
392        cross_refs
393    }
394
395    fn get_output_path(&self, source_path: &Path) -> Result<std::path::PathBuf> {
396        let mut output_path = source_path.to_path_buf();
397        output_path.set_extension("html");
398        Ok(output_path)
399    }
400}