sphinx_ultra/directives/validation/
parser.rs

1//! Parser for extracting directives and roles from RST content
2
3use super::{ParsedDirective, ParsedRole, SourceLocation};
4use lazy_static::lazy_static;
5use regex::Regex;
6use std::collections::HashMap;
7
8lazy_static! {
9    /// Regex for matching directive patterns
10    static ref DIRECTIVE_REGEX: Regex = Regex::new(
11        r"(?m)^\.\. ([a-zA-Z][a-zA-Z0-9_-]*)::(.*?)$"
12    ).unwrap();
13
14    /// Regex for matching directive options
15    static ref OPTION_REGEX: Regex = Regex::new(
16        r"(?m)^\s+:([a-zA-Z][a-zA-Z0-9_-]*): ?(.*?)$"
17    ).unwrap();
18
19    /// Regex for matching role patterns
20    static ref ROLE_REGEX: Regex = Regex::new(
21        r":([a-zA-Z][a-zA-Z0-9_-]*):(`[^`]+`|[^\s]+)"
22    ).unwrap();
23
24    /// Regex for parsing role with display text
25    static ref ROLE_WITH_TEXT_REGEX: Regex = Regex::new(
26        r"`([^<]+)<([^>]+)>`"
27    ).unwrap();
28}
29
/// Parser that extracts directives and roles from RST content.
///
/// The parser itself is stateless apart from the name of the file it is
/// working on, which is copied into the location of every item it reports.
#[derive(Debug, Clone)]
pub struct DirectiveRoleParser {
    /// Name of the source file being parsed; used only to label locations.
    source_file: String,
}
35
36impl DirectiveRoleParser {
37    /// Creates a new parser for the given source file
38    pub fn new(source_file: String) -> Self {
39        Self { source_file }
40    }
41
42    /// Extracts all directives from the given content
43    pub fn extract_directives(&self, content: &str) -> Vec<ParsedDirective> {
44        let mut directives = Vec::new();
45        let lines: Vec<&str> = content.lines().collect();
46
47        for (line_num, line) in lines.iter().enumerate() {
48            if let Some(captures) = DIRECTIVE_REGEX.captures(line) {
49                let directive_name = captures.get(1).unwrap().as_str().to_string();
50                let args_str = captures.get(2).unwrap().as_str().trim();
51
52                // Parse arguments
53                let arguments: Vec<String> = if args_str.is_empty() {
54                    Vec::new()
55                } else {
56                    args_str.split_whitespace().map(|s| s.to_string()).collect()
57                };
58
59                // Look for options and content in following lines
60                let (options, content, _content_end_line) =
61                    self.parse_directive_body(&lines, line_num + 1);
62
63                let directive = ParsedDirective {
64                    name: directive_name,
65                    arguments,
66                    options,
67                    content,
68                    location: SourceLocation {
69                        file: self.source_file.clone(),
70                        line: line_num + 1,
71                        column: line.find("..").unwrap_or(0) + 1,
72                    },
73                };
74
75                directives.push(directive);
76            }
77        }
78
79        directives
80    }
81
82    /// Extracts all roles from the given content
83    pub fn extract_roles(&self, content: &str) -> Vec<ParsedRole> {
84        let mut roles = Vec::new();
85        let lines: Vec<&str> = content.lines().collect();
86
87        for (line_num, line) in lines.iter().enumerate() {
88            for captures in ROLE_REGEX.captures_iter(line) {
89                let role_name = captures.get(1).unwrap().as_str().to_string();
90                let role_content = captures.get(2).unwrap().as_str();
91
92                // Remove backticks if present
93                let role_content = if role_content.starts_with('`') && role_content.ends_with('`') {
94                    &role_content[1..role_content.len() - 1]
95                } else {
96                    role_content
97                };
98
99                // Check for display text format: `Display Text <target>`
100                let (target, display_text) = if role_content.contains('<')
101                    && role_content.contains('>')
102                {
103                    // Try to parse "Display Text <target>" format (without expecting backticks)
104                    if let Some(angle_start) = role_content.rfind('<') {
105                        if let Some(angle_end) = role_content.rfind('>') {
106                            if angle_start < angle_end {
107                                let display = role_content[..angle_start].trim().to_string();
108                                let target = role_content[angle_start + 1..angle_end].to_string();
109                                (
110                                    target,
111                                    if display.is_empty() {
112                                        None
113                                    } else {
114                                        Some(display)
115                                    },
116                                )
117                            } else {
118                                (role_content.to_string(), None)
119                            }
120                        } else {
121                            (role_content.to_string(), None)
122                        }
123                    } else {
124                        (role_content.to_string(), None)
125                    }
126                } else {
127                    (role_content.to_string(), None)
128                };
129
130                let role = ParsedRole {
131                    name: role_name,
132                    target,
133                    display_text,
134                    location: SourceLocation {
135                        file: self.source_file.clone(),
136                        line: line_num + 1,
137                        column: line.find(':').unwrap_or(0) + 1,
138                    },
139                };
140
141                roles.push(role);
142            }
143        }
144
145        roles
146    }
147
148    /// Parses directive body (options and content)
149    fn parse_directive_body(
150        &self,
151        lines: &[&str],
152        start_line: usize,
153    ) -> (HashMap<String, String>, String, usize) {
154        let mut options = HashMap::new();
155        let mut content_lines = Vec::new();
156        let mut current_line = start_line;
157        let mut in_content = false;
158
159        while current_line < lines.len() {
160            let line = lines[current_line];
161
162            // Empty line
163            if line.trim().is_empty() {
164                if in_content {
165                    content_lines.push(String::new());
166                }
167                current_line += 1;
168                continue;
169            }
170
171            // Check for option
172            if let Some(option_captures) = OPTION_REGEX.captures(line) {
173                if !in_content {
174                    let option_name = option_captures.get(1).unwrap().as_str().to_string();
175                    let option_value = option_captures.get(2).unwrap().as_str().to_string();
176                    options.insert(option_name, option_value);
177                    current_line += 1;
178                    continue;
179                }
180            }
181
182            // Check if line is indented (content)
183            if line.starts_with("   ") || line.starts_with('\t') {
184                in_content = true;
185                // Remove common indentation
186                let content_line = if let Some(stripped) = line.strip_prefix("   ") {
187                    stripped
188                } else if let Some(stripped) = line.strip_prefix('\t') {
189                    stripped
190                } else {
191                    line
192                };
193                content_lines.push(content_line.to_string());
194                current_line += 1;
195                continue;
196            }
197
198            // Non-indented line after we've seen content means end of directive
199            if in_content {
200                break;
201            }
202
203            // If we haven't seen options or content, this might be the start of content
204            if !line.starts_with(':') {
205                break;
206            }
207
208            current_line += 1;
209        }
210
211        let content = content_lines.join("\n");
212        (options, content, current_line)
213    }
214
215    /// Extracts both directives and roles from content
216    pub fn parse_content(&self, content: &str) -> (Vec<ParsedDirective>, Vec<ParsedRole>) {
217        let directives = self.extract_directives(content);
218        let roles = self.extract_roles(content);
219        (directives, roles)
220    }
221
222    /// Validates that a line contains a properly formatted directive
223    pub fn is_directive_line(line: &str) -> bool {
224        DIRECTIVE_REGEX.is_match(line)
225    }
226
227    /// Validates that text contains a role
228    pub fn contains_role(text: &str) -> bool {
229        ROLE_REGEX.is_match(text)
230    }
231
232    /// Counts the number of directives in content
233    pub fn count_directives(content: &str) -> usize {
234        DIRECTIVE_REGEX.find_iter(content).count()
235    }
236
237    /// Counts the number of roles in content
238    pub fn count_roles(content: &str) -> usize {
239        ROLE_REGEX.find_iter(content).count()
240    }
241}
242
/// Running totals describing what a parser has found so far.
///
/// All counters start at zero and both maps start empty (see `Default`).
#[derive(Clone, Debug, Default)]
pub struct ParseStatistics {
    /// Total number of directives recorded.
    pub directive_count: usize,
    /// Total number of roles recorded.
    pub role_count: usize,
    /// Number of recorded directives, keyed by directive name.
    pub directives_by_type: HashMap<String, usize>,
    /// Number of recorded roles, keyed by role name.
    pub roles_by_type: HashMap<String, usize>,
    /// Number of content lines processed.
    pub lines_processed: usize,
}
257
258impl ParseStatistics {
259    /// Creates new parse statistics
260    pub fn new() -> Self {
261        Self::default()
262    }
263
264    /// Records a directive
265    pub fn record_directive(&mut self, directive: &ParsedDirective) {
266        self.directive_count += 1;
267        *self
268            .directives_by_type
269            .entry(directive.name.clone())
270            .or_insert(0) += 1;
271    }
272
273    /// Records a role
274    pub fn record_role(&mut self, role: &ParsedRole) {
275        self.role_count += 1;
276        *self.roles_by_type.entry(role.name.clone()).or_insert(0) += 1;
277    }
278
279    /// Records lines processed
280    pub fn set_lines_processed(&mut self, lines: usize) {
281        self.lines_processed = lines;
282    }
283
284    /// Returns total items parsed
285    pub fn total_items(&self) -> usize {
286        self.directive_count + self.role_count
287    }
288}
289
/// Enhanced parser with statistics tracking.
///
/// Wraps a [`DirectiveRoleParser`] together with the [`ParseStatistics`] that
/// describe what it has parsed.
pub struct StatisticalDirectiveRoleParser {
    /// Parser that performs the actual extraction.
    parser: DirectiveRoleParser,
    /// Statistics for the content parsed through this wrapper.
    statistics: ParseStatistics,
}
295
296impl StatisticalDirectiveRoleParser {
297    /// Creates a new statistical parser
298    pub fn new(source_file: String) -> Self {
299        Self {
300            parser: DirectiveRoleParser::new(source_file),
301            statistics: ParseStatistics::new(),
302        }
303    }
304
305    /// Parses content and updates statistics
306    pub fn parse_with_statistics(
307        &mut self,
308        content: &str,
309    ) -> (Vec<ParsedDirective>, Vec<ParsedRole>) {
310        let (directives, roles) = self.parser.parse_content(content);
311
312        // Update statistics
313        self.statistics.set_lines_processed(content.lines().count());
314
315        for directive in &directives {
316            self.statistics.record_directive(directive);
317        }
318
319        for role in &roles {
320            self.statistics.record_role(role);
321        }
322
323        (directives, roles)
324    }
325
326    /// Returns current statistics
327    pub fn statistics(&self) -> &ParseStatistics {
328        &self.statistics
329    }
330
331    /// Resets statistics
332    pub fn reset_statistics(&mut self) {
333        self.statistics = ParseStatistics::new();
334    }
335}
336
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a parser bound to the dummy file name used by every test.
    fn make_parser() -> DirectiveRoleParser {
        DirectiveRoleParser::new("test.rst".to_string())
    }

    /// Two directives: one with inline arguments and content, one with
    /// options and content.
    #[test]
    fn test_directive_parsing() {
        let source = r#"
.. note:: This is a note

   This is the content of the note.
   It can span multiple lines.

.. code-block:: python
   :linenos:
   :caption: Example code

   def hello():
       print("Hello, world!")
"#;

        let found = make_parser().extract_directives(source);
        assert_eq!(found.len(), 2);

        // The note's inline text is split on whitespace into arguments.
        let note = &found[0];
        assert_eq!(note.name, "note");
        assert_eq!(note.arguments, ["This", "is", "a", "note"]);
        assert!(note.content.contains("content of the note"));

        // The code block carries one argument, two options and a body.
        let code = &found[1];
        assert_eq!(code.name, "code-block");
        assert_eq!(code.arguments, ["python"]);
        assert_eq!(code.options.len(), 2);
        assert!(code.options.contains_key("linenos"));
        assert_eq!(
            code.options.get("caption").map(String::as_str),
            Some("Example code")
        );
        assert!(code.content.contains("def hello()"));
    }

    /// Five roles, the last one using the `Display <target>` form.
    #[test]
    fn test_role_parsing() {
        let source = r#"
See :doc:`installation` for setup instructions.
Use :ref:`advanced-config` for configuration.
Download the :download:`example.pdf` file.
For math, use :math:`x = \frac{a}{b}`.
See :doc:`Custom Title <installation>` for details.
"#;

        let found = make_parser().extract_roles(source);
        assert_eq!(found.len(), 5);

        // Expected (name, target) for each role, in document order.
        let expected = [
            ("doc", "installation"),
            ("ref", "advanced-config"),
            ("download", "example.pdf"),
            ("math", r"x = \frac{a}{b}"),
            ("doc", "installation"),
        ];
        for (role, (name, target)) in found.iter().zip(expected.iter()) {
            assert_eq!(role.name, *name);
            assert_eq!(role.target, *target);
        }

        // Only the last role carries display text.
        assert_eq!(found[0].display_text, None);
        assert_eq!(found[4].display_text.as_deref(), Some("Custom Title"));
    }

    /// Statistics reflect one directive and two roles.
    #[test]
    fn test_statistical_parser() {
        let mut tracker = StatisticalDirectiveRoleParser::new("test.rst".to_string());

        let source = r#"
.. note:: Test note

   Content here.

See :doc:`test` and :ref:`section`.
"#;

        let (found_directives, found_roles) = tracker.parse_with_statistics(source);
        assert_eq!(found_directives.len(), 1);
        assert_eq!(found_roles.len(), 2);

        let summary = tracker.statistics();
        assert_eq!(summary.directive_count, 1);
        assert_eq!(summary.role_count, 2);
        assert_eq!(summary.total_items(), 3);
        assert_eq!(summary.directives_by_type.get("note"), Some(&1));
        for role_name in ["doc", "ref"].iter() {
            assert_eq!(summary.roles_by_type.get(*role_name), Some(&1));
        }
    }

    /// The associated helper functions agree with the regex definitions.
    #[test]
    fn test_utility_functions() {
        assert!(DirectiveRoleParser::is_directive_line(".. note:: Test"));
        assert!(!DirectiveRoleParser::is_directive_line(
            "This is not a directive"
        ));

        assert!(DirectiveRoleParser::contains_role("See :doc:`test` here"));
        assert!(!DirectiveRoleParser::contains_role("No roles here"));

        let source = ".. note:: Test\n.. warning:: Another\nSee :doc:`test` and :ref:`section`.";
        assert_eq!(DirectiveRoleParser::count_directives(source), 2);
        assert_eq!(DirectiveRoleParser::count_roles(source), 2);
    }

    /// Options with values are captured and the caption becomes the content.
    #[test]
    fn test_directive_options_parsing() {
        let source = r#"
.. figure:: image.png
   :width: 100px
   :alt: Test image
   :align: center

   This is the caption.
"#;

        let found = make_parser().extract_directives(source);
        assert_eq!(found.len(), 1);

        let figure = &found[0];
        assert_eq!(figure.name, "figure");
        assert_eq!(figure.arguments, ["image.png"]);
        assert_eq!(figure.options.len(), 3);

        let expected_options = [
            ("width", "100px"),
            ("alt", "Test image"),
            ("align", "center"),
        ];
        for (key, value) in expected_options.iter() {
            assert_eq!(figure.options.get(*key).map(String::as_str), Some(*value));
        }
        assert_eq!(figure.content.trim(), "This is the caption.");
    }
}