1use anyhow::Result;
2use log::debug;
3use pulldown_cmark::{Event, Parser as MarkdownParser, Tag};
4use regex::Regex;
5use std::collections::HashMap;
6use std::path::Path;
7
8use crate::config::BuildConfig;
9use crate::directives::DirectiveRegistry;
10use crate::document::{
11 CrossReference, Document, DocumentContent, MarkdownContent, MarkdownNode, RstContent,
12 RstDirective, RstNode, TocEntry,
13};
14use crate::roles::RoleRegistry;
15use crate::utils;
16
17pub struct Parser {
18 rst_directive_regex: Regex,
19 cross_ref_regex: Regex,
20 #[allow(dead_code)]
21 directive_registry: DirectiveRegistry,
22 #[allow(dead_code)]
23 role_registry: RoleRegistry,
24}
25
26impl Parser {
27 pub fn new(_config: &BuildConfig) -> Result<Self> {
28 let rst_directive_regex = Regex::new(r"^\s*\.\.\s+(\w+)::\s*(.*?)$")?;
29 let cross_ref_regex = Regex::new(r":(\w+):`([^`]+)`")?;
30 let directive_registry = DirectiveRegistry::new();
31 let role_registry = RoleRegistry::new();
32
33 Ok(Self {
34 rst_directive_regex,
35 cross_ref_regex,
36 directive_registry,
37 role_registry,
38 })
39 }
40
41 pub fn parse(&self, file_path: &Path, content: &str) -> Result<Document> {
42 let output_path = self.get_output_path(file_path)?;
43 let mut document = Document::new(file_path.to_path_buf(), output_path);
44
45 document.source_mtime = utils::get_file_mtime(file_path)?;
47
48 let extension = file_path
50 .extension()
51 .and_then(|ext| ext.to_str())
52 .unwrap_or("");
53
54 match extension {
55 "rst" => {
56 document.content = self.parse_rst(content)?;
57 }
58 "md" => {
59 document.content = self.parse_markdown(content)?;
60 }
61 _ => {
62 document.content = DocumentContent::PlainText(content.to_string());
63 }
64 }
65
66 document.title = self.extract_title(&document.content);
68
69 document.toc = self.extract_toc(&document.content);
71
72 document.cross_refs = self.extract_cross_refs(content);
74
75 debug!(
76 "Parsed document: {} ({} chars)",
77 file_path.display(),
78 content.len()
79 );
80
81 Ok(document)
82 }
83
84 fn parse_rst(&self, content: &str) -> Result<DocumentContent> {
85 let mut nodes = Vec::new();
86 let mut directives = Vec::new();
87 let lines: Vec<&str> = content.lines().collect();
88
89 let mut i = 0;
90 while i < lines.len() {
91 let line = lines[i];
92 let trimmed = line.trim();
93
94 if trimmed.is_empty() {
95 i += 1;
96 continue;
97 }
98
99 if let Some(captures) = self.rst_directive_regex.captures(line) {
101 let directive_name = captures.get(1).unwrap().as_str();
102 let directive_args = captures.get(2).unwrap().as_str();
103
104 let (directive, consumed_lines) =
105 self.parse_rst_directive(&lines[i..], directive_name, directive_args, i + 1)?;
106
107 directives.push(directive.clone());
108 nodes.push(RstNode::Directive {
109 name: directive.name,
110 args: directive.args,
111 options: directive.options,
112 content: directive.content,
113 line: i + 1,
114 });
115
116 i += consumed_lines;
117 continue;
118 }
119
120 if i + 1 < lines.len() {
122 let next_line = lines[i + 1];
123 if !next_line.trim().is_empty()
124 && next_line.chars().all(|c| "=-~^\"'*+#<>".contains(c))
125 && next_line.len() >= trimmed.len()
126 {
127 let level = self.get_rst_title_level(next_line.chars().next().unwrap());
128 nodes.push(RstNode::Title {
129 text: trimmed.to_string(),
130 level,
131 line: i + 1,
132 });
133
134 i += 2;
135 continue;
136 }
137 }
138
139 if line.ends_with("::") {
141 let (code_content, consumed_lines) = self.parse_code_block(&lines[i + 1..]);
142 nodes.push(RstNode::CodeBlock {
143 language: None,
144 content: code_content,
145 line: i + 1,
146 });
147 i += consumed_lines + 1;
148 continue;
149 }
150
151 let (paragraph_content, consumed_lines) = self.parse_paragraph(&lines[i..]);
153 nodes.push(RstNode::Paragraph {
154 content: paragraph_content,
155 line: i + 1,
156 });
157 i += consumed_lines;
158 }
159
160 Ok(DocumentContent::RestructuredText(RstContent {
161 raw: content.to_string(),
162 ast: nodes,
163 directives,
164 }))
165 }
166
167 fn parse_markdown(&self, content: &str) -> Result<DocumentContent> {
168 let mut nodes = Vec::new();
169 let parser = MarkdownParser::new(content);
170 let current_line = 1;
171
172 for event in parser {
173 match event {
174 Event::Start(Tag::Heading { .. }) => {
175 }
177 Event::End(_) => {
178 }
180 Event::Start(Tag::Paragraph) => {
181 }
183 Event::Start(Tag::CodeBlock(_)) => {
184 }
186 Event::Text(text) => {
187 nodes.push(MarkdownNode::Paragraph {
189 content: text.to_string(),
190 line: current_line,
191 });
192 }
193 Event::Code(_code) => {
194 }
196 _ => {
197 }
199 }
200 }
201
202 Ok(DocumentContent::Markdown(MarkdownContent {
203 raw: content.to_string(),
204 ast: nodes,
205 front_matter: None, }))
207 }
208
209 fn parse_rst_directive(
210 &self,
211 lines: &[&str],
212 name: &str,
213 args: &str,
214 start_line: usize,
215 ) -> Result<(RstDirective, usize)> {
216 let mut options = HashMap::new();
217 let mut content = String::new();
218 let mut consumed_lines = 1;
219 let mut i = 1;
220
221 while i < lines.len() {
223 let line = lines[i];
224 if line.trim().is_empty() {
225 i += 1;
226 consumed_lines += 1;
227 continue;
228 }
229
230 if let Some(stripped) = line.strip_prefix(" :") {
231 if let Some(colon_pos) = stripped.find(':') {
233 let option_name = &stripped[..colon_pos];
234 let option_value = stripped[colon_pos + 1..].trim();
235 options.insert(option_name.to_string(), option_value.to_string());
236 }
237 i += 1;
238 consumed_lines += 1;
239 } else if line.starts_with(" ") || line.starts_with("\t") {
240 break;
242 } else {
243 break;
245 }
246 }
247
248 while i < lines.len() {
250 let line = lines[i];
251 if line.starts_with(" ") || line.starts_with("\t") {
252 content.push_str(&line[3..]); content.push('\n');
254 i += 1;
255 consumed_lines += 1;
256 } else if line.trim().is_empty() {
257 content.push('\n');
258 i += 1;
259 consumed_lines += 1;
260 } else {
261 break;
262 }
263 }
264
265 let directive = RstDirective {
266 name: name.to_string(),
267 args: if args.is_empty() {
268 Vec::new()
269 } else {
270 vec![args.to_string()]
271 },
272 options,
273 content: content.trim_end().to_string(),
274 line: start_line,
275 };
276
277 Ok((directive, consumed_lines))
278 }
279
280 fn get_rst_title_level(&self, char: char) -> usize {
281 match char {
282 '#' => 1,
283 '*' => 2,
284 '=' => 3,
285 '-' => 4,
286 '^' => 5,
287 '"' => 6,
288 _ => 7,
289 }
290 }
291
292 fn parse_code_block(&self, lines: &[&str]) -> (String, usize) {
293 let mut content = String::new();
294 let mut consumed_lines = 0;
295
296 for line in lines {
297 if line.starts_with(" ") || line.starts_with("\t") || line.trim().is_empty() {
298 content.push_str(line);
299 content.push('\n');
300 consumed_lines += 1;
301 } else {
302 break;
303 }
304 }
305
306 (content.trim().to_string(), consumed_lines)
307 }
308
309 fn parse_paragraph(&self, lines: &[&str]) -> (String, usize) {
310 let mut content = String::new();
311 let mut consumed_lines = 0;
312
313 for line in lines {
314 let trimmed = line.trim();
315 if trimmed.is_empty() {
316 break;
317 }
318
319 content.push_str(trimmed);
320 content.push(' ');
321 consumed_lines += 1;
322 }
323
324 (content.trim().to_string(), consumed_lines)
325 }
326
327 fn extract_title(&self, content: &DocumentContent) -> String {
328 match content {
329 DocumentContent::RestructuredText(rst) => {
330 for node in &rst.ast {
331 if let RstNode::Title { text, level: 1, .. } = node {
332 return text.clone();
333 }
334 }
335 }
336 DocumentContent::Markdown(md) => {
337 for node in &md.ast {
338 if let MarkdownNode::Heading { text, level: 1, .. } = node {
339 return text.clone();
340 }
341 }
342 }
343 DocumentContent::PlainText(_) => {}
344 }
345
346 "Untitled".to_string()
347 }
348
349 fn extract_toc(&self, content: &DocumentContent) -> Vec<TocEntry> {
350 let mut toc = Vec::new();
351
352 match content {
353 DocumentContent::RestructuredText(rst) => {
354 for node in &rst.ast {
355 if let RstNode::Title { text, level, line } = node {
356 let anchor = text.to_lowercase().replace(' ', "-");
357 toc.push(TocEntry::new(text.clone(), *level, anchor, *line));
358 }
359 }
360 }
361 DocumentContent::Markdown(md) => {
362 for node in &md.ast {
363 if let MarkdownNode::Heading { text, level, line } = node {
364 let anchor = text.to_lowercase().replace(' ', "-");
365 toc.push(TocEntry::new(text.clone(), *level, anchor, *line));
366 }
367 }
368 }
369 DocumentContent::PlainText(_) => {}
370 }
371
372 toc
373 }
374
375 fn extract_cross_refs(&self, content: &str) -> Vec<CrossReference> {
376 let mut cross_refs = Vec::new();
377
378 for (line_num, line) in content.lines().enumerate() {
379 for captures in self.cross_ref_regex.captures_iter(line) {
380 let ref_type = captures.get(1).unwrap().as_str();
381 let target = captures.get(2).unwrap().as_str();
382
383 cross_refs.push(CrossReference {
384 ref_type: ref_type.to_string(),
385 target: target.to_string(),
386 text: None,
387 line_number: line_num + 1,
388 });
389 }
390 }
391
392 cross_refs
393 }
394
395 fn get_output_path(&self, source_path: &Path) -> Result<std::path::PathBuf> {
396 let mut output_path = source_path.to_path_buf();
397 output_path.set_extension("html");
398 Ok(output_path)
399 }
400}