sphinx_ultra/
matching.rs

1//! Pattern matching utilities for file filtering.
2//!
3//! This module provides glob-style pattern matching compatible with Sphinx's
4//! include_patterns and exclude_patterns functionality. It implements the same
5//! pattern translation and matching logic as Sphinx's util/matching.py.
6
7use regex::Regex;
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10use std::sync::Mutex;
11
12lazy_static::lazy_static! {
13    /// Cache for compiled regex patterns
14    static ref PATTERN_CACHE: Mutex<HashMap<String, Regex>> = Mutex::new(HashMap::new());
15}
16
/// Translates a shell-style glob pattern into an anchored regex string.
///
/// Modelled on Sphinx's `_translate_pattern` (itself adapted from Python's
/// `fnmatch.translate`), with path-aware wildcards:
/// - `**/` matches zero or more whole directory components
/// - `**` anywhere else matches any characters, including `/`
/// - `*` matches any run of characters except a directory separator
/// - `?` matches any single character except a directory separator
/// - `[seq]` matches any character in `seq`
/// - `[!seq]` (or `[^seq]`) matches any character not in `seq`
/// - a `[` without a closing `]` is treated as a literal `[`
///
/// Characters inside a bracket expression are taken literally: `\`, `[`,
/// `&` and `~` are escaped so that the regex engine does not read them as an
/// escape sequence, a nested class, or a set operator. Ranges such as `a-z`
/// are passed through unchanged.
///
/// NOTE(review): unlike `*` and `?`, a negated class is emitted as `[^seq]`
/// and can therefore still match `/`.
///
/// Returns the regex source wrapped in `^` ... `$`, so that it must match the
/// entire candidate string.
pub fn translate_pattern(pattern: &str) -> String {
    let chars: Vec<char> = pattern.chars().collect();
    let n = chars.len();

    let mut regex_pattern = String::with_capacity(pattern.len() + 2);
    // Anchor the pattern to match the entire string.
    regex_pattern.push('^');

    let mut i = 0;
    while i < n {
        let c = chars[i];
        match c {
            '*' => {
                let doubled = i + 1 < n && chars[i + 1] == '*';
                if doubled && i + 2 < n && chars[i + 2] == '/' {
                    // `**/`: any number of complete directory components.
                    regex_pattern.push_str("(?:[^/]+/)*");
                    i += 3;
                } else if doubled {
                    // `**` at the end or followed by something else: anything,
                    // separators included.
                    regex_pattern.push_str(".*");
                    i += 2;
                } else {
                    // Single `*`: anything except a directory separator.
                    regex_pattern.push_str("[^/]*");
                    i += 1;
                }
            }
            '?' => {
                // One character, but never a directory separator.
                regex_pattern.push_str("[^/]");
                i += 1;
            }
            '[' => {
                // Locate the closing bracket. A `]` that comes first (after an
                // optional negation marker) is part of the set, not its end.
                let mut j = i + 1;
                if j < n && matches!(chars[j], '!' | '^') {
                    j += 1;
                }
                if j < n && chars[j] == ']' {
                    j += 1;
                }
                while j < n && chars[j] != ']' {
                    j += 1;
                }

                if j >= n {
                    // No closing `]`: treat `[` as a literal character.
                    regex_pattern.push_str("\\[");
                    i += 1;
                } else {
                    regex_pattern.push('[');

                    // `j < n` and `j > i` guarantee that `i + 1` is in bounds.
                    let mut k = i + 1;
                    if matches!(chars[k], '!' | '^') {
                        regex_pattern.push('^');
                        k += 1;
                    }

                    for &ch in &chars[k..j] {
                        // Neutralise characters that are special inside a
                        // regex class: a backslash would start an escape (and
                        // could swallow the closing bracket), `[` would open a
                        // nested class, and `&&` / `~~` are set operators.
                        if matches!(ch, '\\' | '[' | '&' | '~') {
                            regex_pattern.push('\\');
                        }
                        regex_pattern.push(ch);
                    }

                    regex_pattern.push(']');
                    i = j + 1;
                }
            }
            _ => {
                // Escape regex metacharacters so they match literally.
                if matches!(
                    c,
                    '\\' | '.' | '^' | '$' | '+' | '{' | '}' | '|' | '(' | ')'
                ) {
                    regex_pattern.push('\\');
                }
                regex_pattern.push(c);
                i += 1;
            }
        }
    }

    regex_pattern.push('$');
    regex_pattern
}
126
127/// Compiles a pattern into a regex, using cache for performance.
128pub fn compile_pattern(pattern: &str) -> Result<Regex, regex::Error> {
129    let mut cache = PATTERN_CACHE.lock().unwrap();
130
131    if let Some(regex) = cache.get(pattern) {
132        return Ok(regex.clone());
133    }
134
135    let regex_pattern = translate_pattern(pattern);
136    let regex = Regex::new(&regex_pattern)?;
137    cache.insert(pattern.to_string(), regex.clone());
138
139    Ok(regex)
140}
141
142/// Tests if a name matches a glob pattern.
143pub fn pattern_match(name: &str, pattern: &str) -> Result<bool, regex::Error> {
144    let regex = compile_pattern(pattern)?;
145    Ok(regex.is_match(name))
146}
147
148/// Filters a list of names by a glob pattern.
149pub fn pattern_filter(names: &[String], pattern: &str) -> Result<Vec<String>, regex::Error> {
150    let regex = compile_pattern(pattern)?;
151    Ok(names
152        .iter()
153        .filter(|name| regex.is_match(name))
154        .cloned()
155        .collect())
156}
157
/// Renders a path as a string with every backslash replaced by a forward
/// slash, so patterns written with `/` behave the same on every platform.
///
/// The conversion is lossy: the path is first turned into text with
/// `to_string_lossy`.
pub fn normalize_path(path: &Path) -> String {
    path.to_string_lossy()
        .chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
163
164/// Gets matching files from a directory using include and exclude patterns.
165///
166/// This function implements the same logic as Sphinx's get_matching_files:
167/// - Only files matching some pattern in include_patterns are included
168/// - Exclusions from exclude_patterns take priority over inclusions
169/// - The default include pattern is "**" (all files)
170/// - The default exclude pattern is empty (exclude nothing)
171pub fn get_matching_files<P: AsRef<Path>>(
172    dirname: P,
173    include_patterns: &[String],
174    exclude_patterns: &[String],
175) -> Result<Vec<PathBuf>, Box<dyn std::error::Error>> {
176    let dirname = dirname.as_ref().canonicalize()?;
177    let include_patterns = if include_patterns.is_empty() {
178        vec!["**".to_string()]
179    } else {
180        include_patterns.to_vec()
181    };
182
183    // Compile all patterns
184    let mut include_regexes = Vec::new();
185    for pattern in &include_patterns {
186        include_regexes.push(compile_pattern(pattern)?);
187    }
188
189    let mut exclude_regexes = Vec::new();
190    for pattern in exclude_patterns {
191        exclude_regexes.push(compile_pattern(pattern)?);
192    }
193
194    let mut matched_files = Vec::new();
195
196    // Walk the directory recursively
197    fn walk_dir(
198        dir: &Path,
199        base_dir: &Path,
200        include_regexes: &[Regex],
201        exclude_regexes: &[Regex],
202        matched_files: &mut Vec<PathBuf>,
203    ) -> Result<(), Box<dyn std::error::Error>> {
204        if !dir.is_dir() {
205            return Ok(());
206        }
207
208        for entry in std::fs::read_dir(dir)? {
209            let entry = entry?;
210            let path = entry.path();
211
212            if path.is_dir() {
213                // Recursively walk subdirectories
214                walk_dir(
215                    &path,
216                    base_dir,
217                    include_regexes,
218                    exclude_regexes,
219                    matched_files,
220                )?;
221            } else if path.is_file() {
222                // Get relative path from base directory
223                let relative_path = path.strip_prefix(base_dir)?;
224                let normalized_path = normalize_path(relative_path);
225
226                // Check if file matches any include pattern
227                let included = include_regexes
228                    .iter()
229                    .any(|regex| regex.is_match(&normalized_path));
230
231                if included {
232                    // Check if file matches any exclude pattern
233                    let excluded = exclude_regexes
234                        .iter()
235                        .any(|regex| regex.is_match(&normalized_path));
236
237                    if !excluded {
238                        matched_files.push(path);
239                    }
240                }
241            }
242        }
243
244        Ok(())
245    }
246
247    walk_dir(
248        &dirname,
249        &dirname,
250        &include_regexes,
251        &exclude_regexes,
252        &mut matched_files,
253    )?;
254
255    // Sort for consistent results
256    matched_files.sort();
257
258    Ok(matched_files)
259}
260
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// True when some path in `files` has `name` as its final component.
    fn has_file_named(files: &[std::path::PathBuf], name: &str) -> bool {
        files.iter().any(|p| p.file_name().unwrap() == name)
    }

    #[test]
    fn test_translate_pattern() {
        // (glob, expected regex source): basic patterns, then character classes.
        let cases = [
            ("*.rst", "^[^/]*\\.rst$"),
            ("**", "^.*$"),
            ("**/index.rst", "^(?:[^/]+/)*index\\.rst$"),
            ("docs/*.rst", "^docs/[^/]*\\.rst$"),
            ("[abc].rst", "^[abc]\\.rst$"),
            ("[!abc].rst", "^[^abc]\\.rst$"),
        ];

        for &(glob, expected) in cases.iter() {
            assert_eq!(translate_pattern(glob), expected);
        }
    }

    #[test]
    fn test_pattern_match() {
        // (name, glob, should match)
        let cases = [
            // Basic patterns
            ("index.rst", "*.rst", true),
            ("docs/index.rst", "**/*.rst", true),
            ("docs/api/module.rst", "**/api/*.rst", true),
            // Exclusions
            ("_build/index.html", "*.rst", false),
            ("_build/index.html", "**", true),
            // Character classes
            ("a.rst", "[abc].rst", true),
            ("d.rst", "[abc].rst", false),
            ("a.rst", "[!abc].rst", false),
            ("d.rst", "[!abc].rst", true),
        ];

        for &(name, glob, expected) in cases.iter() {
            assert_eq!(pattern_match(name, glob).unwrap(), expected);
        }
    }

    #[test]
    fn test_get_matching_files() {
        let temp_dir = TempDir::new().unwrap();
        let base_path = temp_dir.path();

        // Lay out a small documentation tree.
        for dir in ["docs", "_build"].iter() {
            fs::create_dir_all(base_path.join(dir)).unwrap();
        }
        for file in ["index.rst", "docs/api.rst", "_build/index.html", "README.md"].iter() {
            fs::write(base_path.join(file), "content").unwrap();
        }

        let all_rst = vec!["**/*.rst".to_string()];

        // Include every RST file.
        let files = get_matching_files(base_path, &all_rst, &[]).unwrap();
        assert_eq!(files.len(), 2);
        assert!(has_file_named(&files, "index.rst"));
        assert!(has_file_named(&files, "api.rst"));

        // Exclude the _build directory.
        let files =
            get_matching_files(base_path, &["**".to_string()], &["_build/**".to_string()]).unwrap();
        assert!(!files.iter().any(|p| p.to_string_lossy().contains("_build")));

        // Include RST files but exclude the docs directory.
        let files = get_matching_files(base_path, &all_rst, &["docs/**".to_string()]).unwrap();
        assert_eq!(files.len(), 1);
        assert!(has_file_named(&files, "index.rst"));
        assert!(!has_file_named(&files, "api.rst"));
    }
}