sphinx_ultra/search.rs

use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Search index that mirrors Sphinx's search functionality
#[derive(Debug, Clone, Default)]
pub struct SearchIndex {
    pub docnames: Vec<String>,
    pub filenames: Vec<String>,
    pub titles: Vec<String>,
    pub terms: HashMap<String, Vec<DocumentMatch>>,
    pub objects: HashMap<String, ObjectReference>,
    pub objnames: HashMap<String, String>,
    pub objtypes: HashMap<String, String>,
    pub language: String,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMatch {
    pub docname_idx: usize,
    pub title_score: f32,
    pub content_score: f32,
    pub positions: Vec<usize>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectReference {
    pub docname_idx: usize,
    pub anchor: Option<String>,
    pub name: String,
    pub description: Option<String>,
}

impl SearchIndex {
    pub fn new(language: String) -> Self {
        Self {
            language,
            ..Default::default()
        }
    }

    /// Add a document to the search index
    pub fn add_document(
        &mut self,
        docname: String,
        filename: String,
        title: String,
        content: &str,
    ) -> Result<()> {
        let docname_idx = self.docnames.len();
        self.docnames.push(docname);
        self.filenames.push(filename);
        self.titles.push(title);

        // Extract and index terms from the body content; titles are not
        // indexed separately, so `title_score` currently remains 0.0.
        self.index_content(docname_idx, content)?;

        Ok(())
    }

    /// Add an object to the search index
    pub fn add_object(
        &mut self,
        name: String,
        docname: &str,
        anchor: Option<String>,
        obj_type: &str,
        description: Option<String>,
    ) -> Result<()> {
        // Look up the document's index, appending the docname if it has not
        // been registered yet. Note that only `docnames` grows here; the
        // parallel `filenames`/`titles` vectors are left untouched, and
        // result lookups fall back to empty strings for such entries.
        let docname_idx = self
            .docnames
            .iter()
            .position(|d| d == docname)
            .unwrap_or_else(|| {
                self.docnames.push(docname.to_string());
                self.docnames.len() - 1
            });

        let object_ref = ObjectReference {
            docname_idx,
            anchor,
            name: name.clone(),
            description,
        };

        self.objects.insert(name, object_ref);
        // Record the object's type; in this simplified index the type name is
        // keyed by itself, and `objnames` is not populated here.
        self.objtypes
            .insert(obj_type.to_string(), obj_type.to_string());

        Ok(())
    }

    /// Index content for full-text search
    fn index_content(&mut self, docname_idx: usize, content: &str) -> Result<()> {
        let words = self.extract_words(content);

        for (word, positions) in words {
            let normalized_word = self.normalize_word(&word);
            if normalized_word.len() >= 2 {
                let doc_match = DocumentMatch {
                    docname_idx,
                    title_score: 0.0,
                    content_score: positions.len() as f32,
                    positions,
                };

                self.terms
                    .entry(normalized_word)
                    .or_default()
                    .push(doc_match);
            }
        }

        Ok(())
    }

    /// Extract words and their positions from content
    fn extract_words(&self, content: &str) -> HashMap<String, Vec<usize>> {
        let mut words = HashMap::new();

        for (position, word) in content.split_whitespace().enumerate() {
            let cleaned_word = self.clean_word(word);
            if !cleaned_word.is_empty() {
                words
                    .entry(cleaned_word)
                    .or_default()
                    .push(position);
            }
        }

        words
    }

    /// Clean a word by removing punctuation
    fn clean_word(&self, word: &str) -> String {
        word.chars()
            .filter(|c| c.is_alphanumeric() || *c == '_' || *c == '-')
            .collect::<String>()
            .to_lowercase()
    }

    /// Normalize a word for indexing
    fn normalize_word(&self, word: &str) -> String {
        // Apply language-specific normalization
        match self.language.as_str() {
            "en" => self.normalize_english(word),
            _ => word.to_lowercase(),
        }
    }

    /// English-specific word normalization (basic stemming)
    fn normalize_english(&self, word: &str) -> String {
        let word = word.to_lowercase();

        // Very basic stemming: remove common suffixes
        if word.ends_with("ing") && word.len() > 4 {
            word[..word.len() - 3].to_string()
        } else if word.ends_with("ed") && word.len() > 3 {
            word[..word.len() - 2].to_string()
        } else if word.ends_with('s') && word.len() > 2 {
            word[..word.len() - 1].to_string()
        } else {
            word
        }
    }

    /// Search for documents matching a query
    pub fn search(&self, query: &str) -> Vec<SearchResult> {
        let query_terms: Vec<String> = query
            .split_whitespace()
            .map(|term| self.normalize_word(&self.clean_word(term)))
            .filter(|term| !term.is_empty())
            .collect();

        if query_terms.is_empty() {
            return Vec::new();
        }

        let mut doc_scores: HashMap<usize, f32> = HashMap::new();

        // Calculate scores for each document
        for term in &query_terms {
            if let Some(matches) = self.terms.get(term) {
                for doc_match in matches {
                    let score = doc_match.title_score * 5.0 + doc_match.content_score;
                    *doc_scores.entry(doc_match.docname_idx).or_insert(0.0) += score;
                }
            }
        }

        // Convert to search results and sort by score
        let mut results: Vec<SearchResult> = doc_scores
            .into_iter()
            .map(|(docname_idx, score)| SearchResult {
                docname: self.docnames[docname_idx].clone(),
                filename: self.filenames.get(docname_idx).cloned().unwrap_or_default(),
                title: self.titles.get(docname_idx).cloned().unwrap_or_default(),
                score,
                excerpt: self.generate_excerpt(docname_idx, &query_terms),
            })
            .collect();

        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        results.truncate(50); // Limit results

        results
    }

    /// Generate an excerpt for search results
    fn generate_excerpt(&self, _docname_idx: usize, _query_terms: &[String]) -> String {
        // TODO: Implement excerpt generation
        String::new()
    }

    /// Prune the search index by removing documents not in the given set
    pub fn prune(&mut self, valid_docs: &std::collections::HashSet<String>) {
        let mut new_docnames = Vec::new();
        let mut new_filenames = Vec::new();
        let mut new_titles = Vec::new();
        let mut doc_mapping = HashMap::new();

        // Build new document lists and mapping
        for (old_idx, docname) in self.docnames.iter().enumerate() {
            if valid_docs.contains(docname) {
                let new_idx = new_docnames.len();
                doc_mapping.insert(old_idx, new_idx);
                new_docnames.push(docname.clone());
                new_filenames.push(self.filenames.get(old_idx).cloned().unwrap_or_default());
                new_titles.push(self.titles.get(old_idx).cloned().unwrap_or_default());
            }
        }

        // Update document lists
        self.docnames = new_docnames;
        self.filenames = new_filenames;
        self.titles = new_titles;

        // Update terms with new document indices
        for matches in self.terms.values_mut() {
            matches.retain_mut(|doc_match| {
                if let Some(&new_idx) = doc_mapping.get(&doc_match.docname_idx) {
                    doc_match.docname_idx = new_idx;
                    true
                } else {
                    false
                }
            });
        }

        // Remove empty terms
        self.terms.retain(|_, matches| !matches.is_empty());

        // Update objects with new document indices
        self.objects.retain(|_, obj_ref| {
            if let Some(&new_idx) = doc_mapping.get(&obj_ref.docname_idx) {
                obj_ref.docname_idx = new_idx;
                true
            } else {
                false
            }
        });
    }

    /// Export the search index as a Sphinx-style JSON structure
    pub fn to_json(&self) -> Result<String> {
        #[derive(Serialize)]
        struct JsonSearchIndex<'a> {
            docnames: &'a Vec<String>,
            filenames: &'a Vec<String>,
            titles: &'a Vec<String>,
            terms: &'a HashMap<String, Vec<DocumentMatch>>,
            objects: &'a HashMap<String, ObjectReference>,
            objnames: &'a HashMap<String, String>,
            objtypes: &'a HashMap<String, String>,
        }

        let json_index = JsonSearchIndex {
            docnames: &self.docnames,
            filenames: &self.filenames,
            titles: &self.titles,
            terms: &self.terms,
            objects: &self.objects,
            objnames: &self.objnames,
            objtypes: &self.objtypes,
        };

        Ok(serde_json::to_string(&json_index)?)
    }
}

/// Search result returned by the search index
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    pub docname: String,
    pub filename: String,
    pub title: String,
    pub score: f32,
    pub excerpt: String,
}

/// Search index builder for incremental updates
pub struct SearchIndexBuilder {
    index: SearchIndex,
    processed_docs: std::collections::HashSet<String>,
}

impl SearchIndexBuilder {
    pub fn new(language: String) -> Self {
        Self {
            index: SearchIndex::new(language),
            processed_docs: std::collections::HashSet::new(),
        }
    }

    /// Add or update a document in the search index
    pub fn add_or_update_document(
        &mut self,
        docname: String,
        filename: String,
        title: String,
        content: &str,
    ) -> Result<()> {
        // Remove existing document if it exists
        if self.processed_docs.contains(&docname) {
            self.remove_document(&docname);
        }

        // Add the document
        self.index
            .add_document(docname.clone(), filename, title, content)?;
        self.processed_docs.insert(docname);

        Ok(())
    }

    /// Remove a document from the search index
    pub fn remove_document(&mut self, docname: &str) {
        if let Some(docname_idx) = self.index.docnames.iter().position(|d| d == docname) {
            // Remove from document lists
            self.index.docnames.remove(docname_idx);
            if docname_idx < self.index.filenames.len() {
                self.index.filenames.remove(docname_idx);
            }
            if docname_idx < self.index.titles.len() {
                self.index.titles.remove(docname_idx);
            }

            // Update indices in terms
            for matches in self.index.terms.values_mut() {
                matches.retain_mut(|doc_match| {
                    if doc_match.docname_idx == docname_idx {
                        false
                    } else if doc_match.docname_idx > docname_idx {
                        doc_match.docname_idx -= 1;
                        true
                    } else {
                        true
                    }
                });
            }

            // Remove empty terms
            self.index.terms.retain(|_, matches| !matches.is_empty());

            // Update indices in objects
            self.index.objects.retain(|_, obj_ref| {
                if obj_ref.docname_idx == docname_idx {
                    false
                } else if obj_ref.docname_idx > docname_idx {
                    obj_ref.docname_idx -= 1;
                    true
                } else {
                    true
                }
            });
        }

        self.processed_docs.remove(docname);
    }

    /// Get the built search index
    pub fn build(self) -> SearchIndex {
        self.index
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_search_index_creation() {
        let index = SearchIndex::new("en".to_string());
        assert_eq!(index.language, "en");
        assert_eq!(index.docnames.len(), 0);
    }

    #[test]
    fn test_add_document() {
        let mut index = SearchIndex::new("en".to_string());
        index
            .add_document(
                "test".to_string(),
                "test.html".to_string(),
                "Test Document".to_string(),
                "This is a test document with some content.",
            )
            .unwrap();

        assert_eq!(index.docnames.len(), 1);
        assert_eq!(index.docnames[0], "test");
        assert!(index.terms.contains_key("test"));
        assert!(index.terms.contains_key("document"));
    }
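
    // A minimal sketch of `add_object` usage (names like "mymodule.MyClass"
    // are illustrative only); it assumes that an object added for an already
    // registered docname reuses that document's index instead of appending a
    // new entry.
    #[test]
    fn test_add_object_reuses_docname_index() {
        let mut index = SearchIndex::new("en".to_string());
        index
            .add_document(
                "api".to_string(),
                "api.html".to_string(),
                "API".to_string(),
                "reference",
            )
            .unwrap();
        index
            .add_object(
                "mymodule.MyClass".to_string(),
                "api",
                Some("mymodule-MyClass".to_string()),
                "class",
                None,
            )
            .unwrap();

        // Still a single document entry; the object points at index 0.
        assert_eq!(index.docnames.len(), 1);
        assert_eq!(index.objects["mymodule.MyClass"].docname_idx, 0);
        assert_eq!(index.objtypes.get("class"), Some(&"class".to_string()));
    }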

    #[test]
    fn test_word_normalization() {
        let index = SearchIndex::new("en".to_string());

        assert_eq!(index.normalize_english("running"), "runn");
        assert_eq!(index.normalize_english("walked"), "walk");
        assert_eq!(index.normalize_english("tests"), "test");
        assert_eq!(index.normalize_english("test"), "test");
    }
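
    // A hedged sketch of the JSON export: it only assumes that the emitted
    // document parses and exposes the same top-level keys as the struct
    // fields (docnames, terms, objects, ...), not an exact Sphinx layout.
    #[test]
    fn test_to_json_exposes_top_level_keys() {
        let mut index = SearchIndex::new("en".to_string());
        index
            .add_document(
                "doc".to_string(),
                "doc.html".to_string(),
                "Doc".to_string(),
                "hello world",
            )
            .unwrap();

        let json = index.to_json().unwrap();
        let value: serde_json::Value = serde_json::from_str(&json).unwrap();
        assert!(value.get("docnames").is_some());
        assert!(value.get("terms").is_some());
        assert!(value.get("objects").is_some());
    }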

    #[test]
    fn test_search() {
        let mut index = SearchIndex::new("en".to_string());
        index
            .add_document(
                "test1".to_string(),
                "test1.html".to_string(),
                "First Test".to_string(),
                "This is the first test document.",
            )
            .unwrap();
        index
            .add_document(
                "test2".to_string(),
                "test2.html".to_string(),
                "Second Test".to_string(),
                "This is the second test document with more content.",
            )
            .unwrap();

        let results = index.search("test document");
        assert!(!results.is_empty());
        assert!(results
            .iter()
            .any(|r| r.docname == "test1" || r.docname == "test2"));
    }
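
    // A minimal sketch of `prune`: it assumes that pruning to one surviving
    // document drops the other document's terms and remaps the remaining
    // matches onto the new (smaller) index.
    #[test]
    fn test_prune_removes_stale_documents() {
        let mut index = SearchIndex::new("en".to_string());
        index
            .add_document(
                "keep".to_string(),
                "keep.html".to_string(),
                "Keep".to_string(),
                "alpha common",
            )
            .unwrap();
        index
            .add_document(
                "drop".to_string(),
                "drop.html".to_string(),
                "Drop".to_string(),
                "omega common",
            )
            .unwrap();

        let mut valid = std::collections::HashSet::new();
        valid.insert("keep".to_string());
        index.prune(&valid);

        assert_eq!(index.docnames, vec!["keep".to_string()]);
        // Terms that only occurred in the dropped document are gone entirely.
        assert!(!index.terms.contains_key("omega"));
        // Surviving matches now point at the remapped document index.
        assert!(index.terms["common"].iter().all(|m| m.docname_idx == 0));
    }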

    #[test]
    fn test_search_index_builder() {
        let mut builder = SearchIndexBuilder::new("en".to_string());

        builder
            .add_or_update_document(
                "test".to_string(),
                "test.html".to_string(),
                "Test".to_string(),
                "Content",
            )
            .unwrap();

        let index = builder.build();
        assert_eq!(index.docnames.len(), 1);
    }
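
    // A hedged sketch of incremental removal: it assumes that removing the
    // only processed document leaves the built index with no documents and
    // no term entries.
    #[test]
    fn test_search_index_builder_remove_document() {
        let mut builder = SearchIndexBuilder::new("en".to_string());
        builder
            .add_or_update_document(
                "doc".to_string(),
                "doc.html".to_string(),
                "Doc".to_string(),
                "alpha beta",
            )
            .unwrap();
        builder.remove_document("doc");

        let index = builder.build();
        assert!(index.docnames.is_empty());
        assert!(index.terms.is_empty());
    }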
}