use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

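/// In-memory search index: parallel `docnames`/`filenames`/`titles` vectors plus
/// lookup tables mapping normalized terms and named objects to those documents.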
#[derive(Debug, Clone, Default)]
pub struct SearchIndex {
    pub docnames: Vec<String>,
    pub filenames: Vec<String>,
    pub titles: Vec<String>,
    pub terms: HashMap<String, Vec<DocumentMatch>>,
    pub objects: HashMap<String, ObjectReference>,
    pub objnames: HashMap<String, String>,
    pub objtypes: HashMap<String, String>,
    pub language: String,
}

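/// One term-to-document match: the document's index, separate title and content
/// scores, and the word positions where the term occurred.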
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentMatch {
    pub docname_idx: usize,
    pub title_score: f32,
    pub content_score: f32,
    pub positions: Vec<usize>,
}

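/// A named, linkable object that resolves to a document and an optional anchor
/// within it.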
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ObjectReference {
    pub docname_idx: usize,
    pub anchor: Option<String>,
    pub name: String,
    pub description: Option<String>,
}

impl SearchIndex {
    pub fn new(language: String) -> Self {
        Self {
            language,
            ..Default::default()
        }
    }

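    /// Registers a document's metadata and indexes its content for term search.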
    pub fn add_document(
        &mut self,
        docname: String,
        filename: String,
        title: String,
        content: &str,
    ) -> Result<()> {
        let docname_idx = self.docnames.len();
        self.docnames.push(docname);
        self.filenames.push(filename);
        self.titles.push(title);

        self.index_content(docname_idx, content)?;

        Ok(())
    }

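    /// Registers a named object, creating a document entry on the fly if the
    /// referenced `docname` has not been indexed yet.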
    pub fn add_object(
        &mut self,
        name: String,
        docname: &str,
        anchor: Option<String>,
        obj_type: &str,
        description: Option<String>,
    ) -> Result<()> {
        let docname_idx = self
            .docnames
            .iter()
            .position(|d| d == docname)
            .unwrap_or_else(|| {
                // The document has not been indexed yet: append it and keep the
                // parallel filename/title vectors in sync with placeholder entries.
                self.docnames.push(docname.to_string());
                self.filenames.push(String::new());
                self.titles.push(String::new());
                self.docnames.len() - 1
            });

        let object_ref = ObjectReference {
            docname_idx,
            anchor,
            name: name.clone(),
            description,
        };

        self.objects.insert(name, object_ref);
        self.objtypes
            .insert(obj_type.to_string(), obj_type.to_string());

        Ok(())
    }

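    /// Tokenizes `content` and records a `DocumentMatch` for each distinct
    /// normalized term in the document.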
    fn index_content(&mut self, docname_idx: usize, content: &str) -> Result<()> {
        let words = self.extract_words(content);

        for (word, positions) in words {
            let normalized_word = self.normalize_word(&word);
            // Ignore empty and single-character tokens.
            if normalized_word.len() >= 2 {
                let doc_match = DocumentMatch {
                    docname_idx,
                    title_score: 0.0,
                    content_score: positions.len() as f32,
                    positions,
                };

                self.terms
                    .entry(normalized_word)
                    .or_default()
                    .push(doc_match);
            }
        }

        Ok(())
    }

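    /// Splits `content` on whitespace and maps each cleaned word to the positions
    /// at which it appears.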
    fn extract_words(&self, content: &str) -> HashMap<String, Vec<usize>> {
        let mut words = HashMap::new();

        for (position, word) in content.split_whitespace().enumerate() {
            let cleaned_word = self.clean_word(word);
            if !cleaned_word.is_empty() {
                words
                    .entry(cleaned_word)
                    .or_insert_with(Vec::new)
                    .push(position);
            }
        }

        words
    }

    fn clean_word(&self, word: &str) -> String {
        word.chars()
            .filter(|c| c.is_alphanumeric() || *c == '_' || *c == '-')
            .collect::<String>()
            .to_lowercase()
    }

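    /// Normalizes a cleaned word for the configured language; only English gets
    /// extra stemming, everything else is simply lowercased.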
    fn normalize_word(&self, word: &str) -> String {
        match self.language.as_str() {
            "en" => self.normalize_english(word),
            _ => word.to_lowercase(),
        }
    }

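    /// A very light stemmer that strips common `-ing`, `-ed`, and `-s` suffixes
    /// from sufficiently long words.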
    fn normalize_english(&self, word: &str) -> String {
        let word = word.to_lowercase();

        if word.ends_with("ing") && word.len() > 4 {
            word[..word.len() - 3].to_string()
        } else if word.ends_with("ed") && word.len() > 3 {
            word[..word.len() - 2].to_string()
        } else if word.ends_with('s') && word.len() > 2 {
            word[..word.len() - 1].to_string()
        } else {
            word
        }
    }

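    /// Normalizes the query terms, accumulates a weighted score per matching
    /// document, and returns up to 50 results sorted by descending score.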
    pub fn search(&self, query: &str) -> Vec<SearchResult> {
        let query_terms: Vec<String> = query
            .split_whitespace()
            .map(|term| self.normalize_word(&self.clean_word(term)))
            .filter(|term| !term.is_empty())
            .collect();

        if query_terms.is_empty() {
            return Vec::new();
        }

        let mut doc_scores: HashMap<usize, f32> = HashMap::new();

        for term in &query_terms {
            if let Some(matches) = self.terms.get(term) {
                for doc_match in matches {
                    // Title hits are weighted five times as heavily as content hits.
                    let score = doc_match.title_score * 5.0 + doc_match.content_score;
                    *doc_scores.entry(doc_match.docname_idx).or_insert(0.0) += score;
                }
            }
        }

        let mut results: Vec<SearchResult> = doc_scores
            .into_iter()
            .map(|(docname_idx, score)| SearchResult {
                docname: self.docnames[docname_idx].clone(),
                filename: self.filenames.get(docname_idx).cloned().unwrap_or_default(),
                title: self.titles.get(docname_idx).cloned().unwrap_or_default(),
                score,
                excerpt: self.generate_excerpt(docname_idx, &query_terms),
            })
            .collect();

        results.sort_by(|a, b| {
            b.score
                .partial_cmp(&a.score)
                .unwrap_or(std::cmp::Ordering::Equal)
        });
        results.truncate(50);
        results
    }

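    /// Builds a short excerpt for a search hit; currently a stub that returns an
    /// empty string.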
    fn generate_excerpt(&self, _docname_idx: usize, _query_terms: &[String]) -> String {
        String::new()
    }

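    /// Drops every document not listed in `valid_docs`, compacting the parallel
    /// vectors and remapping the document indices stored in `terms` and `objects`.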
    pub fn prune(&mut self, valid_docs: &std::collections::HashSet<String>) {
        let mut new_docnames = Vec::new();
        let mut new_filenames = Vec::new();
        let mut new_titles = Vec::new();
        let mut doc_mapping = HashMap::new();

        for (old_idx, docname) in self.docnames.iter().enumerate() {
            if valid_docs.contains(docname) {
                let new_idx = new_docnames.len();
                doc_mapping.insert(old_idx, new_idx);
                new_docnames.push(docname.clone());
                new_filenames.push(self.filenames.get(old_idx).cloned().unwrap_or_default());
                new_titles.push(self.titles.get(old_idx).cloned().unwrap_or_default());
            }
        }

        self.docnames = new_docnames;
        self.filenames = new_filenames;
        self.titles = new_titles;

        for matches in self.terms.values_mut() {
            matches.retain_mut(|doc_match| {
                if let Some(&new_idx) = doc_mapping.get(&doc_match.docname_idx) {
                    doc_match.docname_idx = new_idx;
                    true
                } else {
                    false
                }
            });
        }

        self.terms.retain(|_, matches| !matches.is_empty());

        self.objects.retain(|_, obj_ref| {
            if let Some(&new_idx) = doc_mapping.get(&obj_ref.docname_idx) {
                obj_ref.docname_idx = new_idx;
                true
            } else {
                false
            }
        });
    }

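    /// Serializes the searchable parts of the index to JSON via a borrowed helper
    /// struct; the `language` field is not included in the payload.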
    pub fn to_json(&self) -> Result<String> {
        #[derive(Serialize)]
        struct JsonSearchIndex<'a> {
            docnames: &'a Vec<String>,
            filenames: &'a Vec<String>,
            titles: &'a Vec<String>,
            terms: &'a HashMap<String, Vec<DocumentMatch>>,
            objects: &'a HashMap<String, ObjectReference>,
            objnames: &'a HashMap<String, String>,
            objtypes: &'a HashMap<String, String>,
        }

        let json_index = JsonSearchIndex {
            docnames: &self.docnames,
            filenames: &self.filenames,
            titles: &self.titles,
            terms: &self.terms,
            objects: &self.objects,
            objnames: &self.objnames,
            objtypes: &self.objtypes,
        };

        Ok(serde_json::to_string(&json_index)?)
    }
}

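/// A single ranked search hit returned by [`SearchIndex::search`].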
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    pub docname: String,
    pub filename: String,
    pub title: String,
    pub score: f32,
    pub excerpt: String,
}

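/// Incrementally builds a [`SearchIndex`], tracking which documents have been
/// processed so they can be replaced or removed before the final build.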
pub struct SearchIndexBuilder {
    index: SearchIndex,
    processed_docs: std::collections::HashSet<String>,
}

impl SearchIndexBuilder {
    pub fn new(language: String) -> Self {
        Self {
            index: SearchIndex::new(language),
            processed_docs: std::collections::HashSet::new(),
        }
    }

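    /// Adds a document, replacing any previously indexed version with the same
    /// `docname`.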
    pub fn add_or_update_document(
        &mut self,
        docname: String,
        filename: String,
        title: String,
        content: &str,
    ) -> Result<()> {
        if self.processed_docs.contains(&docname) {
            self.remove_document(&docname);
        }

        self.index
            .add_document(docname.clone(), filename, title, content)?;
        self.processed_docs.insert(docname);

        Ok(())
    }

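    /// Removes a document from the index and shifts every stored document index
    /// above it down by one so the remaining references stay valid.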
    pub fn remove_document(&mut self, docname: &str) {
        if let Some(docname_idx) = self.index.docnames.iter().position(|d| d == docname) {
            self.index.docnames.remove(docname_idx);
            if docname_idx < self.index.filenames.len() {
                self.index.filenames.remove(docname_idx);
            }
            if docname_idx < self.index.titles.len() {
                self.index.titles.remove(docname_idx);
            }

            for matches in self.index.terms.values_mut() {
                matches.retain_mut(|doc_match| {
                    if doc_match.docname_idx == docname_idx {
                        false
                    } else if doc_match.docname_idx > docname_idx {
                        doc_match.docname_idx -= 1;
                        true
                    } else {
                        true
                    }
                });
            }

            self.index.terms.retain(|_, matches| !matches.is_empty());

            self.index.objects.retain(|_, obj_ref| {
                if obj_ref.docname_idx == docname_idx {
                    false
                } else if obj_ref.docname_idx > docname_idx {
                    obj_ref.docname_idx -= 1;
                    true
                } else {
                    true
                }
            });
        }

        self.processed_docs.remove(docname);
    }

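    /// Consumes the builder and returns the finished index.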
    pub fn build(self) -> SearchIndex {
        self.index
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_search_index_creation() {
        let index = SearchIndex::new("en".to_string());
        assert_eq!(index.language, "en");
        assert_eq!(index.docnames.len(), 0);
    }

    #[test]
    fn test_add_document() {
        let mut index = SearchIndex::new("en".to_string());
        index
            .add_document(
                "test".to_string(),
                "test.html".to_string(),
                "Test Document".to_string(),
                "This is a test document with some content.",
            )
            .unwrap();

        assert_eq!(index.docnames.len(), 1);
        assert_eq!(index.docnames[0], "test");
        assert!(index.terms.contains_key("test"));
        assert!(index.terms.contains_key("document"));
    }

    #[test]
    fn test_word_normalization() {
        let index = SearchIndex::new("en".to_string());

        assert_eq!(index.normalize_english("running"), "runn");
        assert_eq!(index.normalize_english("walked"), "walk");
        assert_eq!(index.normalize_english("tests"), "test");
        assert_eq!(index.normalize_english("test"), "test");
    }

    #[test]
    fn test_search() {
        let mut index = SearchIndex::new("en".to_string());
        index
            .add_document(
                "test1".to_string(),
                "test1.html".to_string(),
                "First Test".to_string(),
                "This is the first test document.",
            )
            .unwrap();
        index
            .add_document(
                "test2".to_string(),
                "test2.html".to_string(),
                "Second Test".to_string(),
                "This is the second test document with more content.",
            )
            .unwrap();

        let results = index.search("test document");
        assert!(!results.is_empty());
        assert!(results
            .iter()
            .any(|r| r.docname == "test1" || r.docname == "test2"));
    }

    #[test]
    fn test_search_index_builder() {
        let mut builder = SearchIndexBuilder::new("en".to_string());

        builder
            .add_or_update_document(
                "test".to_string(),
                "test.html".to_string(),
                "Test".to_string(),
                "Content",
            )
            .unwrap();

        let index = builder.build();
        assert_eq!(index.docnames.len(), 1);
    }
}