sphinx_ultra/
cache.rs

1use anyhow::Result;
2use blake3::Hasher;
3use chrono::{DateTime, Utc};
4use dashmap::DashMap;
5use log::{debug, warn};
6use parking_lot::RwLock;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9use std::path::{Path, PathBuf};
10use std::sync::Arc;
11use std::time::{Duration, UNIX_EPOCH};
12
13use crate::document::Document;
14use crate::error::BuildError;
15
/// Document cache with an in-memory map and JSON files persisted under
/// `cache_dir`.
///
/// Entries are keyed by source path and are only served while the source
/// file's hash still matches and the entry is younger than
/// `expiration_duration`.
pub struct BuildCache {
    /// Directory holding one `<hash>.json` file per persisted entry.
    cache_dir: PathBuf,
    /// In-memory entries keyed by source path.
    documents: Arc<DashMap<PathBuf, CachedDocument>>,
    /// Hash recorded for each path when it was stored; kept in step with
    /// `documents` on store, invalidate, evict and clear.
    file_hashes: Arc<RwLock<HashMap<PathBuf, String>>>,
    /// Number of lookups answered from the cache.
    hit_count: Arc<RwLock<usize>>,
    /// Number of lookups that found no usable entry.
    miss_count: Arc<RwLock<usize>>,
    /// Size limit (in MB of estimated document size) that triggers eviction.
    max_size_mb: usize,
    /// Maximum age of an entry before it is treated as expired.
    expiration_duration: Duration,
}
25
/// A cached document plus the bookkeeping needed to validate and evict it.
///
/// This is also the on-disk format: each entry is serialized to JSON as-is.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct CachedDocument {
    /// The cached build result.
    document: Document,
    /// Hash of the source file (content plus modification time) at store time.
    hash: String,
    /// When the entry was created; compared against the expiration duration.
    cached_at: DateTime<Utc>,
    /// Starts at 1 on store and is bumped on every hit; eviction removes the
    /// entries with the lowest count first.
    access_count: usize,
    /// Estimated in-memory size of `document`, used for the size limit.
    size_bytes: usize,
}
34
35impl BuildCache {
36    pub fn new(cache_dir: PathBuf) -> Result<Self> {
37        std::fs::create_dir_all(&cache_dir)?;
38
39        let cache = Self {
40            cache_dir,
41            documents: Arc::new(DashMap::new()),
42            file_hashes: Arc::new(RwLock::new(HashMap::new())),
43            hit_count: Arc::new(RwLock::new(0)),
44            miss_count: Arc::new(RwLock::new(0)),
45            max_size_mb: 500, // Default 500MB cache
46            expiration_duration: Duration::from_secs(24 * 60 * 60), // 24 hours
47        };
48
49        // Load existing cache from disk
50        cache.load_from_disk()?;
51
52        Ok(cache)
53    }
54
55    pub fn get_document(&self, file_path: &Path) -> Result<Document> {
56        let hash = self.calculate_file_hash(file_path)?;
57
58        if let Some(cached) = self.documents.get(file_path) {
59            if cached.hash == hash && !self.is_expired(&cached.cached_at) {
60                // Update access count
61                self.documents.alter(file_path, |_, mut cached| {
62                    cached.access_count += 1;
63                    cached
64                });
65
66                *self.hit_count.write() += 1;
67                debug!("Cache hit for {}", file_path.display());
68                return Ok(cached.document.clone());
69            }
70            // Remove expired or outdated entry
71            self.documents.remove(file_path);
72        }
73
74        *self.miss_count.write() += 1;
75        debug!("Cache miss for {}", file_path.display());
76        Err(BuildError::Cache("Document not found in cache".to_string()).into())
77    }
78
79    pub fn store_document(&self, file_path: &Path, document: &Document) -> Result<()> {
80        let hash = self.calculate_file_hash(file_path)?;
81        let size_bytes = self.estimate_document_size(document);
82
83        let cached_doc = CachedDocument {
84            document: document.clone(),
85            hash: hash.clone(),
86            cached_at: Utc::now(),
87            access_count: 1,
88            size_bytes,
89        };
90
91        // Check if we need to evict some entries
92        self.evict_if_needed(size_bytes)?;
93
94        self.documents.insert(file_path.to_path_buf(), cached_doc);
95        self.file_hashes
96            .write()
97            .insert(file_path.to_path_buf(), hash.clone());
98
99        debug!(
100            "Cached document: {} ({} bytes)",
101            file_path.display(),
102            size_bytes
103        );
104
105        // Persist to disk asynchronously
106        self.persist_to_disk(file_path, document)?;
107
108        Ok(())
109    }
110
111    #[allow(dead_code)]
112    pub fn invalidate(&self, file_path: &Path) {
113        self.documents.remove(file_path);
114        self.file_hashes.write().remove(file_path);
115
116        // Remove from disk cache
117        let cache_file = self.get_cache_file_path(file_path);
118        if cache_file.exists() {
119            if let Err(e) = std::fs::remove_file(&cache_file) {
120                warn!(
121                    "Failed to remove cache file {}: {}",
122                    cache_file.display(),
123                    e
124                );
125            }
126        }
127
128        debug!("Invalidated cache for {}", file_path.display());
129    }
130
131    #[allow(dead_code)]
132    pub fn clear(&self) -> Result<()> {
133        self.documents.clear();
134        self.file_hashes.write().clear();
135        *self.hit_count.write() = 0;
136        *self.miss_count.write() = 0;
137
138        if self.cache_dir.exists() {
139            std::fs::remove_dir_all(&self.cache_dir)?;
140            std::fs::create_dir_all(&self.cache_dir)?;
141        }
142
143        debug!("Cleared all cache");
144        Ok(())
145    }
146
147    pub fn hit_count(&self) -> usize {
148        *self.hit_count.read()
149    }
150
151    #[allow(dead_code)]
152    pub fn miss_count(&self) -> usize {
153        *self.miss_count.read()
154    }
155
156    #[allow(dead_code)]
157    pub fn hit_ratio(&self) -> f64 {
158        let hits = *self.hit_count.read() as f64;
159        let misses = *self.miss_count.read() as f64;
160        if hits + misses > 0.0 {
161            hits / (hits + misses)
162        } else {
163            0.0
164        }
165    }
166
167    pub fn size_mb(&self) -> f64 {
168        let total_bytes: usize = self
169            .documents
170            .iter()
171            .map(|entry| entry.value().size_bytes)
172            .sum();
173        total_bytes as f64 / 1024.0 / 1024.0
174    }
175
176    fn calculate_file_hash(&self, file_path: &Path) -> Result<String> {
177        let content = std::fs::read(file_path)?;
178        let metadata = std::fs::metadata(file_path)?;
179
180        let mut hasher = Hasher::new();
181        hasher.update(&content);
182
183        // Include file metadata in hash
184        if let Ok(modified) = metadata.modified() {
185            if let Ok(duration) = modified.duration_since(UNIX_EPOCH) {
186                hasher.update(&duration.as_secs().to_le_bytes());
187            }
188        }
189
190        Ok(hasher.finalize().to_hex().to_string())
191    }
192
193    fn is_expired(&self, cached_at: &DateTime<Utc>) -> bool {
194        let now = Utc::now();
195        let elapsed = now.signed_duration_since(*cached_at);
196        elapsed.num_seconds() > self.expiration_duration.as_secs() as i64
197    }
198
199    fn estimate_document_size(&self, document: &Document) -> usize {
200        // Rough estimate of document size in memory
201        document.html.len()
202            + document.title.len()
203            + document.source_path.to_string_lossy().len()
204            + document.output_path.to_string_lossy().len()
205            + 1024 // Overhead for other fields
206    }
207
208    fn evict_if_needed(&self, new_size: usize) -> Result<()> {
209        let current_size_mb = self.size_mb();
210        let new_size_mb = (new_size as f64) / 1024.0 / 1024.0;
211
212        if current_size_mb + new_size_mb > self.max_size_mb as f64 {
213            self.evict_lru_entries(new_size_mb)?;
214        }
215
216        Ok(())
217    }
218
219    fn evict_lru_entries(&self, space_needed_mb: f64) -> Result<()> {
220        let mut entries: Vec<_> = self
221            .documents
222            .iter()
223            .map(|entry| {
224                (
225                    entry.key().clone(),
226                    entry.value().access_count,
227                    entry.value().size_bytes,
228                )
229            })
230            .collect();
231
232        // Sort by access count (LRU)
233        entries.sort_by_key(|(_, access_count, _)| *access_count);
234
235        let mut space_freed_mb = 0.0;
236        for (path, _, size_bytes) in entries {
237            if space_freed_mb >= space_needed_mb {
238                break;
239            }
240
241            self.documents.remove(&path);
242            self.file_hashes.write().remove(&path);
243            space_freed_mb += (size_bytes as f64) / 1024.0 / 1024.0;
244
245            debug!(
246                "Evicted {} from cache ({} MB)",
247                path.display(),
248                size_bytes as f64 / 1024.0 / 1024.0
249            );
250        }
251
252        Ok(())
253    }
254
255    fn load_from_disk(&self) -> Result<()> {
256        if !self.cache_dir.exists() {
257            return Ok(());
258        }
259
260        for entry in std::fs::read_dir(&self.cache_dir)? {
261            let entry = entry?;
262            if entry.file_type()?.is_file()
263                && entry.path().extension().is_some_and(|ext| ext == "json")
264            {
265                if let Err(e) = self.load_cache_file(&entry.path()) {
266                    warn!(
267                        "Failed to load cache file {}: {}",
268                        entry.path().display(),
269                        e
270                    );
271                }
272            }
273        }
274
275        debug!("Loaded {} documents from disk cache", self.documents.len());
276        Ok(())
277    }
278
279    fn load_cache_file(&self, cache_file: &Path) -> Result<()> {
280        let content = std::fs::read_to_string(cache_file)?;
281        let cached_doc: CachedDocument = serde_json::from_str(&content)?;
282
283        // Check if the cached document is still valid
284        if !self.is_expired(&cached_doc.cached_at) {
285            let source_path = &cached_doc.document.source_path;
286            if source_path.exists() {
287                let current_hash = self.calculate_file_hash(source_path)?;
288                if current_hash == cached_doc.hash {
289                    self.documents.insert(source_path.clone(), cached_doc);
290                }
291            }
292        }
293
294        Ok(())
295    }
296
297    fn persist_to_disk(&self, file_path: &Path, _document: &Document) -> Result<()> {
298        let cache_file = self.get_cache_file_path(file_path);
299        if let Some(parent) = cache_file.parent() {
300            std::fs::create_dir_all(parent)?;
301        }
302
303        if let Some(cached_doc) = self.documents.get(file_path) {
304            let content = serde_json::to_string_pretty(&*cached_doc)?;
305            std::fs::write(&cache_file, content)?;
306        }
307
308        Ok(())
309    }
310
311    fn get_cache_file_path(&self, file_path: &Path) -> PathBuf {
312        let hash = blake3::hash(file_path.to_string_lossy().as_bytes());
313        let filename = format!("{}.json", hash.to_hex());
314        self.cache_dir.join(filename)
315    }
316}