summaryrefslogtreecommitdiff
path: root/rs/src/sparse_vector.rs
diff options
context:
space:
mode:
author Patrick Simianer <patrick@lilt.com> 2026-02-26 19:28:22 +0100
committer Patrick Simianer <patrick@lilt.com> 2026-02-26 19:28:22 +0100
commit 0abcdd7e4358cb902c320b008d3c04bde07b749e (patch)
tree f26bd36cc16b792ef4acf5450ef9293b55179167 /rs/src/sparse_vector.rs
parent 4e62908a1757f83ff703399252ad50758c4eb237 (diff)
Add Rust implementation of SCFG decoder
Rust port of the Ruby prototype decoder with performance optimizations for real Hiero-style grammars:
- Rule indexing by first terminal/NT symbol for fast lookup
- Chart symbol interning (u16 IDs) instead of string hashing
- Passive chart index by (symbol, left) for direct right-endpoint lookup
- Items store rule index instead of cloned rule data

Includes CKY+ parser, chart-to-hypergraph conversion, Viterbi decoding, derivation extraction, and JSON hypergraph I/O.

Self-filling step in parse uses grammar lookup (not just remaining active items) to handle rules that were consumed during the parse loop or skipped by the has_any_at optimization.

Produces identical output to the Ruby prototype on all test examples.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat (limited to 'rs/src/sparse_vector.rs')
-rw-r--r--rs/src/sparse_vector.rs86
1 files changed, 86 insertions, 0 deletions
diff --git a/rs/src/sparse_vector.rs b/rs/src/sparse_vector.rs
new file mode 100644
index 0000000..4e62f95
--- /dev/null
+++ b/rs/src/sparse_vector.rs
@@ -0,0 +1,86 @@
+use std::collections::HashMap;
+
/// A sparse vector keyed by feature name.
///
/// Only features that were explicitly set are stored; the derived `Default`
/// yields a vector with no entries.
#[derive(Clone, Debug, Default)]
pub struct SparseVector {
    /// Stored entries: feature name mapped to its `f64` value.
    pub map: HashMap<String, f64>,
}
+
+impl SparseVector {
+ pub fn new() -> Self {
+ Self {
+ map: HashMap::new(),
+ }
+ }
+
+ pub fn from_kv(s: &str, kv_sep: char, pair_sep: char) -> Self {
+ let mut map = HashMap::new();
+ for pair in s.split(pair_sep) {
+ let pair = pair.trim();
+ if pair.is_empty() {
+ continue;
+ }
+ if let Some((k, v)) = pair.split_once(kv_sep) {
+ if let Ok(val) = v.trim().parse::<f64>() {
+ map.insert(k.trim().to_string(), val);
+ }
+ }
+ }
+ Self { map }
+ }
+
+ pub fn from_hash(h: &serde_json::Map<String, serde_json::Value>) -> Self {
+ let mut map = HashMap::new();
+ for (k, v) in h {
+ if let Some(val) = v.as_f64() {
+ map.insert(k.clone(), val);
+ }
+ }
+ Self { map }
+ }
+
+ pub fn dot(&self, other: &SparseVector) -> f64 {
+ let mut sum = 0.0;
+ for (k, v) in &self.map {
+ if let Some(ov) = other.map.get(k) {
+ sum += v * ov;
+ }
+ }
+ sum
+ }
+
+ pub fn to_json(&self) -> serde_json::Value {
+ let map: serde_json::Map<String, serde_json::Value> = self
+ .map
+ .iter()
+ .map(|(k, v)| (k.clone(), serde_json::Value::from(*v)))
+ .collect();
+ serde_json::Value::Object(map)
+ }
+}
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Newline-separated pairs with a space between key and value.
    #[test]
    fn test_from_kv() {
        let parsed = SparseVector::from_kv("logp 2\nuse_house 0\nuse_shell 1", ' ', '\n');
        for (name, want) in [("logp", 2.0), ("use_house", 0.0), ("use_shell", 1.0)] {
            assert_eq!(parsed.map[name], want);
        }
    }

    /// Only the shared keys `x` and `y` contribute: 1*3 + 2*4.
    #[test]
    fn test_dot() {
        let lhs = SparseVector::from_kv("x=1 y=2", '=', ' ');
        let rhs = SparseVector::from_kv("x=3 y=4 z=5", '=', ' ');
        let product = lhs.dot(&rhs);
        assert_eq!(product, 11.0);
    }

    /// An empty vector shares no keys with anything, so the product is zero.
    #[test]
    fn test_empty_dot() {
        let empty = SparseVector::new();
        let single = SparseVector::from_kv("x=1", '=', ' ');
        let product = empty.dot(&single);
        assert_eq!(product, 0.0);
    }
}