diff options
| author | Patrick Simianer <patrick@lilt.com> | 2026-02-26 19:28:22 +0100 |
|---|---|---|
| committer | Patrick Simianer <patrick@lilt.com> | 2026-02-26 19:28:22 +0100 |
| commit | 0abcdd7e4358cb902c320b008d3c04bde07b749e (patch) | |
| tree | f26bd36cc16b792ef4acf5450ef9293b55179167 /rs/src/chart_to_hg.rs | |
| parent | 4e62908a1757f83ff703399252ad50758c4eb237 (diff) | |
Add Rust implementation of SCFG decoder
Rust port of the Ruby prototype decoder with performance
optimizations for real Hiero-style grammars:
- Rule indexing by first terminal/NT symbol for fast lookup
- Chart symbol interning (u16 IDs) instead of string hashing
- Passive chart index by (symbol, left) for direct right-endpoint lookup
- Items store rule index instead of cloned rule data
Includes CKY+ parser, chart-to-hypergraph conversion, Viterbi
decoding, derivation extraction, and JSON hypergraph I/O.
Self-filling step in parse uses grammar lookup (not just
remaining active items) to handle rules that were consumed
during the parse loop or skipped by the has_any_at optimization.
Produces identical output to the Ruby prototype on all test examples.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat (limited to 'rs/src/chart_to_hg.rs')
| -rw-r--r-- | rs/src/chart_to_hg.rs | 95 |
1 files changed, 95 insertions, 0 deletions
diff --git a/rs/src/chart_to_hg.rs b/rs/src/chart_to_hg.rs new file mode 100644 index 0000000..95f642e --- /dev/null +++ b/rs/src/chart_to_hg.rs @@ -0,0 +1,95 @@ +use std::collections::HashMap; + +use crate::grammar::{Grammar, Rule, Symbol}; +use crate::hypergraph::{Hypergraph, NodeId}; +use crate::parse::{Chart, visit}; +use crate::sparse_vector::SparseVector; + +pub fn chart_to_hg(chart: &Chart, n: usize, weights: &SparseVector, grammar: &Grammar) -> Hypergraph { + let mut hg = Hypergraph::new(); + let mut seen: HashMap<String, NodeId> = HashMap::new(); + + // Add root node with id=-1 + let root_nid = hg.add_node(-1, "root", -1, -1); + + let mut next_id: i64 = 0; + + // First pass: create nodes + visit(1, 0, n, 0, |i, j| { + for item in chart.at(i, j) { + let lhs_sym = grammar.rules[item.rule_idx].lhs.nt_symbol(); + let key = format!("{},{},{}", lhs_sym, i, j); + if !seen.contains_key(&key) { + let nid = hg.add_node(next_id, lhs_sym, i as i32, j as i32); + seen.insert(key, nid); + next_id += 1; + } + } + }); + + // Second pass: create edges + visit(1, 0, n, 0, |i, j| { + for item in chart.at(i, j) { + let rule = &grammar.rules[item.rule_idx]; + let head_key = format!("{},{},{}", rule.lhs.nt_symbol(), i, j); + let head_nid = *seen.get(&head_key).unwrap(); + + // Build tails + let tails: Vec<NodeId> = if item.tail_spans.iter().all(|s| s.is_none()) + || item.tail_spans.iter().all(|s| { + s.as_ref() + .map_or(true, |sp| sp.left == -1 && sp.right == -1) + }) + { + if rule.rhs.iter().any(|s| s.is_nt()) { + build_tails(&rule.rhs, &item.tail_spans, &seen, root_nid) + } else { + vec![root_nid] + } + } else { + build_tails(&rule.rhs, &item.tail_spans, &seen, root_nid) + }; + + let score = weights.dot(&rule.f).exp(); + let rule_clone = Rule::new( + rule.lhs.clone(), + rule.rhs.clone(), + rule.target.clone(), + rule.map.clone(), + rule.f.clone(), + ); + + hg.add_edge(head_nid, tails, score, rule.f.clone(), rule_clone); + } + }); + + hg +} + +fn build_tails( + rhs: &[Symbol], + tail_spans: &[Option<crate::parse::Span>], + seen: &HashMap<String, NodeId>, + root_nid: NodeId, +) -> Vec<NodeId> { + let mut tails = Vec::new(); + let mut has_nt = false; + for (idx, sym) in rhs.iter().enumerate() { + if let Symbol::NT { symbol, .. } = sym { + has_nt = true; + if let Some(Some(span)) = tail_spans.get(idx) { + if span.left >= 0 && span.right >= 0 { + let key = format!("{},{},{}", symbol, span.left, span.right); + if let Some(&nid) = seen.get(&key) { + tails.push(nid); + } + } + } + } + } + if !has_nt { + vec![root_nid] + } else { + tails + } +} |
