diff options
| author | Patrick Simianer <patrick@lilt.com> | 2026-02-26 19:28:22 +0100 |
|---|---|---|
| committer | Patrick Simianer <patrick@lilt.com> | 2026-02-26 19:28:22 +0100 |
| commit | 0abcdd7e4358cb902c320b008d3c04bde07b749e (patch) | |
| tree | f26bd36cc16b792ef4acf5450ef9293b55179167 /rs/src/hypergraph.rs | |
| parent | 4e62908a1757f83ff703399252ad50758c4eb237 (diff) | |
Add Rust implementation of SCFG decoder
Rust port of the Ruby prototype decoder with performance
optimizations for real Hiero-style grammars:
- Rule indexing by first terminal/NT symbol for fast lookup
- Chart symbol interning (u16 IDs) instead of string hashing
- Passive chart index by (symbol, left) for direct right-endpoint lookup
- Items store rule index instead of cloned rule data
Includes CKY+ parser, chart-to-hypergraph conversion, Viterbi
decoding, derivation extraction, and JSON hypergraph I/O.
Self-filling step in parse uses grammar lookup (not just
remaining active items) to handle rules that were consumed
during the parse loop or skipped by the has_any_at optimization.
Produces identical output to the Ruby prototype on all test examples.
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Diffstat (limited to 'rs/src/hypergraph.rs')
| -rw-r--r-- | rs/src/hypergraph.rs | 115 |
1 files changed, 115 insertions, 0 deletions
diff --git a/rs/src/hypergraph.rs b/rs/src/hypergraph.rs new file mode 100644 index 0000000..90069b0 --- /dev/null +++ b/rs/src/hypergraph.rs @@ -0,0 +1,115 @@ +use crate::grammar::Rule; +use crate::sparse_vector::SparseVector; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct NodeId(pub usize); + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub struct EdgeId(pub usize); + +#[derive(Debug, Clone)] +pub struct Node { + pub id: i64, + pub symbol: String, + pub left: i32, + pub right: i32, + pub outgoing: Vec<EdgeId>, + pub incoming: Vec<EdgeId>, + pub score: f64, +} + +impl Node { + pub fn new(id: i64, symbol: &str, left: i32, right: i32) -> Self { + Self { + id, + symbol: symbol.to_string(), + left, + right, + outgoing: Vec::new(), + incoming: Vec::new(), + score: 0.0, + } + } +} + +#[derive(Debug, Clone)] +pub struct Edge { + pub head: NodeId, + pub tails: Vec<NodeId>, + pub score: f64, + pub f: SparseVector, + pub mark: usize, + pub rule: Rule, +} + +impl Edge { + pub fn arity(&self) -> usize { + self.tails.len() + } + + pub fn marked(&self) -> bool { + self.arity() == self.mark + } +} + +#[derive(Debug)] +pub struct Hypergraph { + pub nodes: Vec<Node>, + pub edges: Vec<Edge>, + pub nodes_by_id: std::collections::HashMap<i64, NodeId>, +} + +impl Hypergraph { + pub fn new() -> Self { + Self { + nodes: Vec::new(), + edges: Vec::new(), + nodes_by_id: std::collections::HashMap::new(), + } + } + + pub fn add_node(&mut self, id: i64, symbol: &str, left: i32, right: i32) -> NodeId { + let nid = NodeId(self.nodes.len()); + self.nodes.push(Node::new(id, symbol, left, right)); + self.nodes_by_id.insert(id, nid); + nid + } + + pub fn add_edge( + &mut self, + head: NodeId, + tails: Vec<NodeId>, + score: f64, + f: SparseVector, + rule: Rule, + ) -> EdgeId { + let eid = EdgeId(self.edges.len()); + self.edges.push(Edge { + head, + tails: tails.clone(), + score, + f, + mark: 0, + rule, + }); + self.nodes[head.0].incoming.push(eid); + for &t in &tails { + self.nodes[t.0].outgoing.push(eid); + } + eid + } + + pub fn node(&self, nid: NodeId) -> &Node { + &self.nodes[nid.0] + } + + pub fn edge(&self, eid: EdgeId) -> &Edge { + &self.edges[eid.0] + } + + pub fn reset(&mut self) { + for e in &mut self.edges { + e.mark = 0; + } + } +} |
