summaryrefslogtreecommitdiff
path: root/rs/src/parse.rs
diff options
context:
space:
mode:
Diffstat (limited to 'rs/src/parse.rs')
-rw-r--r--rs/src/parse.rs347
1 files changed, 347 insertions, 0 deletions
diff --git a/rs/src/parse.rs b/rs/src/parse.rs
new file mode 100644
index 0000000..7f64a89
--- /dev/null
+++ b/rs/src/parse.rs
@@ -0,0 +1,347 @@
+use crate::grammar::{Grammar, Symbol};
+
+#[derive(Debug, Clone)]
+pub struct Span {
+ pub left: i32,
+ pub right: i32,
+}
+
+#[derive(Debug, Clone)]
+pub struct Item {
+ pub rule_idx: usize,
+ pub left: usize,
+ pub right: usize,
+ pub dot: usize,
+ pub tail_spans: Vec<Option<Span>>,
+}
+
+impl Item {
+ pub fn from_rule(rule_idx: usize, rhs: &[Symbol], left: usize, right: usize, dot: usize) -> Self {
+ let tail_spans = rhs
+ .iter()
+ .map(|sym| {
+ if sym.is_nt() {
+ Some(Span {
+ left: -1,
+ right: -1,
+ })
+ } else {
+ None
+ }
+ })
+ .collect();
+
+ Self {
+ rule_idx,
+ left,
+ right,
+ dot,
+ tail_spans,
+ }
+ }
+
+ pub fn advance(&self, left: usize, right: usize, dot: usize) -> Self {
+ Self {
+ rule_idx: self.rule_idx,
+ left,
+ right,
+ dot,
+ tail_spans: self.tail_spans.clone(),
+ }
+ }
+}
+
+pub struct Chart {
+ _n: usize,
+ m: Vec<Vec<Vec<Item>>>,
+ b: std::collections::HashSet<(u16, u16, u16)>,
+ /// Index: (symbol_id, left) -> sorted vec of right endpoints
+ spans_by_left: std::collections::HashMap<(u16, u16), Vec<u16>>,
+ sym_to_id: std::collections::HashMap<String, u16>,
+ next_sym_id: u16,
+}
+
+impl Chart {
+ pub fn new(n: usize) -> Self {
+ let mut m = Vec::with_capacity(n + 1);
+ for _ in 0..=n {
+ let mut row = Vec::with_capacity(n + 1);
+ for _ in 0..=n {
+ row.push(Vec::new());
+ }
+ m.push(row);
+ }
+ Self {
+ _n: n,
+ m,
+ b: std::collections::HashSet::new(),
+ spans_by_left: std::collections::HashMap::new(),
+ sym_to_id: std::collections::HashMap::new(),
+ next_sym_id: 0,
+ }
+ }
+
+ fn sym_id(&mut self, symbol: &str) -> u16 {
+ if let Some(&id) = self.sym_to_id.get(symbol) {
+ id
+ } else {
+ let id = self.next_sym_id;
+ self.next_sym_id += 1;
+ self.sym_to_id.insert(symbol.to_string(), id);
+ id
+ }
+ }
+
+ fn sym_id_lookup(&self, symbol: &str) -> Option<u16> {
+ self.sym_to_id.get(symbol).copied()
+ }
+
+ pub fn at(&self, i: usize, j: usize) -> &Vec<Item> {
+ &self.m[i][j]
+ }
+
+ pub fn at_mut(&mut self, i: usize, j: usize) -> &mut Vec<Item> {
+ &mut self.m[i][j]
+ }
+
+ pub fn add(&mut self, item: Item, i: usize, j: usize, symbol: &str) {
+ let sid = self.sym_id(symbol);
+ if self.b.insert((i as u16, j as u16, sid)) {
+ self.spans_by_left
+ .entry((sid, i as u16))
+ .or_default()
+ .push(j as u16);
+ }
+ self.m[i][j].push(item);
+ }
+
+ pub fn has(&self, symbol: &str, i: usize, j: usize) -> bool {
+ if let Some(&sid) = self.sym_to_id.get(symbol) {
+ self.b.contains(&(i as u16, j as u16, sid))
+ } else {
+ false
+ }
+ }
+
+ /// Returns right endpoints where (symbol, left, right) exists
+ pub fn rights_for(&self, symbol: &str, left: usize) -> &[u16] {
+ static EMPTY: Vec<u16> = Vec::new();
+ if let Some(sid) = self.sym_id_lookup(symbol) {
+ if let Some(rights) = self.spans_by_left.get(&(sid, left as u16)) {
+ return rights;
+ }
+ }
+ &EMPTY
+ }
+
+ /// Check if any entry for (symbol, left, *) exists
+ pub fn has_any_at(&self, symbol: &str, left: usize) -> bool {
+ if let Some(sid) = self.sym_id_lookup(symbol) {
+ self.spans_by_left.contains_key(&(sid, left as u16))
+ } else {
+ false
+ }
+ }
+}
+
+/// Visit spans bottom-up: from span size `from` up.
+/// Yields (left, right) pairs.
+pub fn visit<F: FnMut(usize, usize)>(from: usize, l: usize, r: usize, x: usize, mut f: F) {
+ for span in from..=(r - x) {
+ for k in l..=(r - span) {
+ f(k, k + span);
+ }
+ }
+}
+
+pub fn init(input: &[String], n: usize, _active_chart: &mut Chart, passive_chart: &mut Chart, grammar: &Grammar) {
+ for i in 0..n {
+ if let Some(matching) = grammar.flat_by_first_word.get(&input[i]) {
+ for &fi in matching {
+ let rule = &grammar.rules[fi];
+ let rhs_len = rule.rhs.len();
+ if i + rhs_len > n {
+ continue;
+ }
+ let matches = rule
+ .rhs
+ .iter()
+ .enumerate()
+ .skip(1)
+ .all(|(k, s)| s.word() == input[i + k]);
+ if matches {
+ let item = Item::from_rule(fi, &rule.rhs, i, i + rhs_len, rhs_len);
+ passive_chart.add(item, i, i + rhs_len, rule.lhs.nt_symbol());
+ }
+ }
+ }
+ }
+}
+
+fn scan(item: &mut Item, input: &[String], limit: usize, grammar: &Grammar) -> bool {
+ let rhs = &grammar.rules[item.rule_idx].rhs;
+ while item.dot < rhs.len() && rhs[item.dot].is_t() {
+ if item.right == limit {
+ return false;
+ }
+ if rhs[item.dot].word() == input[item.right] {
+ item.dot += 1;
+ item.right += 1;
+ } else {
+ return false;
+ }
+ }
+ true
+}
+
+pub fn parse(
+ input: &[String],
+ n: usize,
+ active_chart: &mut Chart,
+ passive_chart: &mut Chart,
+ grammar: &Grammar,
+) {
+ visit(1, 0, n, 0, |i, j| {
+ // Try to apply rules starting with T
+ if let Some(matching) = grammar.start_t_by_word.get(&input[i]) {
+ for &ri in matching {
+ let mut new_item = Item::from_rule(ri, &grammar.rules[ri].rhs, i, i, 0);
+ if scan(&mut new_item, input, j, grammar) {
+ active_chart.at_mut(i, j).push(new_item);
+ }
+ }
+ }
+
+ // Seed active chart with rules starting with NT
+ for (symbol, rule_indices) in &grammar.start_nt_by_symbol {
+ if !passive_chart.has_any_at(symbol, i) {
+ continue;
+ }
+ for &ri in rule_indices {
+ let rule = &grammar.rules[ri];
+ if rule.rhs.len() > j - i {
+ continue;
+ }
+ active_chart
+ .at_mut(i, j)
+ .push(Item::from_rule(ri, &rule.rhs, i, i, 0));
+ }
+ }
+
+ // Parse
+ let mut new_symbols: Vec<String> = Vec::new();
+ let mut remaining_items: Vec<Item> = Vec::new();
+
+ while !active_chart.at(i, j).is_empty() {
+ let active_item = active_chart.at_mut(i, j).pop().unwrap();
+ let mut advanced = false;
+
+ let active_rhs = &grammar.rules[active_item.rule_idx].rhs;
+ if active_item.dot < active_rhs.len() && active_rhs[active_item.dot].is_nt() {
+ let wanted_symbol = active_rhs[active_item.dot].nt_symbol();
+ let k = active_item.right;
+
+ // Use index to directly get matching right endpoints
+ let rights: Vec<u16> = passive_chart
+ .rights_for(wanted_symbol, k)
+ .iter()
+ .filter(|&&r| (r as usize) <= j)
+ .copied()
+ .collect();
+
+ for l16 in rights {
+ let l = l16 as usize;
+
+ let mut new_item =
+ active_item.advance(active_item.left, l, active_item.dot + 1);
+ new_item.tail_spans[new_item.dot - 1] = Some(Span {
+ left: k as i32,
+ right: l as i32,
+ });
+
+ let new_rhs = &grammar.rules[new_item.rule_idx].rhs;
+ if scan(&mut new_item, input, j, grammar) {
+ if new_item.dot == new_rhs.len() {
+ if new_item.left == i && new_item.right == j {
+ let sym_str = grammar.rules[new_item.rule_idx].lhs.nt_symbol();
+ if !new_symbols.iter().any(|s| s == sym_str) {
+ new_symbols.push(sym_str.to_string());
+ }
+ passive_chart.add(new_item, i, j, sym_str);
+ advanced = true;
+ }
+ } else if new_item.right + (new_rhs.len() - new_item.dot) <= j {
+ active_chart.at_mut(i, j).push(new_item);
+ advanced = true;
+ }
+ }
+ }
+ }
+
+ if !advanced {
+ remaining_items.push(active_item);
+ }
+ }
+
+ // Self-filling step
+ let mut si = 0;
+ while si < new_symbols.len() {
+ let s = new_symbols[si].clone();
+
+ // Try start_nt rules from the grammar for this new symbol.
+ // This handles rules that weren't seeded in step 2 because
+ // the symbol didn't exist in the passive chart at that point.
+ if let Some(rule_indices) = grammar.start_nt_by_symbol.get(&s) {
+ for &ri in rule_indices {
+ let rule = &grammar.rules[ri];
+ if rule.rhs.len() > j - i {
+ continue;
+ }
+ let mut new_item = Item::from_rule(ri, &rule.rhs, i, i, 0);
+ new_item.dot = 1;
+ new_item.right = j;
+ new_item.tail_spans[0] = Some(Span {
+ left: i as i32,
+ right: j as i32,
+ });
+ if new_item.dot == rule.rhs.len() {
+ let sym_str = rule.lhs.nt_symbol();
+ if !new_symbols.iter().any(|ns| ns == sym_str) {
+ new_symbols.push(sym_str.to_string());
+ }
+ passive_chart.add(new_item, i, j, sym_str);
+ }
+ }
+ }
+
+ // Also check remaining active items (for rules that were
+ // seeded but couldn't advance during the parse loop)
+ for item in &remaining_items {
+ if item.dot != 0 {
+ continue;
+ }
+ let item_rhs = &grammar.rules[item.rule_idx].rhs;
+ if !item_rhs[item.dot].is_nt() {
+ continue;
+ }
+ if item_rhs[item.dot].nt_symbol() != s {
+ continue;
+ }
+ let mut new_item = item.advance(i, j, item.dot + 1);
+ new_item.tail_spans[new_item.dot - 1] = Some(Span {
+ left: i as i32,
+ right: j as i32,
+ });
+ let new_rhs = &grammar.rules[new_item.rule_idx].rhs;
+ if new_item.dot == new_rhs.len() {
+ let sym_str = grammar.rules[new_item.rule_idx].lhs.nt_symbol();
+ if !new_symbols.iter().any(|ns| ns == sym_str) {
+ new_symbols.push(sym_str.to_string());
+ }
+ passive_chart.add(new_item, i, j, sym_str);
+ }
+ }
+ si += 1;
+ }
+ });
+}