155 lines
4.6 KiB
Rust
155 lines
4.6 KiB
Rust
mod cluster;
|
|
mod eat;
|
|
mod pattern;
|
|
use crate::{lojbanic::starts_with_permissible_initial_pair, Token};
|
|
use cluster::{clusterise, Cluster, ClusterKind};
|
|
use eat::*;
|
|
use pattern::Pattern;
|
|
|
|
fn matches_gismu(clusters: &[Cluster]) -> bool {
|
|
if Pattern::CVCCV.matches_strict(clusters) || Pattern::CCVCV.matches_strict(clusters) {
|
|
if let Some(Cluster { s: _, kind }) = clusters.get(5) {
|
|
(match kind {
|
|
ClusterKind::Consonant | ClusterKind::Number | ClusterKind::Whitespace => true,
|
|
ClusterKind::Vowel => false,
|
|
// pretty sure these are unreachable at this point
|
|
ClusterKind::Huhboo => false,
|
|
ClusterKind::Glide => false,
|
|
ClusterKind::Unknown => false,
|
|
}) && { clusters.iter().take(4).fold(0, |a, c| c.len() + a) == 5 }
|
|
} else {
|
|
true
|
|
}
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
fn matches_lujvo(clusters: &[Cluster]) -> bool {
|
|
Pattern::CCVCCV.matches(clusters)
|
|
|| Pattern::CCV.matches(clusters)
|
|
|| if Pattern::VCC.matches(clusters) {
|
|
clusters[0].len() <= 3
|
|
} else {
|
|
false
|
|
}
|
|
|| if Pattern::CVCCV.matches(clusters) {
|
|
clusters[0].len() + clusters[1].len() <= 3
|
|
&& !starts_with_permissible_initial_pair(&clusters[2].s)
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
fn matches_cmavo(clusters: &[Cluster]) -> bool {
|
|
if Pattern::CVCCV.matches(clusters) {
|
|
starts_with_permissible_initial_pair(&clusters[2].s)
|
|
} else {
|
|
Pattern::CVCCVCCV.matches(clusters)
|
|
|| Pattern::CV.matches(clusters)
|
|
|| Pattern::V.matches(clusters)
|
|
}
|
|
}
|
|
|
|
fn matches_cmevla(clusters: &[Cluster]) -> bool {
|
|
let mut previous_was_consonant = false;
|
|
for Cluster { s: _, kind } in clusters {
|
|
previous_was_consonant = match kind {
|
|
ClusterKind::Consonant => true,
|
|
ClusterKind::Huhboo | ClusterKind::Glide | ClusterKind::Vowel => false,
|
|
ClusterKind::Unknown | ClusterKind::Whitespace => return previous_was_consonant,
|
|
ClusterKind::Number => break,
|
|
};
|
|
}
|
|
previous_was_consonant
|
|
}
|
|
|
|
fn matches_unknown(clusters: &[Cluster]) -> bool {
|
|
for Cluster { s: _, kind } in clusters {
|
|
if let ClusterKind::Unknown = kind {
|
|
return true;
|
|
}
|
|
if let ClusterKind::Whitespace = kind {
|
|
return false;
|
|
}
|
|
}
|
|
false
|
|
}
|
|
|
|
fn lex_inner(stripped: bool, src: &str) -> Vec<Token> {
|
|
let mut output = Vec::new();
|
|
let clusters = clusterise(src);
|
|
let mut rest = clusters.as_slice();
|
|
loop {
|
|
output.push(if matches_unknown(rest) {
|
|
let (buf, new_rest) = eat_non_lojban(rest);
|
|
rest = new_rest;
|
|
Token::unknown(buf.as_str())
|
|
} else if matches_cmevla(rest) {
|
|
let (buf, new_rest) = eat_cmevla(rest);
|
|
rest = new_rest;
|
|
Token::cmevla(buf.as_str())
|
|
} else if matches_gismu(rest) {
|
|
let (buf, new_rest) = eat_gismu(rest);
|
|
rest = new_rest;
|
|
Token::brivla(buf.as_str())
|
|
} else if matches_lujvo(rest) {
|
|
let (buf, new_rest) = eat_lujvo(rest);
|
|
rest = new_rest;
|
|
Token::brivla(buf.as_str())
|
|
} else if matches_cmavo(rest) {
|
|
let (buf, new_rest) = eat_cmavo(rest);
|
|
rest = new_rest;
|
|
Token::cmavo(buf.as_str())
|
|
} else {
|
|
match rest.get(0) {
|
|
Some(Cluster {
|
|
s: _,
|
|
kind: ClusterKind::Number,
|
|
}) => {
|
|
let (buf, new_rest) = eat_number(rest);
|
|
rest = new_rest;
|
|
Token::number(buf.as_str())
|
|
}
|
|
Some(Cluster {
|
|
s: _,
|
|
kind: ClusterKind::Unknown,
|
|
}) => {
|
|
let (buf, new_rest) = eat_non_lojban(rest);
|
|
rest = new_rest;
|
|
Token::unknown(buf.as_str())
|
|
}
|
|
Some(Cluster {
|
|
s: _,
|
|
kind: ClusterKind::Whitespace,
|
|
}) => {
|
|
let (buf, new_rest) = eat_whitespace(rest);
|
|
rest = new_rest;
|
|
if stripped {
|
|
continue;
|
|
} else {
|
|
Token::whitespace(buf.as_str())
|
|
}
|
|
}
|
|
Some(Cluster { s: _, kind: _ }) => {
|
|
let (buf, new_rest) = eat_non_lojban(rest);
|
|
rest = new_rest;
|
|
Token::unknown(buf.as_str())
|
|
}
|
|
None => break,
|
|
}
|
|
});
|
|
}
|
|
output
|
|
}
|
|
|
|
/// Lex tokens, including whitespace
///
/// Tokenises `src` and keeps whitespace runs as tokens in the output,
/// so the original text can be reconstructed from the token stream.
pub fn lex(src: &str) -> Vec<Token> {
    lex_inner(false, src)
}
|
|
|
|
/// Lex tokens, stripping whitespace
///
/// Tokenises `src` but discards whitespace runs; the output contains
/// only word, number, and unknown tokens.
pub fn lex_stripped(src: &str) -> Vec<Token> {
    lex_inner(true, src)
}
|