elf_lojban/src/lex/mod.rs

261 lines
8.1 KiB
Rust

mod cluster;
mod pattern;
use crate::{lojbanic::starts_with_permissible_initial_pair, strange::StrRange, Token};
use cluster::{clusterise, Cluster, ClusterKind};
use pattern::Pattern;
/// Reports whether `clusters` starts with something shaped like a gismu.
///
/// The clusters must strictly match `CVCCV` or `CCVCV`. If a cluster exists at index 5 it must
/// be a consonant, number or whitespace cluster, and the first four clusters must total exactly
/// five in length. If no cluster exists at index 5, the pattern match alone is sufficient and
/// the length is not checked.
///
/// NOTE(review): `eat_gismu` consumes four clusters, so the cluster directly after the word
/// would be index 4, not 5 — confirm the index used here is intended.
fn matches_gismu(clusters: &[Cluster]) -> bool {
    let shape_ok =
        Pattern::CVCCV.matches_strict(clusters) || Pattern::CCVCV.matches_strict(clusters);
    if !shape_ok {
        return false;
    }

    match clusters.get(5) {
        None => true,
        Some(follower) => {
            let boundary_ok = match follower.kind {
                ClusterKind::Consonant | ClusterKind::Number | ClusterKind::Whitespace => true,
                // Huhboo, Glide and Unknown are believed unreachable once the pattern matched.
                ClusterKind::Vowel
                | ClusterKind::Huhboo
                | ClusterKind::Glide
                | ClusterKind::Unknown => false,
            };
            boundary_ok
                && clusters
                    .iter()
                    .take(4)
                    .fold(0, |total, cluster| total + cluster.len())
                    == 5
        }
    }
}
/// Reports whether `clusters` starts with something shaped like a lujvo.
///
/// Accepts a `CCVCCV` or `CCV` match outright. A `CVCCV` match is accepted only when the first
/// two clusters together are at most three in length.
fn matches_lujvo(clusters: &[Cluster]) -> bool {
    if Pattern::CCVCCV.matches(clusters) || Pattern::CCV.matches(clusters) {
        return true;
    }

    Pattern::CVCCV.matches(clusters)
        && clusters
            .iter()
            .take(2)
            .fold(0, |total, cluster| total + cluster.len())
            <= 3
}
/// Reports whether `clusters` starts with something shaped like a cmavo.
///
/// When the clusters match `CVCCV`, the answer is whatever
/// `starts_with_permissible_initial_pair` says about the third cluster's text. Otherwise a
/// match against `CVCCVCCV`, `CV` or `V` is accepted.
fn matches_cmavo(clusters: &[Cluster]) -> bool {
    if Pattern::CVCCV.matches(clusters) {
        // Index 2 is the cluster sitting between the two vowels of the matched pattern.
        return starts_with_permissible_initial_pair(&clusters[2].s);
    }

    Pattern::CVCCVCCV.matches(clusters)
        || Pattern::CV.matches(clusters)
        || Pattern::V.matches(clusters)
}
/// Reports whether `clusters` starts with something shaped like a cmevla.
///
/// Scans clusters until the first number cluster or the end of the slice and returns `true`
/// when the last cluster scanned was a consonant. Any unknown or whitespace cluster met before
/// that point makes the answer `false`. An empty slice yields `false`.
fn matches_cmevla(clusters: &[Cluster]) -> bool {
    let mut ends_on_consonant = false;
    for cluster in clusters {
        ends_on_consonant = match cluster.kind {
            ClusterKind::Number => break,
            ClusterKind::Unknown | ClusterKind::Whitespace => return false,
            ClusterKind::Consonant => true,
            ClusterKind::Vowel | ClusterKind::Huhboo | ClusterKind::Glide => false,
        };
    }
    ends_on_consonant
}
/// Reports whether an unknown cluster occurs before the first whitespace cluster.
///
/// Returns `false` when whitespace comes first or when the slice holds neither kind.
fn matches_unknown(clusters: &[Cluster]) -> bool {
    clusters
        .iter()
        .find_map(|cluster| match cluster.kind {
            ClusterKind::Unknown => Some(true),
            ClusterKind::Whitespace => Some(false),
            _ => None,
        })
        .unwrap_or(false)
}
/// Consumes the leading run of consonant, vowel, huhboo and glide clusters.
///
/// Returns a range covering that run (starting where the first cluster starts) together with
/// the clusters that follow it. Indexes `rest[0]`, so it panics on an empty slice.
fn eat_cmevla<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
    let head = &rest[0].s;
    let mut span = StrRange::new(head.src(), head.start(), 0);

    let consumed = rest
        .iter()
        .take_while(|cluster| {
            matches!(
                cluster.kind,
                ClusterKind::Consonant
                    | ClusterKind::Vowel
                    | ClusterKind::Huhboo
                    | ClusterKind::Glide
            )
        })
        .count();
    for cluster in &rest[..consumed] {
        span.increase_length(cluster.s.len());
    }

    (span, &rest[consumed..])
}
/// Consumes exactly four clusters as one word.
///
/// Returns a range covering those four clusters and the slice that follows them. Panics when
/// `rest` holds fewer than four clusters.
fn eat_gismu<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
    const GISMU_CLUSTERS: usize = 4;

    let head = &rest[0].s;
    let mut span = StrRange::new(head.src(), head.start(), 0);
    for cluster in rest[..GISMU_CLUSTERS].iter() {
        span.increase_length(cluster.len());
    }

    (span, &rest[GISMU_CLUSTERS..])
}
/// Consumes a run of letter-like clusters from the front of `rest` as one word.
///
/// Returns the range accumulated so far (starting where the first cluster starts) and the slice
/// of clusters beginning at `new_offset`. Indexes `rest[0]`, so it panics on an empty slice.
///
/// A consonant or vowel cluster whose text differs from its lowercase form sets `stressed`
/// (presumably capitals mark the stressed syllable — confirm). Once `stressed` is set, the loop
/// stops at the next cluster that is not itself capitalised.
///
/// NOTE(review): that next cluster has its length added to the range before the `break`, but
/// `new_offset` is not advanced past it, so it is covered by the returned range AND is still the
/// first element of the returned slice. This looks unintended — confirm the desired behaviour.
fn eat_lujvo<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
// Zero-length range anchored at the start of the first cluster; grown as clusters are accepted.
let mut temp_buf = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
let mut new_offset = 0;
let mut stressed = false;
for (i, Cluster { s, kind }) in rest.iter().enumerate() {
match kind {
ClusterKind::Consonant | ClusterKind::Vowel => {
temp_buf.increase_length(s.len());
// The text is not all-lowercase: treat the cluster as stressed.
if s.as_str().to_lowercase() != *s.as_str() {
stressed = true;
// Advance past this cluster and skip the `stressed` check below.
new_offset += 1;
continue;
}
}
ClusterKind::Huhboo | ClusterKind::Glide => temp_buf.increase_length(s.len()),
// Any other kind (number, whitespace, unknown) ends the word without being added.
_ => break,
}
// Reached only for clusters that were added above and are not capitalised.
if stressed {
break;
}
new_offset = i + 1;
}
(temp_buf, &rest[new_offset..])
}
/// Consumes one cmavo-shaped run of clusters from the front of `rest`.
///
/// Vowel, huhboo and glide clusters are always accepted. A consonant cluster is accepted only
/// if no consonant or vowel cluster has been accepted before it; otherwise it ends the word.
/// Any other kind ends the word as well.
///
/// Returns the accumulated range and the clusters after the last accepted one. The returned
/// slice always skips at least one cluster, even if the first cluster was not accepted.
/// Indexes `rest[0]`, so it panics on an empty slice.
fn eat_cmavo<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
    let head = &rest[0].s;
    let mut span = StrRange::new(head.src(), head.start(), 0);
    let mut last_eaten = 0;
    let mut seen_letter = false;

    for (idx, cluster) in rest.iter().enumerate() {
        match cluster.kind {
            // A consonant after any earlier consonant or vowel starts the next word.
            ClusterKind::Consonant if seen_letter => break,
            ClusterKind::Consonant | ClusterKind::Vowel => {
                seen_letter = true;
                span.increase_length(cluster.s.len());
            }
            ClusterKind::Huhboo | ClusterKind::Glide => span.increase_length(cluster.s.len()),
            _ => break,
        }
        last_eaten = idx;
    }

    (span, &rest[last_eaten + 1..])
}
/// Consumes the leading run of number clusters.
///
/// Returns a range covering that run (starting where the first cluster starts) and the clusters
/// that follow it. Indexes `rest[0]`, so it panics on an empty slice.
fn eat_number<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
    let head = &rest[0].s;
    let mut span = StrRange::new(head.src(), head.start(), 0);

    let consumed = rest
        .iter()
        .take_while(|cluster| matches!(cluster.kind, ClusterKind::Number))
        .count();
    for cluster in &rest[..consumed] {
        span.increase_length(cluster.s.len());
    }

    (span, &rest[consumed..])
}
/// Consumes every cluster up to the first whitespace cluster as one unrecognised chunk.
///
/// Returns a range covering the clusters before that whitespace (or all of `rest` when there is
/// none) and the clusters that remain. The whitespace cluster that ends the chunk is dropped
/// from the returned slice without being added to the range. Indexes `rest[0]`, so it panics on
/// an empty slice.
fn eat_non_lojban<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
    let head = &rest[0].s;
    let mut span = StrRange::new(head.src(), head.start(), 0);

    let first_whitespace = rest
        .iter()
        .position(|cluster| matches!(cluster.kind, ClusterKind::Whitespace));
    let (chunk_len, consumed) = match first_whitespace {
        // Step over the terminating whitespace cluster too.
        Some(at) => (at, at + 1),
        None => (rest.len(), rest.len()),
    };
    for cluster in &rest[..chunk_len] {
        span.increase_length(cluster.s.len());
    }

    (span, &rest[consumed..])
}
/// Consumes the leading run of whitespace clusters.
///
/// Returns a range covering that run (starting where the first cluster starts) and the clusters
/// that follow it. Indexes `rest[0]`, so it panics on an empty slice.
fn eat_whitespace<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
    let head = &rest[0].s;
    let mut span = StrRange::new(head.src(), head.start(), 0);

    let consumed = rest
        .iter()
        .take_while(|cluster| matches!(cluster.kind, ClusterKind::Whitespace))
        .count();
    for cluster in &rest[..consumed] {
        span.increase_length(cluster.s.len());
    }

    (span, &rest[consumed..])
}
/// Splits `src` into a flat list of [`Token`]s.
///
/// The source is first broken into clusters with `clusterise`. Until no clusters remain, the
/// front of the remaining slice is classified and the matching `eat_*` helper carves off one
/// token. Classifiers are tried in this order: unknown text, cmevla, gismu, lujvo, cmavo.
///
/// When no classifier matches, the kind of the first cluster decides. Number and whitespace
/// clusters get their own token kinds; any other cluster is emitted as unknown text.
pub fn lex(src: &str) -> Vec<Token> {
    let clusters = clusterise(src);
    let mut rest = clusters.as_slice();
    let mut output = Vec::new();

    loop {
        let (token, remaining) = if matches_unknown(rest) {
            let (span, tail) = eat_non_lojban(rest);
            (Token::unknown(span.as_str()), tail)
        } else if matches_cmevla(rest) {
            let (span, tail) = eat_cmevla(rest);
            (Token::cmevla(span.as_str()), tail)
        } else if matches_gismu(rest) {
            let (span, tail) = eat_gismu(rest);
            (Token::brivla(span.as_str()), tail)
        } else if matches_lujvo(rest) {
            let (span, tail) = eat_lujvo(rest);
            (Token::brivla(span.as_str()), tail)
        } else if matches_cmavo(rest) {
            let (span, tail) = eat_cmavo(rest);
            (Token::cmavo(span.as_str()), tail)
        } else {
            match rest.first() {
                // Nothing left to classify: lexing is complete.
                None => break,
                Some(Cluster {
                    kind: ClusterKind::Number,
                    ..
                }) => {
                    let (span, tail) = eat_number(rest);
                    (Token::number(span.as_str()), tail)
                }
                Some(Cluster {
                    kind: ClusterKind::Whitespace,
                    ..
                }) => {
                    let (span, tail) = eat_whitespace(rest);
                    (Token::whitespace(span.as_str()), tail)
                }
                // Unknown clusters, plus any other cluster that no classifier accepted.
                Some(_) => {
                    let (span, tail) = eat_non_lojban(rest);
                    (Token::unknown(span.as_str()), tail)
                }
            }
        };
        rest = remaining;
        output.push(token);
    }

    output
}