elf_lojban/src/lex/cluster.rs

371 lines
12 KiB
Rust

use crate::{
lojbanic::{is_valid_consonant_pair, is_valid_vowel_pair_name, Lojbanic},
strange::StrRange,
};
/// Splits a run of consonants into maximal pieces in which every adjacent
/// pair of characters is accepted by `is_valid_consonant_pair`.
///
/// The returned ranges point into the same source buffer as `s` and, taken in
/// order, cover `s` exactly. An empty `s` yields an empty vector.
fn split_consonants(s: StrRange) -> Vec<StrRange> {
    // `char_indices` reports offsets relative to the slice returned by
    // `as_str`, while `StrRange::new` is given offsets into the whole source
    // buffer, so every offset must be shifted by where `s` begins.
    let base = s.start();
    let mut output = Vec::new();
    let mut current = StrRange::new(s.src(), base, 0);
    let mut previous_ch: Option<char> = None;
    for (at, ch) in s.as_str().char_indices() {
        let char_len = ch.len_utf8();
        match previous_ch {
            // The pair (previous, current) is not allowed: close the piece
            // collected so far and start a new one at this character.
            Some(prev) if !is_valid_consonant_pair(prev, ch) => {
                output.push(current);
                current = StrRange::new(s.src(), base + at, char_len);
            }
            // First character, or a valid pair: keep growing the piece.
            _ => current.increase_length(char_len),
        }
        previous_ch = Some(ch);
    }
    if !current.is_empty() {
        output.push(current);
    }
    output
}
#[test]
fn splitsonants() {
    // Each case pairs an input with the text of the pieces it must split into.
    let cases: [(&str, &[&str]); 2] = [
        ("mmmmmmmmm", &["m"; 9]),
        ("bcdfgjk", &["b", "c", "d", "f", "gj", "k"]),
    ];
    for (src, expected) in cases.iter() {
        let pieces = split_consonants(StrRange::new(src, 0, src.len()));
        let texts: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
        assert_eq!(texts, *expected);
    }
}
/// Splits a run of vowels into maximal pieces in which every adjacent pair of
/// characters is accepted by `is_valid_vowel_pair_name`.
///
/// The returned ranges point into the same source buffer as `s` and, taken in
/// order, cover `s` exactly. An empty `s` yields an empty vector.
fn split_vowels(s: StrRange) -> Vec<StrRange> {
    // `char_indices` reports offsets relative to the slice returned by
    // `as_str`, while `StrRange::new` is given offsets into the whole source
    // buffer, so every offset must be shifted by where `s` begins.
    let base = s.start();
    let mut output = Vec::new();
    let mut current = StrRange::new(s.src(), base, 0);
    let mut previous_ch: Option<char> = None;
    for (at, ch) in s.as_str().char_indices() {
        let char_len = ch.len_utf8();
        match previous_ch {
            // The pair (previous, current) is not allowed: close the piece
            // collected so far and start a new one at this character.
            Some(prev) if !is_valid_vowel_pair_name(prev, ch) => {
                output.push(current);
                current = StrRange::new(s.src(), base + at, char_len);
            }
            // First character, or a valid pair: keep growing the piece.
            _ => current.increase_length(char_len),
        }
        previous_ch = Some(ch);
    }
    if !current.is_empty() {
        output.push(current);
    }
    output
}
#[test]
fn splowels() {
    // A run of identical vowels is expected to split into single-vowel pieces.
    let src = "aaaaaaaaa";
    let pieces = split_vowels(StrRange::new(src, 0, src.len()));
    let texts: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
    assert_eq!(texts, ["a"; 9]);
}
/// A classified slice of the source text, as produced by `clusterise`.
#[derive(Debug, PartialEq, Eq)]
pub struct Cluster<'src_buf> {
    /// The range of the source buffer this cluster covers.
    pub s: StrRange<'src_buf>,
    /// The classification assigned to that range.
    pub kind: ClusterKind,
}
// Generates one `Cluster` constructor per `function_name, VariantName` pair.
// Must be invoked inside an `impl<'src_buf> Cluster<'src_buf>` block, because
// the generated signatures name that lifetime and use `Self`.
macro_rules! gen_cluster_fns {
    ($($ctor:ident, $variant:ident),*) => {$(
        /// Wraps `s` in a cluster of the kind this constructor is named after.
        pub fn $ctor(s: StrRange<'src_buf>) -> Self {
            Self {
                s,
                kind: ClusterKind::$variant,
            }
        }
    )*};
}
impl<'src_buf> Cluster<'src_buf> {
    // One constructor per `ClusterKind` variant, written as
    // `function_name, VariantName` pairs.
    gen_cluster_fns![
        consonant, Consonant,
        number, Number,
        huhboo, Huhboo,
        glide, Glide,
        unknown, Unknown,
        vowel, Vowel,
        whitespace, Whitespace
    ];
}
/// The classification `clusterise` assigns to each slice of the source text.
///
/// The enum carries no data, so it is `Copy`: a kind can be read out of a
/// borrowed `Cluster` without moving or borrowing it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClusterKind {
    /// A piece of a consonant run, as returned by the consonant splitter.
    Consonant,
    /// A run of characters for which `char::is_numeric` holds.
    Number,
    /// A lone `'` that directly follows a vowel run.
    Huhboo,
    /// Any other apostrophe-or-glide separator that directly follows a vowel run.
    Glide,
    /// Text the clusteriser could not classify.
    Unknown,
    /// A piece of a vowel run, as returned by the vowel splitter.
    Vowel,
    /// A run of whitespace and/or Lojban stop characters.
    Whitespace,
}
impl<'src_buf> Cluster<'src_buf> {
    /// Returns the length of the source range this cluster covers, as
    /// reported by `StrRange::len`.
    pub fn len(&self) -> usize {
        self.s.len()
    }

    /// Returns `true` when the source range this cluster covers is empty.
    ///
    /// Provided alongside [`Cluster::len`] so callers do not have to compare
    /// the length against zero.
    pub fn is_empty(&self) -> bool {
        self.s.is_empty()
    }
}
/// The state of the scanner inside `clusterise`: what kind of cluster the
/// characters collected so far belong to.
enum ClusterState {
    /// Initial state, before the first character has been classified.
    Any,
    /// An apostrophe or glide has just been seen directly after a vowel run.
    BetweenVowels,
    /// Collecting a run of consonants.
    Consonant,
    /// Collecting a run of numeric characters.
    Number,
    /// Collecting text that could not be classified; ends at whitespace.
    Unknown,
    /// Collecting a run of vowels.
    Vowel,
    /// Collecting a run of whitespace and/or Lojban stop characters.
    Whitespace,
}
pub fn clusterise(s: &str) -> Vec<Cluster> {
let mut output = vec![];
let mut temp_buf = StrRange::new(s, 0, 0);
let mut state = ClusterState::Any;
for (at, ch) in s.char_indices() {
let char_len = ch.len_utf8();
match state {
ClusterState::Any => {
temp_buf.increase_length(char_len);
match ch {
e if e.is_lojban_apostrophe() | e.is_lojban_glide() | !e.is_lojbanic() => {
state = ClusterState::Unknown
}
e if e.is_lojban_consonant() => state = ClusterState::Consonant,
e if e.is_lojban_vowel() => state = ClusterState::Vowel,
e if e.is_whitespace() | e.is_lojban_stop() => state = ClusterState::Whitespace,
e if e.is_numeric() => state = ClusterState::Number,
_ => unreachable![],
};
}
ClusterState::BetweenVowels => match ch {
e if e.is_lojban_vowel() => {
if temp_buf.as_str() == "'" {
output.push(Cluster::huhboo(temp_buf));
} else {
output.push(Cluster::glide(temp_buf));
}
state = ClusterState::Vowel;
temp_buf = StrRange::new(s, at, char_len);
}
_ => {
state = ClusterState::Unknown;
temp_buf.increase_length(char_len);
}
},
ClusterState::Consonant => match ch {
e if e.is_lojban_consonant() => {
temp_buf.increase_length(char_len);
}
e if e.is_lojban_vowel() | e.is_whitespace() | e.is_lojban_stop() => {
let clusters = split_consonants(temp_buf.clone());
let len = clusters.len();
let mut accum = 0..0;
for (j, c) in clusters.into_iter().enumerate() {
if j == 0 || j == len - 1 {
output.push(Cluster::consonant(c));
if j != 0 && !accum.is_empty() {
output.push(Cluster::unknown(StrRange::new(
s,
temp_buf.start() + accum.start,
accum.len(),
)));
break;
}
} else {
accum.end += c.len();
}
}
state = if !ch.is_lojban_vowel() {
ClusterState::Whitespace
} else {
ClusterState::Vowel
};
temp_buf = StrRange::new(s, at, char_len);
}
e if e.is_lojban_apostrophe()
| e.is_lojban_glide()
| !e.is_lojbanic()
| e.is_numeric() =>
{
temp_buf.increase_length(char_len);
state = ClusterState::Unknown;
}
_ => unreachable![],
},
ClusterState::Number => match ch {
e if e.is_numeric() => {
temp_buf.increase_length(char_len);
}
e if e.is_lojban_apostrophe() | e.is_lojban_glide() => {
state = ClusterState::Unknown;
temp_buf.increase_length(char_len);
}
e if e.is_lojban_vowel()
| e.is_lojban_consonant()
| e.is_whitespace()
| e.is_lojban_stop() =>
{
output.push(Cluster::number(temp_buf));
state = if ch.is_lojban_vowel() {
ClusterState::Vowel
} else if ch.is_lojban_consonant() {
ClusterState::Consonant
} else {
ClusterState::Whitespace
};
temp_buf = StrRange::new(s, at, char_len);
}
e if !e.is_lojbanic() => {
state = ClusterState::Unknown;
temp_buf.increase_length(char_len);
}
_ => unreachable![],
},
ClusterState::Unknown => match ch {
e if e.is_whitespace() => {
output.push(Cluster::unknown(temp_buf));
state = ClusterState::Whitespace;
temp_buf = StrRange::new(s, at, char_len);
}
_ => temp_buf.increase_length(char_len),
},
ClusterState::Vowel => match ch {
e if e.is_lojban_apostrophe() => {
for vowels in split_vowels(temp_buf) {
output.push(Cluster::vowel(vowels));
}
state = ClusterState::BetweenVowels;
temp_buf = StrRange::new(s, at, char_len);
}
e if e.is_lojban_glide() => {
for vowels in split_vowels(temp_buf) {
output.push(Cluster::vowel(vowels));
}
state = ClusterState::BetweenVowels;
temp_buf = StrRange::new(s, at, char_len);
}
e if e.is_lojban_vowel() => {
temp_buf.increase_length(char_len);
}
e if e.is_lojban_consonant() | e.is_lojban_stop() | e.is_whitespace() => {
for vowels in split_vowels(temp_buf) {
output.push(Cluster::vowel(vowels));
}
state = if !ch.is_lojban_consonant() {
ClusterState::Whitespace
} else {
ClusterState::Consonant
};
temp_buf = StrRange::new(s, at, char_len);
}
e if e.is_numeric() => {
todo![]
}
e if !e.is_lojbanic() => {
state = ClusterState::Unknown;
temp_buf.increase_length(char_len);
}
_ => unreachable![],
},
ClusterState::Whitespace => match ch {
e if e.is_lojban_apostrophe() | e.is_lojban_glide() => {
output.push(Cluster::whitespace(temp_buf));
state = ClusterState::Unknown;
temp_buf = StrRange::new(s, at, char_len);
}
e if e.is_lojban_consonant() => {
output.push(Cluster::whitespace(temp_buf));
state = ClusterState::Consonant;
temp_buf = StrRange::new(s, at, char_len);
}
e if e.is_lojban_vowel() => {
output.push(Cluster::whitespace(temp_buf));
state = ClusterState::Vowel;
temp_buf = StrRange::new(s, at, char_len);
}
e if e.is_whitespace() | e.is_lojban_stop() => {
temp_buf.increase_length(char_len);
}
e if e.is_numeric() => {
output.push(Cluster::whitespace(temp_buf));
state = ClusterState::Number;
temp_buf = StrRange::new(s, at, char_len);
}
e if !e.is_lojbanic() => {
output.push(Cluster::whitespace(temp_buf));
state = ClusterState::Unknown;
temp_buf = StrRange::new(s, at, char_len);
}
_ => unreachable![],
},
}
}
if !temp_buf.is_empty() {
match state {
ClusterState::Any => {}
ClusterState::BetweenVowels => {
if temp_buf.clone().as_str() == "'" {
output.push(Cluster::huhboo(temp_buf));
} else {
output.push(Cluster::glide(temp_buf));
}
}
ClusterState::Consonant => {
let clusters = split_consonants(temp_buf.clone());
let len = clusters.len();
let mut accum = 0..0;
for (j, c) in clusters.into_iter().enumerate() {
if j == 0 || j == len - 1 {
output.push(Cluster::consonant(c));
if j != 0 && !accum.clone().is_empty() {
output.push(Cluster::unknown(StrRange::new(
s,
temp_buf.start() + accum.start,
accum.len(),
)));
break;
}
} else {
accum.end += c.len();
}
}
}
ClusterState::Number => {
output.push(Cluster::number(temp_buf));
}
ClusterState::Unknown => {
output.push(Cluster::unknown(temp_buf));
}
ClusterState::Vowel => {
for vowels in split_vowels(temp_buf) {
output.push(Cluster::vowel(vowels));
}
}
ClusterState::Whitespace => output.push(Cluster::whitespace(temp_buf)),
}
}
output
}
// #[test]
// fn periods_are_whitespace() {
// assert_eq![clusterise("tssssssssssssssi."), [Cluster::whitespace(".")]]
// }