371 lines
12 KiB
Rust
371 lines
12 KiB
Rust
use crate::{
|
|
lojbanic::{is_valid_consonant_pair, is_valid_vowel_pair_name, Lojbanic},
|
|
strange::StrRange,
|
|
};
|
|
|
|
fn split_consonants(s: StrRange) -> Vec<StrRange> {
|
|
let mut output = Vec::new();
|
|
let mut temp_buf = StrRange::new(s.src(), s.start(), 0);
|
|
let mut previous_ch = None;
|
|
for (at, ch) in s.as_str().char_indices() {
|
|
let char_len = ch.len_utf8();
|
|
if let Some(previous_ch) = previous_ch {
|
|
if !is_valid_consonant_pair(previous_ch, ch) {
|
|
output.push(temp_buf);
|
|
temp_buf = StrRange::new(s.src(), at, char_len);
|
|
} else {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
} else {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
previous_ch = ch.into();
|
|
}
|
|
if !temp_buf.is_empty() {
|
|
output.push(temp_buf);
|
|
}
|
|
output
|
|
}
|
|
|
|
/// Checks `split_consonants` on two inputs whose ranges start at offset 0.
#[test]
fn splitsonants() {
    // A run of identical consonants is expected to fall apart completely.
    let src = "mmmmmmmmm";
    let pieces = split_consonants(StrRange::new(src, 0, src.len()));
    let actual: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
    assert_eq!(actual, vec!["m"; 9]);

    // Only `gj` is expected to stay together in this run.
    let src = "bcdfgjk";
    let pieces = split_consonants(StrRange::new(src, 0, src.len()));
    let actual: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
    assert_eq!(actual, vec!["b", "c", "d", "f", "gj", "k"]);
}
|
|
|
|
fn split_vowels(s: StrRange) -> Vec<StrRange> {
|
|
let mut output = Vec::new();
|
|
let mut temp_buf = StrRange::new(s.src(), s.start(), 0);
|
|
let mut previous_ch = None;
|
|
for (at, ch) in s.as_str().char_indices() {
|
|
let char_len = ch.len_utf8();
|
|
if let Some(previous_ch) = previous_ch {
|
|
if !is_valid_vowel_pair_name(previous_ch, ch) {
|
|
output.push(temp_buf);
|
|
temp_buf = StrRange::new(s.src(), at, char_len);
|
|
} else {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
} else {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
previous_ch = ch.into();
|
|
}
|
|
if !temp_buf.is_empty() {
|
|
output.push(temp_buf);
|
|
}
|
|
output
|
|
}
|
|
|
|
/// Checks that `split_vowels` separates a run of identical vowels into
/// single-character pieces.
#[test]
fn splowels() {
    let src = "aaaaaaaaa";
    let pieces = split_vowels(StrRange::new(src, 0, src.len()));
    let actual: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
    assert_eq!(actual, vec!["a"; 9]);
}
|
|
|
|
/// A range of the source text together with the category it was assigned.
///
/// Values of this type are what `clusterise` returns.
#[derive(Debug, PartialEq, Eq)]
pub struct Cluster<'src_buf> {
    /// The range of the source buffer covered by this cluster.
    pub s: StrRange<'src_buf>,
    /// The category assigned to that range.
    pub kind: ClusterKind,
}
|
|
|
|
/// Generates one public constructor per `name, Variant` pair.
///
/// The expansion refers to `Self` and to the `'src_buf` lifetime, so the macro
/// has to be invoked inside an `impl<'src_buf> Cluster<'src_buf>` block. Pairs
/// are separated by commas; a trailing comma is not accepted.
macro_rules! gen_cluster_fns {
    ($($ctor:ident, $variant:ident),*) => {
        $(
            /// Wraps `s` in a cluster tagged with the matching `ClusterKind` variant.
            pub fn $ctor(s: StrRange<'src_buf>) -> Self {
                Self {
                    s,
                    kind: ClusterKind::$variant,
                }
            }
        )*
    };
}
|
|
|
|
impl<'src_buf> Cluster<'src_buf> {
|
|
gen_cluster_fns![
|
|
consonant, Consonant, number, Number, huhboo, Huhboo, glide, Glide, unknown, Unknown, vowel,
|
|
Vowel, whitespace, Whitespace
|
|
];
|
|
}
|
|
|
|
/// The category assigned to a `Cluster` by `clusterise`.
///
/// The enum carries no data, so it derives `Clone` and `Copy`; this lets
/// callers read the kind out of a borrowed `Cluster` by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ClusterKind {
    /// A piece of a consonant run.
    Consonant,
    /// A run of numeric characters.
    Number,
    /// A lone `'` that directly follows a vowel run.
    Huhboo,
    /// Any other separator that directly follows a vowel run.
    Glide,
    /// Text that could not be assigned to any other category.
    Unknown,
    /// A piece of a vowel run.
    Vowel,
    /// A run of whitespace and Lojban stop characters.
    Whitespace,
}
|
|
|
|
impl<'src_buf> Cluster<'src_buf> {
|
|
pub fn len(&self) -> usize {
|
|
self.s.len()
|
|
}
|
|
}
|
|
|
|
/// The state of the scanner in `clusterise`; it records what kind of text is
/// currently being collected in the pending buffer.
enum ClusterState {
    /// Initial state: nothing has been read yet, the first character decides
    /// which state comes next.
    Any,
    /// An apostrophe or glide character was read right after a vowel run and
    /// is held in the buffer until the next character is seen.
    BetweenVowels,
    /// The buffer holds a run of consonants.
    Consonant,
    /// The buffer holds a run of numeric characters.
    Number,
    /// The buffer holds text that could not be classified; only whitespace
    /// ends this state.
    Unknown,
    /// The buffer holds a run of vowels.
    Vowel,
    /// The buffer holds a run of whitespace and Lojban stop characters.
    Whitespace,
}
|
|
|
|
pub fn clusterise(s: &str) -> Vec<Cluster> {
|
|
let mut output = vec![];
|
|
let mut temp_buf = StrRange::new(s, 0, 0);
|
|
let mut state = ClusterState::Any;
|
|
for (at, ch) in s.char_indices() {
|
|
let char_len = ch.len_utf8();
|
|
match state {
|
|
ClusterState::Any => {
|
|
temp_buf.increase_length(char_len);
|
|
match ch {
|
|
e if e.is_lojban_apostrophe() | e.is_lojban_glide() | !e.is_lojbanic() => {
|
|
state = ClusterState::Unknown
|
|
}
|
|
e if e.is_lojban_consonant() => state = ClusterState::Consonant,
|
|
e if e.is_lojban_vowel() => state = ClusterState::Vowel,
|
|
e if e.is_whitespace() | e.is_lojban_stop() => state = ClusterState::Whitespace,
|
|
e if e.is_numeric() => state = ClusterState::Number,
|
|
_ => unreachable![],
|
|
};
|
|
}
|
|
ClusterState::BetweenVowels => match ch {
|
|
e if e.is_lojban_vowel() => {
|
|
if temp_buf.as_str() == "'" {
|
|
output.push(Cluster::huhboo(temp_buf));
|
|
} else {
|
|
output.push(Cluster::glide(temp_buf));
|
|
}
|
|
state = ClusterState::Vowel;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
_ => {
|
|
state = ClusterState::Unknown;
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
},
|
|
ClusterState::Consonant => match ch {
|
|
e if e.is_lojban_consonant() => {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
e if e.is_lojban_vowel() | e.is_whitespace() | e.is_lojban_stop() => {
|
|
let clusters = split_consonants(temp_buf.clone());
|
|
let len = clusters.len();
|
|
let mut accum = 0..0;
|
|
for (j, c) in clusters.into_iter().enumerate() {
|
|
if j == 0 || j == len - 1 {
|
|
output.push(Cluster::consonant(c));
|
|
if j != 0 && !accum.is_empty() {
|
|
output.push(Cluster::unknown(StrRange::new(
|
|
s,
|
|
temp_buf.start() + accum.start,
|
|
accum.len(),
|
|
)));
|
|
break;
|
|
}
|
|
} else {
|
|
accum.end += c.len();
|
|
}
|
|
}
|
|
state = if !ch.is_lojban_vowel() {
|
|
ClusterState::Whitespace
|
|
} else {
|
|
ClusterState::Vowel
|
|
};
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if e.is_lojban_apostrophe()
|
|
| e.is_lojban_glide()
|
|
| !e.is_lojbanic()
|
|
| e.is_numeric() =>
|
|
{
|
|
temp_buf.increase_length(char_len);
|
|
state = ClusterState::Unknown;
|
|
}
|
|
_ => unreachable![],
|
|
},
|
|
ClusterState::Number => match ch {
|
|
e if e.is_numeric() => {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
e if e.is_lojban_apostrophe() | e.is_lojban_glide() => {
|
|
state = ClusterState::Unknown;
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
e if e.is_lojban_vowel()
|
|
| e.is_lojban_consonant()
|
|
| e.is_whitespace()
|
|
| e.is_lojban_stop() =>
|
|
{
|
|
output.push(Cluster::number(temp_buf));
|
|
state = if ch.is_lojban_vowel() {
|
|
ClusterState::Vowel
|
|
} else if ch.is_lojban_consonant() {
|
|
ClusterState::Consonant
|
|
} else {
|
|
ClusterState::Whitespace
|
|
};
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if !e.is_lojbanic() => {
|
|
state = ClusterState::Unknown;
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
_ => unreachable![],
|
|
},
|
|
ClusterState::Unknown => match ch {
|
|
e if e.is_whitespace() => {
|
|
output.push(Cluster::unknown(temp_buf));
|
|
state = ClusterState::Whitespace;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
_ => temp_buf.increase_length(char_len),
|
|
},
|
|
ClusterState::Vowel => match ch {
|
|
e if e.is_lojban_apostrophe() => {
|
|
for vowels in split_vowels(temp_buf) {
|
|
output.push(Cluster::vowel(vowels));
|
|
}
|
|
state = ClusterState::BetweenVowels;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if e.is_lojban_glide() => {
|
|
for vowels in split_vowels(temp_buf) {
|
|
output.push(Cluster::vowel(vowels));
|
|
}
|
|
state = ClusterState::BetweenVowels;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if e.is_lojban_vowel() => {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
e if e.is_lojban_consonant() | e.is_lojban_stop() | e.is_whitespace() => {
|
|
for vowels in split_vowels(temp_buf) {
|
|
output.push(Cluster::vowel(vowels));
|
|
}
|
|
state = if !ch.is_lojban_consonant() {
|
|
ClusterState::Whitespace
|
|
} else {
|
|
ClusterState::Consonant
|
|
};
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if e.is_numeric() => {
|
|
todo![]
|
|
}
|
|
e if !e.is_lojbanic() => {
|
|
state = ClusterState::Unknown;
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
_ => unreachable![],
|
|
},
|
|
ClusterState::Whitespace => match ch {
|
|
e if e.is_lojban_apostrophe() | e.is_lojban_glide() => {
|
|
output.push(Cluster::whitespace(temp_buf));
|
|
state = ClusterState::Unknown;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if e.is_lojban_consonant() => {
|
|
output.push(Cluster::whitespace(temp_buf));
|
|
state = ClusterState::Consonant;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if e.is_lojban_vowel() => {
|
|
output.push(Cluster::whitespace(temp_buf));
|
|
state = ClusterState::Vowel;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if e.is_whitespace() | e.is_lojban_stop() => {
|
|
temp_buf.increase_length(char_len);
|
|
}
|
|
e if e.is_numeric() => {
|
|
output.push(Cluster::whitespace(temp_buf));
|
|
state = ClusterState::Number;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
e if !e.is_lojbanic() => {
|
|
output.push(Cluster::whitespace(temp_buf));
|
|
state = ClusterState::Unknown;
|
|
temp_buf = StrRange::new(s, at, char_len);
|
|
}
|
|
_ => unreachable![],
|
|
},
|
|
}
|
|
}
|
|
if !temp_buf.is_empty() {
|
|
match state {
|
|
ClusterState::Any => {}
|
|
ClusterState::BetweenVowels => {
|
|
if temp_buf.clone().as_str() == "'" {
|
|
output.push(Cluster::huhboo(temp_buf));
|
|
} else {
|
|
output.push(Cluster::glide(temp_buf));
|
|
}
|
|
}
|
|
ClusterState::Consonant => {
|
|
let clusters = split_consonants(temp_buf.clone());
|
|
let len = clusters.len();
|
|
let mut accum = 0..0;
|
|
for (j, c) in clusters.into_iter().enumerate() {
|
|
if j == 0 || j == len - 1 {
|
|
output.push(Cluster::consonant(c));
|
|
if j != 0 && !accum.clone().is_empty() {
|
|
output.push(Cluster::unknown(StrRange::new(
|
|
s,
|
|
temp_buf.start() + accum.start,
|
|
accum.len(),
|
|
)));
|
|
break;
|
|
}
|
|
} else {
|
|
accum.end += c.len();
|
|
}
|
|
}
|
|
}
|
|
ClusterState::Number => {
|
|
output.push(Cluster::number(temp_buf));
|
|
}
|
|
ClusterState::Unknown => {
|
|
output.push(Cluster::unknown(temp_buf));
|
|
}
|
|
ClusterState::Vowel => {
|
|
for vowels in split_vowels(temp_buf) {
|
|
output.push(Cluster::vowel(vowels));
|
|
}
|
|
}
|
|
ClusterState::Whitespace => output.push(Cluster::whitespace(temp_buf)),
|
|
}
|
|
}
|
|
output
|
|
}
|
|
|
|
// #[test]
|
|
// fn periods_are_whitespace() {
|
|
// assert_eq![clusterise("tssssssssssssssi."), [Cluster::whitespace(".")]]
|
|
// }
|