initial commit

2022-08-12 16:53:32 -07:00 · 2022-08-12 16:53:32 -07:00 · 6410de9feb
commit 6410de9feb
9 changed files with 1389 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,2 @@
+/target
+Cargo.lock
--- a/BRKWORDS.TXT
+++ b/BRKWORDS.TXT
@ -0,0 +1,224 @@
+Morphology Algorithm
+Internal Revision 4.1, 8 June 1992
+
+The following will become the official baseline algorithm for resolution of
+Lojban text into individual words from sounds, stress, and pause.  As such,
+it is the ultimate standard of Lojban's unambiguous resolvability, which
+may make Lojban speech recognition by computers more possible than for
+other languages.  While the algorithm looks very complicated, almost all of
+it is resolving special cases, and performing what error detection and
+correction may be possible.
+
+
+We have a string representing the speech stream, marked with stress and
+pauses.  We want to break it up into words.
+
+
+1.  First, break at all pauses (cannot pause in the middle of a word).
+2.  Then, pick the first piece that has not been uniquely resolved.
+ A.  The first thing is to deal with some constructs which are required to
+ end with a pause:
+   1)  Names:
+     a)  If the last letter of the piece is a consonant, we have a name.  A
+     name must have a pause before it UNLESS it is immediately preceded by
+     a /la/, /lai/, /la'i/ or /doi/ as a marker, and it cannot contain any
+     of these markers unless the marker is immediately preceded by a conso-
+     nant.  So, look backwards from the end of the piece for any of the
+     allowed markers.  If we don't find one (e.g. /jonz/), then the whole
+     piece has been resolved as a name.
+     b)  If you do find such a marker, then check what immediately precedes
+     it.  If there is nothing (e.g. /ladjAn/), or if a vowel precedes (e.g.
+     /mivIskaladjAn./), break off the marker as a resolved piece (/la/), and
+     what follows it is also a resolved piece, a name (/djAn/), leaving us
+     with whatever preceded the marker, if anything, as still unresolved
+     (/mivIska/).
+     c)  If what precedes the marker is a consonant (e.g. /karoslAInas/)
+     then ignore the marker and continue looking backwards.  This exception
+     is allowed because /karos/ with no following pause cannot represent a
+     separate word.
+   2)  ".y.", the hesitation:
+     If the piece consists solely of /y/, then it resolves as the
+     hesitation word (which is required to be surrounded by pauses).
+   
+   3)  If the piece ends in "y", check for some lerfu words:  specifically,
+   the last lerfu word of a string, if it ends in a "y" (e.g. /abubycydy/
+   or /y'y/), must be followed by a pause:
+     a)  If the "y" is preceded by a consonant, break off the consonant+"y"
+     as a resolved lerfu word (e.g. /abubycydy/ gives /abubycy/ unresolved,
+     and /dy/ resolved as a lerfu word). Continue breaking off any Cy
+     pieces as lerfu words if they're there (e.g. unresolved /abubycy/
+     gives unresolved /abuby/ + resolved /cy/; then /abuby/ gives un-
+     resolved /abu/ plus resolved /by/).
+      Note that the Cy-type lerfu words will NEVER come before the other
+      lerfu word pieces in a breath-group - the "abu" and "y'y" types -
+      since they begin with vowels, they MUST be preceded by pauses; and Cy
+      followed by anything but another Cy must be followed by a pause
+      (because "y" is used as glue in lujvo, it could cause resolvability
+      problems if not separate; e.g. /micybusmAbru/ would not uniquely re-
+      solve).
+     b)  If the "y" is preceded by "V'" or "y'" (e.g. /y'y/), break before
+     the "V", and the "V'y" is resolved as a lerfu word.
+     c)  If the "y" is preceded by an "i" or "u" ("iy" and "uy" are
+     reserved) the piece cannot be resolved.
+     d)  If the "y" is preceded by a vowel (V) other than "i" or "u", the
+     piece is in error and cannot be further resolved.
+ B.  Next, see if the piece is composed entirely of cmavo.
+   1)  Check the piece to see if there are any consonant clusters (a
+   consonant cluster is of one of the forms CC or CyC). If there are none,
+   break up the piece before each consonant, resolving each piece as a
+   cmavo (e.g. /alenumibaca'a/ breaks into the cmavo /a/ + /le/ + /nu/ +
+   /mi/ + /ba/ + /ca'a/).  If there are no consonants, the piece is a
+   single cmavo.  In either case, the piece is completely resolved.
+ C.  Now we have a piece which we are sure contains a brivla (a gismu, a
+ lujvo or a le'avla).  We know that a brivla must have a consonant cluster
+ (CC or CyC) within the 1st five letters (ignoring apostrophes in the
+ count), and must have penultimate stress (ignoring "y" syllables, which
+ are not allowed to be stressed).
+   1)  First, let's check for a potential error (a form which shouldn't
+   arise):
+     a)  If the piece contains no stress, but has a consonant cluster (CC
+     or CyC), it is in error.  The consonant cluster indicates it contains
+     a brivla (gismu, lujvo or le'avla), which requires penultimate stress.
+     The only place this MIGHT validly occur is inside a zoi-quote (and
+     therefore need not be resolved at all).
+     b)  However, if stress information is not available, assume the brivla
+     ends at the end of the piece.  (This rule gives the right behavior
+     with canonical written Lojban, where spaces separate all words except
+     for some cmavo compounds and stress is normally not marked.)
+   2)  Next, we need to find THE penultimate stress for the first brivla in
+   the piece (the brivla is expected to end after the syllable following
+   the stress, ignoring "y" syllables).  Starting from the first consonant
+   cluster (CC or CyC):
+     a)  If the previous letter is a stressed vowel, take that as THE
+     penultimate stress of the brivla.
+     b)  If the previous letter is an unstressed vowel, but the letter
+     before that is a stressed vowel, then it is a stressed diphthong;
+     treat the entire diphthong as stressed (So that "find the next vowel"
+     will not get just the second half of the diphthong).  Take that as THE
+     penultimate stress.
+     c)  Otherwise, find the first stress after the consonant cluster.  If
+     the stress is on a diphthong, treat the entire diphthong as stressed
+     (So that "find the next vowel" will not get just the second half of
+     the diphthong).  Take that as THE penultimate stress.
+   3)  Next, let's find the end of the first brivla in the piece:
+     a)  If there is no vowel in the piece after the stress, it can't be a
+     penultimate stress, so the piece is in error (unresolvable).  This is
+     also true if "y" is the only vowel after the stress (e.g. */stAsy/ is
+     not a valid breath-group).
+     b)  If the NEXT vowel following the stress (skipping over "y"'s) is
+     immediately followed by "'V" (as in /mlAtyci'a/), then the syllable
+     following the stress cannot be the last syllable of a word (since the
+     'V cannot begin the next word).  Ordinarily we would count this as an
+     error, but let's instead assume that this was a secondary stress and
+     ignore the fact that there is some stress on it.  Go find the next
+     stress to use as THE penultimate stress for this brivla (e.g. in
+     /mlAtyci'abrIjuti/, assume the penultimate stress is "I", not "A").
+     c)  Having eliminated all the potential problems with finding the end,
+     let's cut the piece after the end of the brivla:
+      Find the first vowel (not counting "y") after the stress.  If it is
+      part of a diphthong, break after the diphthong; otherwise, break
+      after the vowel itself.
+   4)  Now let's find the beginning of the brivla in the front part of the
+   piece we just broke off:
+     a)  First, break off as many obvious cmavo pieces off the front as we
+     can:
+      1]  If there is no consonant cluster (CC or CyC) in the first 5
+      letters (ignoring apostrophes in the count), then, if the piece
+      starts with a vowel, break off before the first consonant (e.g.
+      /alekArce/ becomes /a/ = cmavo + /lekArce/ = unresolved), otherwise
+      break off before the second consonant (e.g. /vilekArce/ becomes /vi/
+      = cmavo + /lekArce/ = unresolved).  The front piece is then resolved
+      as a cmavo.
+      2]  Repeat the above as many times as we can (so, /lekArce/ becomes
+      /le/ = cmavo + /kArce/ = unresolved. Since /kArce/ has a consonant
+      cluster in the first five letters, we can't go any further).
+      3]  If the piece we have left starts with a vowel, find the first
+      consonant.  If the first consonant is part of a consonant cluster
+      (only CC-form this time), and this consonant cluster is NOT a valid
+      initial cluster (i.e., each adjacent pair of consonants is a valid
+      initial pair), then we can resolve the entire piece as a le'avla
+      (e.g. /antipAsto/); otherwise (if the first consonant is NOT part of
+      a consonant cluster, or the consonant cluster IS a valid initial
+      cluster), break off before the first consonant as a cmavo (e.g.
+      /a'ofArlu/ becomes /a'o/ = cmavo + /fArlu/ = unresolved; or,
+      /aismAcu/ becomes /ai/ = cmavo + /smAcu/ = unresolved).
+     b)  What's left begins with a consonant and has a consonant cluster
+     (CC or CyC) in the first 5 letters.  The whole thing may be a brivla,
+     or there may be (at most) one consonant-initial cmavo in front.  Here
+     are the possibilities for the start of the piece, and their
+     resolutions:
+      1]  CC...  or CVCyC...:
+        Resolve whole thing as a brivla (a gismu, lujvo, or le'avla).
+      2]  CyC... :
+        Invalid form.  Unresolvable.
+      3]  CVVCC... :
+        (Note: stressing a cmavo on the final syllable before a brivla is
+        not allowed.)
+        a]  If there is no stress on the VV and the consonant cluster
+        beginning with the CC is a valid initial cluster (i.e., each
+        adjacent pair of consonants is a valid initial pair), then break
+        off the CVV, and resolve it as a cmavo; the remaining piece can
+        then be resolved as a brivla (see "CC....", above).  For example,
+        /leiprEnu/ becomes /lei/ = cmavo + /prEnu/ = brivla.
+        b]  Otherwise (i.e. there IS a stress on the VV, or the first
+        consonant cluster is not a valid initial cluster), resolve the
+        whole thing as a brivla (e.g. /cAItro/ = brivla)
+      4]  CV'VCC... :
+        (Note: stressing a cmavo on the final syllable before a brivla is
+        not allowed.)
+        a]  If there is no stress on the final vowel of the V'V and the
+        consonant cluster beginning with the CC is a valid initial cluster
+        (i.e., each adjacent pair of consonants is a valid initial pair),
+        then break off the CV'V, and resolve it as a cmavo; the remaining
+        piece can then be resolved as a brivla (see "CC....", above).  For
+        example, /so'iprEnu/ becomes /so'i/ = cmavo + /prEnu/ = brivla.
+        b]  Otherwise (i.e. there is a stress on the final vowel of the
+        V'V, or the first consonant cluster is not a valid initial
+        cluster), resolve the whole thing as a brivla (e.g. /cA'Itro/ =
+        brivla)
+      5]  CVCC... (This is the hard one.  Is the front CV a separate
+      word?):
+        a]  If the whole piece is CVCCV, then the whole thing resolves as a
+        gismu.
+        b]  If the consonant cluster beginning with the CC is not a valid
+        initial cluster (i.e., each adjacent pair of consonants is a valid
+        initial pair), then the whole piece can be resolved as a brivla
+        (gismu, lujvo, or le'avla).  For example, /selfArlu/,
+        /cidjrspagEti/.
+        c]  If the penultimate stress is on the 1st vowel of the CVCC (e.g.
+        /mAtcti/), then resolve the whole thing as a brivla (a lujvo or
+        le'avla).
+        d]  If there is a "y", we need to look at the sub-piece up to the
+        first "y":
+          1>  If the sub-piece consists entirely of CVC's repeating (at
+          least 2 needed: e.g. /cacric/), and all the CC's of the sub-piece
+          are valid initial clusters, then resolve the initial CV as a
+          cmavo, and the rest of the whole piece is a brivla (a lujvo or
+          le'avla).
+          2>  Otherwise, if the sub-piece can be broken down into any
+          number (including 0) of valid lujvo "front-middles" in front and
+          exactly one valid lujvo "end" thereafter, resolve the whole piece
+          as a brivla.
+           a>  Valid front-middles (we've eliminated all but those starting
+           with CV): CVC CVV CV'V CCV
+           b>  Valid ends: CVC CCVC CVCC
+          3>  Otherwise, the front CV should be resolved as a cmavo, and
+          the remaining piece is resolved as a brivla (a lujvo or le'avla)
+        e]  If there is no "y":
+          1>  If the piece consists of CVC's repeating (at least 2 needed)
+          up to a final CV (e.g. /cacricfu/), and all the CC's of the sub-
+          piece are valid initial clusters, then resolve the initial CV as
+          a cmavo, and the rest of the piece is a brivla (a lujvo).
+          2>  Otherwise, if the piece can be broken down into any number
+          (including 0) of valid lujvo "front-middles" in front and exactly
+          one valid lujvo "end", then resolve the whole piece as a brivla
+          (a lujvo).
+           a>  Valid front-middles (we've eliminated all but those starting
+           with CV): CVC CVV CV'V CVC
+           b>  Valid ends: CVV CV'V CCV CCVCV CVCCV
+
+          3>  Otherwise, the front CV should be resolved as a cmavo, and
+          the remaining piece is resolved as a brivla (a le'avla).         
+               
+      6]  Any other beginning (e.g. CVVCyC):
+        Resolve the whole as an error.
+
+
+
+_______________________________________
--- a/Cargo.toml
+++ b/Cargo.toml
@ -0,0 +1,11 @@
+[package]
+name = "elf_lojban"
+version = "0.1.0-beta"
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+
+[dev-dependencies]
+rand = "0.8.5"
--- a/src/lex/cluster.rs
+++ b/src/lex/cluster.rs
@ -0,0 +1,370 @@
+use crate::{
+   lojbanic::{is_valid_consonant_pair, is_valid_vowel_pair_name, Lojbanic},
+   strange::StrRange,
+};
+
+/// Splits a run of consonants into maximal pieces in which every adjacent
+/// pair of characters satisfies `is_valid_consonant_pair`.
+///
+/// The returned ranges refer to the same source buffer as `s`, are in source
+/// order and together cover `s`. An empty `s` yields an empty vector.
+fn split_consonants(s: StrRange) -> Vec<StrRange> {
+   let mut output = Vec::new();
+   let mut temp_buf = StrRange::new(s.src(), s.start(), 0);
+   let mut previous_ch = None;
+   for (at, ch) in s.as_str().char_indices() {
+      let char_len = ch.len_utf8();
+      match previous_ch {
+         Some(previous_ch) if !is_valid_consonant_pair(previous_ch, ch) => {
+            output.push(temp_buf);
+            // `at` counts from the start of `s`, whereas range offsets count
+            // from the start of the source buffer, so rebase it here.
+            temp_buf = StrRange::new(s.src(), s.start() + at, char_len);
+         }
+         _ => temp_buf.increase_length(char_len),
+      }
+      previous_ch = Some(ch);
+   }
+   if !temp_buf.is_empty() {
+      output.push(temp_buf);
+   }
+   output
+}
+
+// Checks that consonant runs are cut wherever an adjacent pair is not valid.
+#[test]
+fn splitsonants() {
+   let src = "mmmmmmmmm";
+   let pieces = split_consonants(StrRange::new(src, 0, src.len()));
+   let rendered: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
+   assert_eq![rendered, ["m", "m", "m", "m", "m", "m", "m", "m", "m"]];
+   let src = "bcdfgjk";
+   let pieces = split_consonants(StrRange::new(src, 0, src.len()));
+   let rendered: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
+   assert_eq![rendered, ["b", "c", "d", "f", "gj", "k"]];
+}
+
+/// Splits a run of vowels into maximal pieces in which every adjacent pair of
+/// characters satisfies `is_valid_vowel_pair_name`.
+///
+/// The returned ranges refer to the same source buffer as `s`, are in source
+/// order and together cover `s`. An empty `s` yields an empty vector.
+fn split_vowels(s: StrRange) -> Vec<StrRange> {
+   let mut output = Vec::new();
+   let mut temp_buf = StrRange::new(s.src(), s.start(), 0);
+   let mut previous_ch = None;
+   for (at, ch) in s.as_str().char_indices() {
+      let char_len = ch.len_utf8();
+      match previous_ch {
+         Some(previous_ch) if !is_valid_vowel_pair_name(previous_ch, ch) => {
+            output.push(temp_buf);
+            // `at` counts from the start of `s`, whereas range offsets count
+            // from the start of the source buffer, so rebase it here.
+            temp_buf = StrRange::new(s.src(), s.start() + at, char_len);
+         }
+         _ => temp_buf.increase_length(char_len),
+      }
+      previous_ch = Some(ch);
+   }
+   if !temp_buf.is_empty() {
+      output.push(temp_buf);
+   }
+   output
+}
+
+// Checks that a run of identical vowels is cut between every character.
+#[test]
+fn splowels() {
+   let src = "aaaaaaaaa";
+   let pieces = split_vowels(StrRange::new(src, 0, src.len()));
+   let rendered: Vec<&str> = pieces.iter().map(|piece| piece.as_str()).collect();
+   assert_eq![rendered, ["a", "a", "a", "a", "a", "a", "a", "a", "a"]];
+}
+
+/// A classified span of the source text, as produced by `clusterise`.
+#[derive(Debug, PartialEq, Eq)]
+pub struct Cluster<'src_buf> {
+   /// The range of the source buffer covered by this cluster.
+   pub s: StrRange<'src_buf>,
+   /// The class the covered characters were assigned.
+   pub kind: ClusterKind,
+}
+
+// Generates one public constructor per `name, Variant` pair. Each constructor
+// takes the covered range and returns a cluster whose `kind` is
+// `ClusterKind::Variant`. The expansion names the `'src_buf` lifetime, so the
+// macro has to be invoked inside an `impl<'src_buf>` block.
+macro_rules! gen_cluster_fns {
+   ($($nym:ident, $v:ident),*) => {$(
+      pub fn $nym(s: StrRange<'src_buf>) -> Self {
+         Self {
+            s,
+            kind: ClusterKind::$v,
+         }
+      })*
+   };
+}
+
+// Constructors `Cluster::consonant`, `Cluster::number`, `Cluster::huhboo`,
+// `Cluster::glide`, `Cluster::unknown`, `Cluster::vowel` and
+// `Cluster::whitespace`, one per `ClusterKind` variant.
+impl<'src_buf> Cluster<'src_buf> {
+   gen_cluster_fns![
+      consonant, Consonant, number, Number, huhboo, Huhboo, glide, Glide, unknown, Unknown, vowel,
+      Vowel, whitespace, Whitespace
+   ];
+}
+
+/// The class assigned to a `Cluster` by `clusterise`.
+#[derive(Debug, PartialEq, Eq)]
+pub enum ClusterKind {
+   /// A piece of a consonant run.
+   Consonant,
+   /// A run of numeric characters.
+   Number,
+   /// A lone `'` that is directly followed by a vowel or ends the input.
+   Huhboo,
+   /// Any other separator seen after a vowel run that is directly followed
+   /// by a vowel or ends the input.
+   Glide,
+   /// Text that could not be classified as any other kind.
+   Unknown,
+   /// A piece of a vowel run.
+   Vowel,
+   /// A run of whitespace and Lojban stop characters.
+   Whitespace,
+}
+
+impl<'src_buf> Cluster<'src_buf> {
+   /// Returns the length of the covered source range, as reported by the
+   /// underlying `StrRange`.
+   pub fn len(&self) -> usize {
+      self.s.len()
+   }
+}
+
+// The state of the `clusterise` state machine: the class of the run that is
+// currently being accumulated.
+enum ClusterState {
+   // Initial state, before the first character has been classified.
+   Any,
+   // Just after an apostrophe or glide that followed a vowel run.
+   BetweenVowels,
+   // Inside a run of consonants.
+   Consonant,
+   // Inside a run of numeric characters.
+   Number,
+   // Inside unclassifiable text; only a whitespace character ends it.
+   Unknown,
+   // Inside a run of vowels.
+   Vowel,
+   // Inside a run of whitespace and stop characters.
+   Whitespace,
+}
+
+/// Appends the clusters for one finished run of consonants to `output`.
+///
+/// The run is divided with `split_consonants`. The first and the last piece
+/// become consonant clusters; all pieces between them are merged into a single
+/// unknown cluster. Clusters are pushed in source order so that consumers can
+/// rebuild contiguous ranges from them.
+fn push_consonant_run<'a>(output: &mut Vec<Cluster<'a>>, src: &'a str, run: &StrRange<'a>) {
+   let pieces = split_consonants(run.clone());
+   let last = pieces.len().saturating_sub(1);
+   let mut middle_start = run.start();
+   let mut middle_len = 0;
+   for (j, piece) in pieces.into_iter().enumerate() {
+      if j == 0 {
+         // The merged middle, if there is one, begins right after the first piece.
+         middle_start += piece.len();
+         output.push(Cluster::consonant(piece));
+      } else if j == last {
+         if middle_len > 0 {
+            output.push(Cluster::unknown(StrRange::new(src, middle_start, middle_len)));
+         }
+         output.push(Cluster::consonant(piece));
+      } else {
+         middle_len += piece.len();
+      }
+   }
+}
+
+/// Appends one vowel cluster per piece returned by `split_vowels` for `run`.
+fn push_vowel_run<'a>(output: &mut Vec<Cluster<'a>>, run: StrRange<'a>) {
+   output.extend(split_vowels(run).into_iter().map(Cluster::vowel));
+}
+
+/// Classifies a separator seen after a vowel run: a lone `'` is a huhboo,
+/// anything else is a glide.
+fn separator_cluster(sep: StrRange) -> Cluster {
+   if sep.as_str() == "'" {
+      Cluster::huhboo(sep)
+   } else {
+      Cluster::glide(sep)
+   }
+}
+
+/// Splits `s` into a flat sequence of classified clusters.
+///
+/// A state machine walks the characters of `s`. It grows the current range
+/// while characters of the same class continue and emits clusters whenever the
+/// class changes. Consonant runs are divided further by `split_consonants`,
+/// vowel runs by `split_vowels`. Once a run has been classified as unknown it
+/// extends up to the next whitespace character.
+pub fn clusterise(s: &str) -> Vec<Cluster> {
+   let mut output = vec![];
+   let mut temp_buf = StrRange::new(s, 0, 0);
+   let mut state = ClusterState::Any;
+   for (at, ch) in s.char_indices() {
+      let char_len = ch.len_utf8();
+      match state {
+         // Only the very first character is seen in this state.
+         ClusterState::Any => {
+            temp_buf.increase_length(char_len);
+            match ch {
+               e if e.is_lojban_apostrophe() || e.is_lojban_glide() || !e.is_lojbanic() => {
+                  state = ClusterState::Unknown
+               }
+               e if e.is_lojban_consonant() => state = ClusterState::Consonant,
+               e if e.is_lojban_vowel() => state = ClusterState::Vowel,
+               e if e.is_whitespace() || e.is_lojban_stop() => state = ClusterState::Whitespace,
+               e if e.is_numeric() => state = ClusterState::Number,
+               _ => unreachable![],
+            };
+         }
+         ClusterState::BetweenVowels => match ch {
+            e if e.is_lojban_vowel() => {
+               output.push(separator_cluster(temp_buf));
+               state = ClusterState::Vowel;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            // A separator that is not followed by a vowel is not a valid
+            // huhboo or glide, so the run degrades to unknown.
+            _ => {
+               state = ClusterState::Unknown;
+               temp_buf.increase_length(char_len);
+            }
+         },
+         ClusterState::Consonant => match ch {
+            e if e.is_lojban_consonant() => {
+               temp_buf.increase_length(char_len);
+            }
+            e if e.is_lojban_vowel() || e.is_whitespace() || e.is_lojban_stop() => {
+               push_consonant_run(&mut output, s, &temp_buf);
+               state = if ch.is_lojban_vowel() {
+                  ClusterState::Vowel
+               } else {
+                  ClusterState::Whitespace
+               };
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            e if e.is_lojban_apostrophe()
+               || e.is_lojban_glide()
+               || !e.is_lojbanic()
+               || e.is_numeric() =>
+            {
+               temp_buf.increase_length(char_len);
+               state = ClusterState::Unknown;
+            }
+            _ => unreachable![],
+         },
+         ClusterState::Number => match ch {
+            e if e.is_numeric() => {
+               temp_buf.increase_length(char_len);
+            }
+            e if e.is_lojban_apostrophe() || e.is_lojban_glide() => {
+               state = ClusterState::Unknown;
+               temp_buf.increase_length(char_len);
+            }
+            e if e.is_lojban_vowel()
+               || e.is_lojban_consonant()
+               || e.is_whitespace()
+               || e.is_lojban_stop() =>
+            {
+               output.push(Cluster::number(temp_buf));
+               state = if ch.is_lojban_vowel() {
+                  ClusterState::Vowel
+               } else if ch.is_lojban_consonant() {
+                  ClusterState::Consonant
+               } else {
+                  ClusterState::Whitespace
+               };
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            e if !e.is_lojbanic() => {
+               state = ClusterState::Unknown;
+               temp_buf.increase_length(char_len);
+            }
+            _ => unreachable![],
+         },
+         ClusterState::Unknown => match ch {
+            e if e.is_whitespace() => {
+               output.push(Cluster::unknown(temp_buf));
+               state = ClusterState::Whitespace;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            _ => temp_buf.increase_length(char_len),
+         },
+         ClusterState::Vowel => match ch {
+            e if e.is_lojban_apostrophe() || e.is_lojban_glide() => {
+               push_vowel_run(&mut output, temp_buf);
+               state = ClusterState::BetweenVowels;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            e if e.is_lojban_vowel() => {
+               temp_buf.increase_length(char_len);
+            }
+            e if e.is_lojban_consonant() || e.is_lojban_stop() || e.is_whitespace() => {
+               push_vowel_run(&mut output, temp_buf);
+               state = if ch.is_lojban_consonant() {
+                  ClusterState::Consonant
+               } else {
+                  ClusterState::Whitespace
+               };
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            // Mirrors the consonant state: a digit directly after letters
+            // turns the whole run into an unknown cluster instead of panicking.
+            e if e.is_numeric() || !e.is_lojbanic() => {
+               state = ClusterState::Unknown;
+               temp_buf.increase_length(char_len);
+            }
+            _ => unreachable![],
+         },
+         ClusterState::Whitespace => match ch {
+            e if e.is_lojban_apostrophe() || e.is_lojban_glide() => {
+               output.push(Cluster::whitespace(temp_buf));
+               state = ClusterState::Unknown;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            e if e.is_lojban_consonant() => {
+               output.push(Cluster::whitespace(temp_buf));
+               state = ClusterState::Consonant;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            e if e.is_lojban_vowel() => {
+               output.push(Cluster::whitespace(temp_buf));
+               state = ClusterState::Vowel;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            e if e.is_whitespace() || e.is_lojban_stop() => {
+               temp_buf.increase_length(char_len);
+            }
+            e if e.is_numeric() => {
+               output.push(Cluster::whitespace(temp_buf));
+               state = ClusterState::Number;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            e if !e.is_lojbanic() => {
+               output.push(Cluster::whitespace(temp_buf));
+               state = ClusterState::Unknown;
+               temp_buf = StrRange::new(s, at, char_len);
+            }
+            _ => unreachable![],
+         },
+      }
+   }
+   // Flush whatever run was still being accumulated when the input ended.
+   if !temp_buf.is_empty() {
+      match state {
+         ClusterState::Any => {}
+         ClusterState::BetweenVowels => output.push(separator_cluster(temp_buf)),
+         ClusterState::Consonant => push_consonant_run(&mut output, s, &temp_buf),
+         ClusterState::Number => output.push(Cluster::number(temp_buf)),
+         ClusterState::Unknown => output.push(Cluster::unknown(temp_buf)),
+         ClusterState::Vowel => push_vowel_run(&mut output, temp_buf),
+         ClusterState::Whitespace => output.push(Cluster::whitespace(temp_buf)),
+      }
+   }
+   output
+}
+
+// #[test]
+// fn periods_are_whitespace() {
+//    assert_eq![clusterise("tssssssssssssssi."), [Cluster::whitespace(".")]]
+// }
--- a/src/lex/mod.rs
+++ b/src/lex/mod.rs
@ -0,0 +1,260 @@
+mod cluster;
+mod pattern;
+use crate::{lojbanic::starts_with_permissible_initial_pair, strange::StrRange, Token};
+use cluster::{clusterise, Cluster, ClusterKind};
+use pattern::Pattern;
+
+/// Reports whether `clusters` starts with a gismu.
+///
+/// The front must strictly match `CVCCV` or `CCVCV`. If a cluster exists at
+/// index 5 it must be a consonant, number or whitespace cluster, and the first
+/// four clusters must span exactly five bytes.
+fn matches_gismu(clusters: &[Cluster]) -> bool {
+   let shape_ok =
+      Pattern::CVCCV.matches_strict(clusters) || Pattern::CCVCV.matches_strict(clusters);
+   if !shape_ok {
+      return false;
+   }
+   let follower = match clusters.get(5) {
+      Some(follower) => follower,
+      None => return true,
+   };
+   let follower_ok = matches!(
+      follower.kind,
+      ClusterKind::Consonant | ClusterKind::Number | ClusterKind::Whitespace
+   );
+   follower_ok && clusters.iter().take(4).map(|c| c.len()).sum::<usize>() == 5
+}
+
+/// Reports whether `clusters` starts like a lujvo: it matches `CCVCCV` or
+/// `CCV`, or it matches `CVCCV` and its first two clusters span at most three
+/// bytes.
+fn matches_lujvo(clusters: &[Cluster]) -> bool {
+   if Pattern::CCVCCV.matches(clusters) || Pattern::CCV.matches(clusters) {
+      return true;
+   }
+   if !Pattern::CVCCV.matches(clusters) {
+      return false;
+   }
+   let head_len: usize = clusters.iter().take(2).map(|c| c.len()).sum();
+   head_len <= 3
+}
+
+/// Reports whether `clusters` starts with a cmavo.
+///
+/// For a `CVCCV` front the decision rests on whether the consonant cluster at
+/// index 2 starts with a permissible initial pair. Otherwise any of
+/// `CVCCVCCV`, `CV` or `V` is accepted.
+fn matches_cmavo(clusters: &[Cluster]) -> bool {
+   if Pattern::CVCCV.matches(clusters) {
+      return starts_with_permissible_initial_pair(&clusters[2].s);
+   }
+   [Pattern::CVCCVCCV, Pattern::CV, Pattern::V]
+      .iter()
+      .any(|shape| shape.matches(clusters))
+}
+
+/// Reports whether `clusters` starts with a cmevla.
+///
+/// Scans until the slice ends or a number cluster is reached and answers
+/// whether the last cluster seen was a consonant. Meeting an unknown or
+/// whitespace cluster during the scan makes the answer `false`.
+fn matches_cmevla(clusters: &[Cluster]) -> bool {
+   let mut ends_on_consonant = false;
+   for cluster in clusters {
+      ends_on_consonant = match cluster.kind {
+         ClusterKind::Consonant => true,
+         ClusterKind::Huhboo | ClusterKind::Glide | ClusterKind::Vowel => false,
+         ClusterKind::Unknown | ClusterKind::Whitespace => return false,
+         ClusterKind::Number => break,
+      };
+   }
+   ends_on_consonant
+}
+
+/// Reports whether an unknown cluster occurs before the first whitespace
+/// cluster of `clusters`.
+fn matches_unknown(clusters: &[Cluster]) -> bool {
+   clusters
+      .iter()
+      .find(|c| matches!(c.kind, ClusterKind::Unknown | ClusterKind::Whitespace))
+      .map_or(false, |c| matches!(c.kind, ClusterKind::Unknown))
+}
+
+/// Consumes the leading consonant, vowel, huhboo and glide clusters of `rest`
+/// as one cmevla.
+///
+/// Returns the source range spanned by the consumed clusters and the clusters
+/// that remain. Panics if `rest` is empty.
+fn eat_cmevla<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
+   let mut name = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
+   let consumed = rest
+      .iter()
+      .take_while(|c| {
+         matches!(
+            c.kind,
+            ClusterKind::Consonant | ClusterKind::Vowel | ClusterKind::Huhboo | ClusterKind::Glide
+         )
+      })
+      .count();
+   for cluster in &rest[..consumed] {
+      name.increase_length(cluster.s.len());
+   }
+   (name, &rest[consumed..])
+}
+
+/// Consumes the first four clusters of `rest` as one gismu.
+///
+/// Returns the source range spanned by those clusters and the clusters that
+/// remain. Panics if `rest` holds fewer than four clusters.
+fn eat_gismu<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
+   const GISMU_CLUSTERS: usize = 4;
+   let mut word = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
+   let (gismu, tail) = rest.split_at(GISMU_CLUSTERS);
+   for cluster in gismu {
+      word.increase_length(cluster.len());
+   }
+   (word, tail)
+}
+
+/// Consumes clusters from the front of `rest` as one lujvo.
+///
+/// Returns the accumulated source range and the clusters left to lex. A
+/// consonant or vowel cluster whose text differs from its lowercased form is
+/// treated as stressed. Panics if `rest` is empty.
+// NOTE(review): once a stressed cluster has been seen, the next unstressed
+// consonant, vowel, huhboo or glide cluster is added to the returned range,
+// but `new_offset` is not advanced past it, so that cluster is also part of
+// the returned remainder. Confirm this is intended.
+fn eat_lujvo<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
+   let mut temp_buf = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
+   let mut new_offset = 0;
+   let mut stressed = false;
+   for (i, Cluster { s, kind }) in rest.iter().enumerate() {
+      match kind {
+         ClusterKind::Consonant | ClusterKind::Vowel => {
+            temp_buf.increase_length(s.len());
+            // Stress is detected by the presence of a non-lowercase character.
+            if s.as_str().to_lowercase() != *s.as_str() {
+               stressed = true;
+               new_offset += 1;
+               continue;
+            }
+         }
+         ClusterKind::Huhboo | ClusterKind::Glide => temp_buf.increase_length(s.len()),
+         _ => break,
+      }
+      // Stop at the first unstressed cluster that follows a stressed one.
+      if stressed {
+         break;
+      }
+      new_offset = i + 1;
+   }
+   (temp_buf, &rest[new_offset..])
+}
+
+/// Consumes clusters from the front of `rest` as one cmavo.
+///
+/// A consonant is accepted only as the very first consonant-or-vowel cluster;
+/// a consonant met after any consonant or vowel ends the word. Huhboo and
+/// glide clusters are always accepted, any other kind ends the word. At least
+/// one cluster is always dropped from the remainder. Panics if `rest` is
+/// empty.
+fn eat_cmavo<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
+   let mut word = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
+   let mut onset_closed = false;
+   let mut consumed = 0;
+   for cluster in rest {
+      match cluster.kind {
+         ClusterKind::Consonant if onset_closed => break,
+         ClusterKind::Consonant | ClusterKind::Vowel => onset_closed = true,
+         ClusterKind::Huhboo | ClusterKind::Glide => {}
+         _ => break,
+      }
+      word.increase_length(cluster.s.len());
+      consumed += 1;
+   }
+   // Even when nothing was accepted, the remainder starts after the first cluster.
+   (word, &rest[consumed.max(1)..])
+}
+
+/// Consumes the leading number clusters of `rest`.
+///
+/// Returns the source range spanned by them and the clusters that remain.
+/// Panics if `rest` is empty.
+fn eat_number<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
+   let mut digits = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
+   let consumed = rest
+      .iter()
+      .take_while(|c| matches!(c.kind, ClusterKind::Number))
+      .count();
+   for cluster in &rest[..consumed] {
+      digits.increase_length(cluster.s.len());
+   }
+   (digits, &rest[consumed..])
+}
+
+/// Consumes every cluster of `rest` up to the first whitespace cluster.
+///
+/// Returns the source range spanned by the clusters before the whitespace and
+/// the clusters that remain. Panics if `rest` is empty.
+fn eat_non_lojban<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
+   let mut junk = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
+   let body = rest
+      .iter()
+      .take_while(|c| !matches!(c.kind, ClusterKind::Whitespace))
+      .count();
+   for cluster in &rest[..body] {
+      junk.increase_length(cluster.s.len());
+   }
+   // A terminating whitespace cluster is skipped too, without being added to the range.
+   let resume = if body < rest.len() { body + 1 } else { body };
+   (junk, &rest[resume..])
+}
+
+/// Consumes the leading whitespace clusters of `rest`.
+///
+/// Returns the source range spanned by them and the clusters that remain.
+/// Panics if `rest` is empty.
+fn eat_whitespace<'a, 'b>(rest: &'b [Cluster<'a>]) -> (StrRange<'a>, &'b [Cluster<'a>]) {
+   let mut blank = StrRange::new(rest[0].s.src(), rest[0].s.start(), 0);
+   let consumed = rest
+      .iter()
+      .take_while(|c| matches!(c.kind, ClusterKind::Whitespace))
+      .count();
+   for cluster in &rest[..consumed] {
+      blank.increase_length(cluster.s.len());
+   }
+   (blank, &rest[consumed..])
+}
+
+/// Lexes `src` into a flat list of tokens.
+///
+/// The text is first clusterised. Then, until no clusters are left, the front
+/// of the remaining clusters is tested against each word class in a fixed
+/// order (unknown, cmevla, gismu, lujvo, cmavo) and the first class that
+/// matches consumes its clusters. When none matches, the kind of the first
+/// cluster decides between a number, whitespace or unknown token.
+pub fn lex(src: &str) -> Vec<Token> {
+   let clusters = clusterise(src);
+   let mut rest = clusters.as_slice();
+   let mut output = Vec::new();
+   loop {
+      let (token, remaining) = if matches_unknown(rest) {
+         let (buf, tail) = eat_non_lojban(rest);
+         (Token::unknown(buf.as_str()), tail)
+      } else if matches_cmevla(rest) {
+         let (buf, tail) = eat_cmevla(rest);
+         (Token::cmevla(buf.as_str()), tail)
+      } else if matches_gismu(rest) {
+         let (buf, tail) = eat_gismu(rest);
+         (Token::brivla(buf.as_str()), tail)
+      } else if matches_lujvo(rest) {
+         let (buf, tail) = eat_lujvo(rest);
+         (Token::brivla(buf.as_str()), tail)
+      } else if matches_cmavo(rest) {
+         let (buf, tail) = eat_cmavo(rest);
+         (Token::cmavo(buf.as_str()), tail)
+      } else {
+         // No word class matched: fall back on the kind of the first cluster.
+         let first = match rest.first() {
+            Some(first) => first,
+            None => break,
+         };
+         match first.kind {
+            ClusterKind::Number => {
+               let (buf, tail) = eat_number(rest);
+               (Token::number(buf.as_str()), tail)
+            }
+            ClusterKind::Whitespace => {
+               let (buf, tail) = eat_whitespace(rest);
+               (Token::whitespace(buf.as_str()), tail)
+            }
+            _ => {
+               let (buf, tail) = eat_non_lojban(rest);
+               (Token::unknown(buf.as_str()), tail)
+            }
+         }
+      };
+      output.push(token);
+      rest = remaining;
+   }
+   output
+}
--- a/src/lex/pattern.rs
+++ b/src/lex/pattern.rs
@ -0,0 +1,156 @@
+use super::{cluster::Cluster, cluster::ClusterKind};
+use Clust::*;
+
+/// One element of a [`Pattern`]: the shape a single cluster is expected to have.
+#[derive(Copy, Clone)]
+pub enum Clust {
+   /// A consonant cluster made of a single character.
+   C,
+   /// A consonant cluster made of more than one character.
+   Cc,
+   /// A vowel cluster.
+   V,
+}
+
+impl From<&'static [Clust]> for Pattern {
+   /// Wraps a static slice of cluster shapes in a [`Pattern`] without copying it.
+   fn from(pat: &'static [Clust]) -> Self {
+      Pattern { pat }
+   }
+}
+
+/// An ordered list of [`Clust`] shapes that a run of clusters can be tested against.
+pub struct Pattern {
+   /// The expected cluster shapes, in matching order.
+   pub pat: &'static [Clust],
+}
+
+impl Pattern {
+   /// Single consonant followed by a vowel cluster.
+   pub const CV: Self = Self { pat: &[C, V] };
+   /// A lone vowel cluster.
+   pub const V: Self = Self { pat: &[V] };
+   /// Multi-character consonant cluster followed by a vowel cluster.
+   pub const CCV: Self = Self { pat: &[Cc, V] };
+   /// Consonant cluster, vowel, single consonant, vowel.
+   pub const CCVCV: Self = Self {
+      pat: &[Cc, V, C, V],
+   };
+   /// Consonant cluster, vowel, consonant cluster, vowel.
+   pub const CCVCCV: Self = Self {
+      pat: &[Cc, V, Cc, V],
+   };
+   /// Single consonant, vowel, consonant cluster, vowel.
+   pub const CVCCV: Self = Self {
+      pat: &[C, V, Cc, V],
+   };
+   /// Single consonant, vowel, then two (consonant cluster, vowel) groups.
+   pub const CVCCVCCV: Self = Self {
+      pat: &[C, V, Cc, V, Cc, V],
+   };
+   /// Tests whether the leading clusters of `clusters` fit this pattern.
+   ///
+   /// The match is a prefix match: once every pattern element has been
+   /// satisfied the result is `true` even if clusters remain.
+   ///
+   /// * `strict == true`: every pattern element consumes exactly one cluster
+   ///   and `Cc` accepts any consonant cluster longer than one character.
+   /// * `strict == false`: `Cc` at a pattern position must be exactly two
+   ///   characters, and after a `V` has matched the matcher enters a
+   ///   "checking vowel" mode in which following apostrophe/glide + vowel
+   ///   clusters are absorbed into that same `V` (so `fa'i` fits `CV`).
+   fn matches_inner(&self, strict: bool, clusters: &[Cluster]) -> bool {
+      let mut pat = self.pat.iter();
+      let mut clusters = clusters.iter();
+      // True while the clusters that follow a matched `V` are being examined
+      // (only ever set in non-strict mode).
+      let mut checking_vowel = false;
+      // True when the previous cluster was an apostrophe or glide that still
+      // needs a vowel after it.
+      let mut previous_punct = false;
+      loop {
+         if checking_vowel {
+            if let Some(Cluster { s, kind }) = clusters.next() {
+               match kind {
+                  ClusterKind::Consonant => {
+                     // An apostrophe/glide must be followed by a vowel.
+                     if previous_punct {
+                        break false;
+                     }
+                     // The vowel run is over; this consonant has to satisfy
+                     // the next pattern element, if there is one.
+                     checking_vowel = false;
+                     if let Some(clust_kind) = pat.next() {
+                        match clust_kind {
+                           Clust::C => {
+                              if s.as_str().chars().count() > 1 {
+                                 break false;
+                              }
+                           }
+                           // NOTE(review): here `Cc` accepts any length above
+                           // one, while the non-strict branch further down
+                           // demands exactly two characters — confirm this
+                           // difference is intended.
+                           Clust::Cc => {
+                              if s.as_str().chars().count() <= 1 {
+                                 break false;
+                              }
+                           }
+                           Clust::V => break false,
+                        }
+                     } else {
+                        break true;
+                     }
+                  }
+                  // A non-word cluster ends the candidate: it matches only if
+                  // the whole pattern has been used up.
+                  ClusterKind::Whitespace | ClusterKind::Number | ClusterKind::Unknown => {
+                     if pat.next().is_some() {
+                        break false;
+                     } else {
+                        break true;
+                     }
+                  }
+                  // Two apostrophes/glides in a row are rejected.
+                  ClusterKind::Huhboo | ClusterKind::Glide => {
+                     if previous_punct {
+                        break false;
+                     } else {
+                        previous_punct = true
+                     }
+                  }
+                  ClusterKind::Vowel => {
+                     if previous_punct {
+                        // Vowel after apostrophe/glide: still part of the
+                        // same `V`, no pattern element is consumed.
+                        previous_punct = false;
+                     } else if let Some(clust_kind) = pat.next() {
+                        // A separate vowel cluster directly after a vowel
+                        // must be covered by another `V` in the pattern.
+                        match clust_kind {
+                           Clust::C | Clust::Cc => break false,
+                           Clust::V => {}
+                        }
+                     } else {
+                        break true;
+                     }
+                  }
+               }
+            // Input exhausted: fail on a dangling apostrophe/glide or on
+            // unmatched pattern elements.
+            } else if previous_punct || pat.next().is_some() {
+               break false;
+            } else {
+               break true;
+            }
+         } else if let Some(clust_kind) = pat.next() {
+            // Lock-step mode: one pattern element against one cluster.
+            if let Some(Cluster { s, kind }) = clusters.next() {
+               match kind {
+                  ClusterKind::Consonant => match clust_kind {
+                     Clust::C => {
+                        if s.as_str().chars().count() != 1 {
+                           break false;
+                        }
+                     }
+                     Clust::Cc => {
+                        if strict {
+                           if s.as_str().chars().count() <= 1 {
+                              break false;
+                           }
+                        } else if s.as_str().chars().count() != 2 {
+                           break false;
+                        }
+                     }
+                     Clust::V => break false,
+                  },
+                  ClusterKind::Whitespace
+                  | ClusterKind::Number
+                  | ClusterKind::Huhboo
+                  | ClusterKind::Glide
+                  | ClusterKind::Unknown => break false,
+                  ClusterKind::Vowel => match clust_kind {
+                     Clust::C | Clust::Cc => break false,
+                     Clust::V => {
+                        if !strict {
+                           checking_vowel = true;
+                        }
+                     }
+                  },
+               }
+            } else {
+               // Clusters ran out before the pattern did.
+               break false;
+            }
+         } else {
+            // Pattern fully consumed.
+            break true;
+         }
+      }
+   }
+   /// Non-strict match: see [`Pattern::matches_inner`] with `strict == false`.
+   pub(crate) fn matches(&self, clusters: &[Cluster]) -> bool {
+      self.matches_inner(false, clusters)
+   }
+   /// Strict match: see [`Pattern::matches_inner`] with `strict == true`.
+   pub(crate) fn matches_strict(&self, clusters: &[Cluster]) -> bool {
+      self.matches_inner(true, clusters)
+   }
+}
+
+// `Pattern::CV` must accept both a plain consonant-vowel word and one whose
+// vowel part is continued across an apostrophe.
+#[test]
+fn patterns_match() {
+   use super::clusterise;
+   for word in ["do", "fa'i"] {
+      assert![Pattern::CV.matches(&clusterise(word.into()))];
+   }
+}
--- a/src/lib.rs
+++ b/src/lib.rs
@ -0,0 +1,72 @@
+//! # elf_lojban
+//!
+//! elf_lojban: lex your lojban. Parser coming soon.
+//!
+//! ## Basic Usage
+//!
+//! ```
+//! use elf_lojban::lex;
+//!
+//! // Lex the text; whitespace comes back as tokens too
+//! let tokens = lex("mi prami do");
+//!
+//! assert_eq![
+//!   tokens.iter().map(|t| t.s ).collect::<Vec<&str>>(),
+//!   ["mi", " ", "prami", " ", "do"]
+//! ];
+//! ```
+
+mod lex;
+pub mod lojbanic;
+mod strange;
+pub use lex::*;
+
+/// A lexed token: a slice of the source text together with its classification.
+#[derive(Debug, Eq, PartialEq)]
+pub struct Token<'src_buf> {
+   /// The text of the token, borrowed for the `'src_buf` lifetime.
+   pub s: &'src_buf str,
+   /// What kind of token `s` was recognised as.
+   pub kind: TokenKind,
+}
+
+// Generates one constructor per `name, Variant` pair: `pub fn name(s) -> Self`
+// builds a token whose `kind` is `TokenKind::Variant`.
+// The expansion names the `'src_buf` lifetime, so it is meant to be invoked
+// inside an `impl<'src_buf>` block.
+macro_rules! gen_token_fns {
+   ($($nym:ident, $v:ident),*) => {$(
+      pub fn $nym(s: &'src_buf str) -> Self {
+         Self {
+            s,
+            kind: TokenKind::$v,
+         }
+      })*
+   };
+}
+
+impl<'src_buf> Token<'src_buf> {
+   // Arguments are `constructor_name, TokenKind variant` pairs; this expands to
+   // `Token::brivla`, `Token::cmavo`, `Token::cmevla`, `Token::number`,
+   // `Token::unknown` and `Token::whitespace`.
+   gen_token_fns![
+      brivla, Brivla, cmavo, Cmavo, cmevla, Cmevla, number, Number, unknown, Unknown, whitespace,
+      Whitespace
+   ];
+}
+
+/// Classification attached to every [`Token`] produced by the lexer.
+#[derive(Debug, Eq, PartialEq)]
+pub enum TokenKind {
+   /// Text the lexer matched as a gismu or a lujvo.
+   Brivla,
+   /// Text the lexer matched as a cmavo.
+   Cmavo,
+   /// Text the lexer matched as a cmevla.
+   Cmevla,
+   /// A run of numeral clusters.
+   Number,
+   /// Text that could not be classified as Lojban.
+   Unknown,
+   /// A run of whitespace.
+   Whitespace,
+}
+
+#[test]
+fn lexes() {
+   // Whitespace between words is reported as a token of its own.
+   let sentence = [
+      Token::cmavo("mi"),
+      Token::whitespace(" "),
+      Token::brivla("prami"),
+      Token::whitespace(" "),
+      Token::cmavo("do"),
+   ];
+   assert_eq![lex("mi prami do"), sentence];
+   // Unbroken inputs that are expected back as a single brivla token.
+   for word in ["garbage", "loprami"] {
+      assert_eq![lex(word), [Token::brivla(word)]];
+   }
+}
--- a/src/lojbanic.rs
+++ b/src/lojbanic.rs
@ -0,0 +1,241 @@
+use crate::strange::StrRange;
+
+// Character tables backing the `Lojbanic` predicates. Membership is tested
+// with `str::contains(char)`, so repeated letters inside a table are harmless.
+
+// Consonant letters, split by case.
+const CONSONANT_LOWER: &str = "bcdfgjklmnprstvzx";
+const CONSONANT_UPPER: &str = "BCDFGJKLMNPRSTVZX";
+// Voicing classes (both cases). `l m n r` are in neither table; they are
+// listed in SYLLABIC_CONSONANTS instead.
+const VOICED: &str = "BDGJVZbdgjvz";
+const UNVOICED: &str = "CFKPSTXcfkpstx";
+const SYLLABIC_CONSONANTS: &str = "lmnrLMNR";
+// Vowel letters, including accented variants, split by case.
+#[rustfmt::skip]
+const VOWEL_LOWER: &str = "aeiouyáạàảãăắặằẳẵâấậầẩẫeéẹèẻẽêếệềểễiíịìỉĩoóọòỏõôốộồổỗơớợờởỡuúụùủũưứựừửữyýỵỳỷỹ";
+#[rustfmt::skip]
+const VOWEL_UPPER: &str = "AEIOUYÁẠÀẢÃĂẮẶẰẲẴÂẤẬẦẨẪEÉẸÈẺẼÊẾỆỀỂỄIÍỊÌỈĨOÓỌÒỎÕÔỐỘỒỔỖƠỚỢỜỞỠUÚỤÙỦŨƯỨỰỪỬỮYÝỴỲỶỸ";
+// The two spellings accepted for the apostrophe.
+// NOTE(review): the "upper" form is the lower-case letter `h`, and no table
+// here contains `H` — confirm that is intended.
+const APOSTROPHE_LOWER: &str = "'";
+const APOSTROPHE_UPPER: &str = "h";
+const STOP: &str = ".";
+const GLIDE: &str = ",";
+const NUMERAL: &str = "0123456789";
+// Per-vowel tables: every accepted spelling (both cases) of one base vowel.
+const A: &str = "aáạàảãăắặằẳẵâấậầẩẫAÁẠÀẢÃĂẮẶẰẲẴÂẤẬẦẨẪ";
+const E: &str = "eeéẹèẻẽêếệềểễEEÉẸÈẺẼÊẾỆỀỂỄ";
+const I: &str = "iiíịìỉĩIIÍỊÌỈĨ";
+const O: &str = "ooóọòỏõôốộồổỗơớợờởỡOOÓỌÒỎÕÔỐỘỒỔỖƠỚỢỜỞỠ";
+const U: &str = "uuúụùủũưứựừửữUUÚỤÙỦŨƯỨỰỪỬỮ";
+const Y: &str = "yyýỵỳỷỹYYÝỴỲỶỸ";
+// Lower-case consonant pairs consulted by `starts_with_permissible_initial_pair`.
+const PERMISSIBLE_INITIAL_PAIRS: &[&str] = &[
+   "bl", "br", "cf", "ck", "cl", "cm", "cn", "cp", "cr", "ct", "dj", "dr", "dz", "fl", "fr", "gl",
+   "gr", "jb", "jd", "jg", "jm", "jv", "kl", "kr", "ml", "mr", "pl", "pr", "sf", "sk", "sl", "sm",
+   "sn", "sp", "sr", "st", "tc", "tr", "ts", "vl", "vr", "xl", "xr", "zb", "zd", "zg", "zm", "zv",
+];
+
+/// Character-class predicates for Lojban text.
+///
+/// Implemented in this module for `char` and, by delegation, for `Option<T>`
+/// (where `None` answers `false` to everything).
+pub trait Lojbanic {
+   /// Whether the value belongs to any recognised class (consonant, vowel,
+   /// apostrophe, numeral, glide or stop).
+   fn is_lojbanic(&self) -> bool;
+   /// Whether the value is a consonant letter of either case.
+   fn is_lojban_consonant(&self) -> bool;
+   /// Whether the value is a voiced consonant.
+   fn is_lojban_voiced(&self) -> bool;
+   /// Whether the value is an unvoiced consonant.
+   fn is_lojban_unvoiced(&self) -> bool;
+   /// Whether the value is one of `l`, `m`, `n`, `r` (either case).
+   fn is_lojban_syllabic_consonant(&self) -> bool;
+   /// Whether the value is a vowel letter of either case, accented or not.
+   fn is_lojban_vowel(&self) -> bool;
+   /// Whether the value is an upper-case letter.
+   fn is_lojban_uppercase(&self) -> bool;
+   /// Whether the value is a lower-case letter.
+   fn is_lojban_lowercase(&self) -> bool;
+   /// Whether the value is one of the accepted apostrophe spellings.
+   fn is_lojban_apostrophe(&self) -> bool;
+   /// Whether the value is the stop character `.`.
+   fn is_lojban_stop(&self) -> bool;
+   /// Whether the value is the glide character `,`.
+   fn is_lojban_glide(&self) -> bool;
+   /// Whether the value is any spelling of the vowel `a`.
+   fn is_lojban_a(&self) -> bool;
+   /// Whether the value is any spelling of the vowel `e`.
+   fn is_lojban_e(&self) -> bool;
+   /// Whether the value is any spelling of the vowel `i`.
+   fn is_lojban_i(&self) -> bool;
+   /// Whether the value is any spelling of the vowel `o`.
+   fn is_lojban_o(&self) -> bool;
+   /// Whether the value is any spelling of the vowel `u`.
+   fn is_lojban_u(&self) -> bool;
+   /// Whether the value is any spelling of the vowel `y`.
+   fn is_lojban_y(&self) -> bool;
+}
+
+impl Lojbanic for char {
+   /// True when the character appears in any of the character tables:
+   /// consonants, vowels, apostrophes, numerals, glide or stop.
+   fn is_lojbanic(&self) -> bool {
+      [
+         CONSONANT_LOWER,
+         CONSONANT_UPPER,
+         VOWEL_LOWER,
+         VOWEL_UPPER,
+         APOSTROPHE_LOWER,
+         APOSTROPHE_UPPER,
+         NUMERAL,
+         GLIDE,
+         STOP,
+      ]
+      .iter()
+      .any(|s| s.contains(*self))
+   }
+   /// True for a consonant letter of either case.
+   fn is_lojban_consonant(&self) -> bool {
+      self.is_lojbanic()
+         && [CONSONANT_LOWER, CONSONANT_UPPER]
+            .iter()
+            .any(|s| s.contains(*self))
+   }
+   /// True for a vowel letter of either case, accented or not.
+   fn is_lojban_vowel(&self) -> bool {
+      self.is_lojbanic() && [VOWEL_LOWER, VOWEL_UPPER].iter().any(|s| s.contains(*self))
+   }
+   /// True for an upper-case consonant or vowel, or the `APOSTROPHE_UPPER` spelling.
+   fn is_lojban_uppercase(&self) -> bool {
+      self.is_lojbanic()
+         && [CONSONANT_UPPER, VOWEL_UPPER, APOSTROPHE_UPPER]
+            .iter()
+            .any(|s| s.contains(*self))
+   }
+   /// True for a voiced consonant (`b d g j v z`, either case).
+   fn is_lojban_voiced(&self) -> bool {
+      self.is_lojbanic() && VOICED.contains(*self)
+   }
+   /// True for an unvoiced consonant (`c f k p s t x`, either case).
+   fn is_lojban_unvoiced(&self) -> bool {
+      self.is_lojbanic() && UNVOICED.contains(*self)
+   }
+   /// True for `l`, `m`, `n` or `r` (either case).
+   fn is_lojban_syllabic_consonant(&self) -> bool {
+      self.is_lojbanic() && SYLLABIC_CONSONANTS.contains(*self)
+   }
+   /// True for a lower-case consonant or vowel.
+   fn is_lojban_lowercase(&self) -> bool {
+      // Must test the LOWER consonant table; testing CONSONANT_UPPER here made
+      // `'b'` report false and `'B'` report true.
+      // NOTE(review): unlike `is_lojban_uppercase`, no apostrophe spelling is
+      // counted here — confirm whether APOSTROPHE_LOWER should be included.
+      self.is_lojbanic() && (CONSONANT_LOWER.contains(*self) || VOWEL_LOWER.contains(*self))
+   }
+   /// True for either accepted apostrophe spelling.
+   fn is_lojban_apostrophe(&self) -> bool {
+      [APOSTROPHE_LOWER, APOSTROPHE_UPPER]
+         .iter()
+         .any(|s| s.contains(*self))
+   }
+   /// True for the stop character `.`.
+   fn is_lojban_stop(&self) -> bool {
+      STOP.contains(*self)
+   }
+   /// True for the glide character `,`.
+   fn is_lojban_glide(&self) -> bool {
+      GLIDE.contains(*self)
+   }
+   /// True for any spelling of `a`.
+   fn is_lojban_a(&self) -> bool {
+      A.contains(*self)
+   }
+   /// True for any spelling of `e`.
+   fn is_lojban_e(&self) -> bool {
+      E.contains(*self)
+   }
+   /// True for any spelling of `i`.
+   fn is_lojban_i(&self) -> bool {
+      I.contains(*self)
+   }
+   /// True for any spelling of `o`.
+   fn is_lojban_o(&self) -> bool {
+      O.contains(*self)
+   }
+   /// True for any spelling of `u`.
+   fn is_lojban_u(&self) -> bool {
+      U.contains(*self)
+   }
+   /// True for any spelling of `y`.
+   fn is_lojban_y(&self) -> bool {
+      Y.contains(*self)
+   }
+}
+
+// For every listed method name, generates `fn name(&self) -> bool` that
+// forwards to the same method on the contained value. `self.iter()` yields
+// the value zero or one times, so `None` always answers `false`.
+macro_rules! option_passthrough_methods {
+   ($($nym:ident),*$(,)?) => {
+      $(fn $nym(&self) -> bool {
+         self.iter().any(|ch| ch.$nym())
+      })*
+   };
+}
+
+/// Lets an optional value be queried directly: `None` answers `false` to
+/// every predicate, `Some(v)` answers whatever `v` answers.
+impl<T: Lojbanic + Copy> Lojbanic for Option<T> {
+   // If `None`, false, otherwise `Some(ch).unwrap().{method}()`
+   option_passthrough_methods! {
+      is_lojbanic,
+      is_lojban_consonant,
+      is_lojban_vowel,
+      is_lojban_uppercase,
+      is_lojban_voiced,
+      is_lojban_unvoiced,
+      is_lojban_syllabic_consonant,
+      is_lojban_lowercase,
+      is_lojban_apostrophe,
+      is_lojban_stop,
+      is_lojban_glide,
+      is_lojban_a,
+      is_lojban_e,
+      is_lojban_i,
+      is_lojban_o,
+      is_lojban_u,
+      is_lojban_y,
+   }
+}
+
+/// Returns whether `left` immediately followed by `right` is a permissible
+/// consonant pair.
+///
+/// A pair is rejected when:
+/// * both letters are the same consonant (compared case-insensitively),
+/// * both letters come from the set `c j s z`,
+/// * it is one of `cx`, `kx`, `xc`, `xk`, `mz`,
+/// * one letter is voiced and the other unvoiced (`l m n r` are exempt and
+///   combine with either class).
+///
+/// A `left` that is not a consonant always yields `false`.
+pub fn is_valid_consonant_pair(left: char, right: char) -> bool {
+   const SIBILANTS: &str = "cjsz";
+   const FORBIDDEN_PAIRS: [(char, char); 5] =
+      [('c', 'x'), ('k', 'x'), ('x', 'c'), ('x', 'k'), ('m', 'z')];
+
+   // All consonant letters are ASCII, so ASCII case folding is sufficient to
+   // make `Cs`, `cS` and `CS` behave like `cs`.
+   let pair = (left.to_ascii_lowercase(), right.to_ascii_lowercase());
+   if pair.0 == pair.1
+      || (SIBILANTS.contains(pair.0) && SIBILANTS.contains(pair.1))
+      || FORBIDDEN_PAIRS.contains(&pair)
+   {
+      return false;
+   }
+   match left {
+      e if e.is_lojban_voiced() => right.is_lojban_voiced() || right.is_lojban_syllabic_consonant(),
+      e if e.is_lojban_unvoiced() => {
+         right.is_lojban_unvoiced() || right.is_lojban_syllabic_consonant()
+      }
+      e if e.is_lojban_syllabic_consonant() => true,
+      _ => false,
+   }
+}
+
+#[test]
+fn consonant_pairs() {
+   // For each left consonant, the right consonants that may follow it.
+   let permissible_pairs = [
+      ('b', "dgjvzmnlr"),
+      ('c', "fkptlrmn"),
+      ('d', "bgjvlmnzr"),
+      ('f', "ckpstxmnlr"),
+      ('g', "bdjvzmnlr"),
+      ('j', "bdgvmlnr"),
+      ('k', "cfpstmnlr"),
+      ('l', "bcdfgjkpstvxzmnr"),
+      ('m', "bcdfgjkpstvxlrn"),
+      ('n', "bcdfgjkpstvxzlmr"),
+      ('p', "cfkstxmnlr"),
+      ('r', "bcdfgjkpstvxzlmn"),
+      ('s', "fklprtmnx"),
+      ('t', "crsfklpxmn"),
+      ('v', "bdgjzmnlr"),
+      ('x', "fpstmnlr"),
+      ('z', "bdgvmlnr"),
+   ];
+   for (left, s) in permissible_pairs {
+      for right in s.chars() {
+         assert![is_valid_consonant_pair(left, right)];
+      }
+   }
+   // Pairs that must be rejected: doubled letters (in any case), voicing
+   // mismatches, two letters from `c j s z`, and the five special cases.
+   let forbidden_pairs = [
+      ('b', "bpBP"),
+      ('t', "tdT"),
+      ('c', "csxSX"),
+      ('j', "jz"),
+      ('s', "sc"),
+      ('z', "zj"),
+      ('k', "kx"),
+      ('x', "xck"),
+      ('m', "mz"),
+      ('C', "sS"),
+   ];
+   for (left, s) in forbidden_pairs {
+      for right in s.chars() {
+         assert![!is_valid_consonant_pair(left, right)];
+      }
+   }
+}
+
+/// Returns whether `left` immediately followed by `right` forms a vowel pair
+/// that is permitted outside names.
+///
+/// Accepted pairs are `ai`, `au`, `ei`, `oi`, and `i` or `u` followed by any
+/// vowel other than `y`. Anything else — including a non-vowel on either
+/// side — yields `false`; the function never panics and prints nothing.
+pub fn is_valid_vowel_pair(left: char, right: char) -> bool {
+   if left.is_lojban_a() {
+      right.is_lojban_i() || right.is_lojban_u()
+   } else if left.is_lojban_e() || left.is_lojban_o() {
+      right.is_lojban_i()
+   } else if left.is_lojban_i() || left.is_lojban_u() {
+      right.is_lojban_vowel() && !right.is_lojban_y()
+   } else {
+      // `y` on the left, or not a vowel at all.
+      false
+   }
+}
+
+/// Like [`is_valid_vowel_pair`], but additionally accepts `iy` and `uy`,
+/// which are only permitted inside names.
+pub fn is_valid_vowel_pair_name(left: char, right: char) -> bool {
+   is_valid_vowel_pair(left, right)
+      || ((left.is_lojban_i() || left.is_lojban_u()) && right.is_lojban_y())
+}
+
+/// Returns whether `s` may begin a word as far as its first two letters go.
+///
+/// Text of at most one byte is always accepted; longer text must start with
+/// one of the `PERMISSIBLE_INITIAL_PAIRS`. The comparison is case-sensitive.
+pub fn starts_with_permissible_initial_pair(s: &StrRange) -> bool {
+   s.len() <= 1
+      || PERMISSIBLE_INITIAL_PAIRS
+         .iter()
+         .any(|pair| s.as_str().starts_with(pair))
+}
+
+#[test]
+fn vowel_pairs() {
+   // Rows are the left vowel, columns the right vowel, both in `aeiouy` order.
+   let correct = [
+      [false, false, true, false, true, false],
+      [false, false, true, false, false, false],
+      [true, true, true, true, true, true],
+      [false, false, true, false, false, false],
+      [true, true, true, true, true, true],
+      [false, false, false, false, false, false],
+   ];
+   for (i, left) in "aeiouy".chars().enumerate() {
+      for (j, right) in "aeiouy".chars().enumerate() {
+         assert![is_valid_vowel_pair_name(left, right) == correct[i][j]];
+         // Outside names the only difference is that `iy` and `uy` are rejected.
+         let expected = correct[i][j] && right != 'y';
+         assert![is_valid_vowel_pair(left, right) == expected];
+      }
+   }
+   // A non-vowel on either side never forms a vowel pair.
+   assert![!is_valid_vowel_pair('b', 'a')];
+   assert![!is_valid_vowel_pair('i', 'b')];
+   assert![!is_valid_vowel_pair_name('u', 'b')];
+}
--- a/src/strange.rs
+++ b/src/strange.rs
@ -0,0 +1,53 @@
+use std::ops::Range;
+
+/// A sub-range of a source string, kept as the whole string plus a byte range
+/// so the range can be grown later without copying any text.
+#[derive(Debug, Clone, Eq, PartialEq)]
+pub struct StrRange<'src_buf> {
+   /// The complete source text the range refers into.
+   src: &'src_buf str,
+   /// Byte offsets into `src` (used to slice it in `as_str`).
+   range: Range<usize>,
+}
+
+impl<'src_buf> StrRange<'src_buf> {
+   /// Creates a view covering `length` bytes of `src`, beginning at byte
+   /// offset `start`. The range is not validated here.
+   pub fn new(src: &'src_buf str, start: usize, length: usize) -> Self {
+      let end = start + length;
+      Self { src, range: start..end }
+   }
+
+   /// The whole source string this view refers into.
+   pub fn src(&self) -> &'src_buf str {
+      self.src
+   }
+
+   /// Extends the end of the view by `add_len` bytes; the start is unchanged.
+   pub fn increase_length(&mut self, add_len: usize) {
+      self.range.end += add_len;
+   }
+
+   /// The text covered by the view.
+   ///
+   /// Slices `src`, so it panics if the range is out of bounds or does not
+   /// fall on character boundaries.
+   pub fn as_str(&self) -> &'src_buf str {
+      &self.src[self.range.start..self.range.end]
+   }
+
+   /// Length of the view in bytes.
+   pub fn len(&self) -> usize {
+      self.range.end.saturating_sub(self.range.start)
+   }
+
+   /// Whether the view covers no bytes at all.
+   pub fn is_empty(&self) -> bool {
+      self.range.start >= self.range.end
+   }
+
+   // pub fn offset_range(&self, range: Range<usize>) -> Self {
+   //    Self {
+   //       range: (self.range.start + range.start)..(self.range.start + range.end),
+   //       src: self.src,
+   //    }
+   // }
+
+   /// Byte offset in `src` at which the view begins.
+   pub fn start(&self) -> usize {
+      self.range.start
+   }
+}
+
+impl<'src_buf> core::fmt::Display for StrRange<'src_buf> {
+   /// Writes the covered text verbatim.
+   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> core::fmt::Result {
+      f.write_str(self.as_str())
+   }
+}