use std::cmp;
use std::iter::Filter;
use unic_ucd_segment::WordBreak as WB;
/// An iterator over the segments of a string produced by `WordBounds`, keeping only the
/// segments accepted by a caller-supplied predicate.
pub struct Words<'a> {
// Word-boundary segments of the input, passed through the `fn(&&str) -> bool` predicate.
inner: Filter<WordBounds<'a>, fn(&&str) -> bool>,
}
impl<'a> Iterator for Words<'a> {
    type Item = &'a str;

    /// Returns the next segment (from the front) that the filter predicate accepts, or
    /// `None` once the underlying segments are exhausted.
    #[inline]
    fn next(&mut self) -> Option<&'a str> {
        self.inner.next()
    }

    /// Forwards the size hint of the wrapped filtered iterator.
    ///
    /// Without this override the trait default `(0, None)` would be reported, discarding
    /// whatever upper bound the wrapped iterator is able to provide.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.inner.size_hint()
    }
}
impl<'a> DoubleEndedIterator for Words<'a> {
    /// Returns the last remaining segment that the filter predicate accepts, scanning from
    /// the back, or `None` when nothing is left.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        let filtered = &mut self.inner;
        filtered.next_back()
    }
}
impl<'a> Words<'a> {
    /// Creates an iterator over the word-boundary segments of `s` for which `filter`
    /// returns `true`.
    #[inline]
    pub fn new<'b>(s: &'b str, filter: fn(&&str) -> bool) -> Words<'b> {
        let segments = WordBounds::new(s);
        let inner = segments.filter(filter);
        Words { inner }
    }
}
/// A double-ended iterator that splits a string into consecutive segments at word
/// boundaries. Every byte of the input ends up in exactly one returned segment.
#[derive(Clone)]
pub struct WordBounds<'a> {
// The part of the input that has not been returned yet from either end.
string: &'a str,
// Word-break category cached by forward iteration for the next character it will examine,
// so that the category does not have to be looked up a second time.
cat: Option<WB>,
// Counterpart of `cat` used by backward iteration (`next_back`).
catb: Option<WB>,
}
/// Like `WordBounds`, but each segment is paired with its byte offset inside the string
/// the iterator was created from.
#[derive(Clone)]
pub struct WordBoundIndices<'a> {
// Address of the first byte of the original string; segment offsets are computed by
// subtracting this from the address of each returned segment.
start_offset: usize,
// The underlying segment iterator.
iter: WordBounds<'a>,
}
impl<'a> WordBoundIndices<'a> {
    /// Creates an iterator over `(byte offset, segment)` pairs for `s`.
    ///
    /// The address of `s` is recorded so that offsets can later be derived from the
    /// addresses of the returned segments.
    #[inline]
    pub fn new<'b>(s: &'b str) -> WordBoundIndices<'b> {
        let iter = WordBounds::new(s);
        let start_offset = s.as_ptr() as usize;
        WordBoundIndices { start_offset, iter }
    }

    /// Returns the portion of the original string that has not been yielded yet.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        let remaining: &'a str = self.iter.as_str();
        remaining
    }
}
impl<'a> Iterator for WordBoundIndices<'a> {
    type Item = (usize, &'a str);

    /// Returns the next segment from the front together with its byte offset relative to
    /// the start of the original string.
    #[inline]
    fn next(&mut self) -> Option<(usize, &'a str)> {
        match self.iter.next() {
            Some(segment) => {
                // The segment is a sub-slice of the original string, so the distance
                // between the two addresses is its byte offset.
                let offset = segment.as_ptr() as usize - self.start_offset;
                Some((offset, segment))
            }
            None => None,
        }
    }

    /// Same bounds as the underlying segment iterator: pairing with an offset does not
    /// change the number of items.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (lower, upper) = self.iter.size_hint();
        (lower, upper)
    }
}
impl<'a> DoubleEndedIterator for WordBoundIndices<'a> {
    /// Returns the last remaining segment together with its byte offset relative to the
    /// start of the original string.
    #[inline]
    fn next_back(&mut self) -> Option<(usize, &'a str)> {
        match self.iter.next_back() {
            Some(segment) => {
                // Offset = address of the segment minus address of the original string.
                let offset = segment.as_ptr() as usize - self.start_offset;
                Some((offset, segment))
            }
            None => None,
        }
    }
}
// State of the segment scanner used by `WordBounds::next` and `WordBounds::next_back`.
// Apart from `Start`, a state records which kind of character the segment scanned so far
// ends with (forward) or begins with (backward).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum WordBoundsState {
// No character of the segment has been classified yet.
Start,
// Entered on an `ALetter` character.
Letter,
// Entered on a `HebrewLetter` character.
HLetter,
// Entered on a `Numeric` character.
Numeric,
// Entered on a `Katakana` character.
Katakana,
// Entered on an `ExtendNumLet` character.
ExtendNumLet,
// Entered on a `RegionalIndicator` character; the payload tracks pairing progress.
Regional(RegionalState),
// Provisional state while scanning mid-word punctuation or ignorable characters; the
// payload says what is allowed or required to come next.
FormatExtend(FormatExtendType),
// Forward: the segment started with a `ZWJ`. Backward: a `GlueAfterZwj`/`EBaseGAZ`
// character (or an emoji base under a modifier) was taken and may be preceded by a `ZWJ`.
Zwj,
// Forward: entered on an `EBase`/`EBaseGAZ` character. Backward: entered on an `EModifier`.
Emoji,
}
// Payload of `WordBoundsState::FormatExtend`: what the scanner expects after the
// provisional character(s) it has just passed over.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum FormatExtendType {
// Only produced by backward iteration: the next character is handled exactly as in `Start`.
AcceptAny,
// A run of `Extend`/`Format`/`ZWJ` characters is being passed over.
AcceptNone,
// Set after `MidLetter`/`MidNumLet`/`SingleQuote` next to a letter: an `ALetter` or
// `HebrewLetter` must follow, otherwise the scanner rewinds to the saved position.
RequireLetter,
// Set after `DoubleQuote` next to a Hebrew letter: a `HebrewLetter` must follow,
// otherwise the scanner rewinds to the saved position.
RequireHLetter,
// Set for a `SingleQuote` that may be joined with a letter on its other side: forward,
// after a Hebrew letter; backward, when the quote is the first character examined.
AcceptQLetter,
// Set after `MidNum`/`MidNumLet`/`SingleQuote` next to a numeric: a `Numeric` must
// follow, otherwise the scanner rewinds to the saved position.
RequireNumeric,
}
// Pairing progress for `RegionalIndicator` characters, which are grouped two at a time.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum RegionalState {
// One indicator of a pair has been taken.
Half,
// Both indicators of a pair have been taken; a further indicator starts a new segment.
Full,
// Only used by backward iteration: an indicator has been taken but it is not yet known
// whether it is the first or the second of a pair. Forward iteration treats this value
// as unreachable.
Unknown,
}
// Forward iteration: every call to `next` splits one segment off the front of the
// remaining string, following the word-boundary rules of UAX #29 as encoded by the
// `WB` categories.
impl<'a> Iterator for WordBounds<'a> {
type Item = &'a str;
/// Returns `(min(len, 1), Some(len))`, where `len` is the byte length of the remaining
/// string: a non-empty string yields at least one segment, and every segment is at
/// least one byte long.
#[inline]
fn size_hint(&self) -> (usize, Option<usize>) {
let slen = self.string.len();
(cmp::min(slen, 1), Some(slen))
}
/// Returns the next segment from the front of the remaining string, or `None` when the
/// string is empty.
#[inline]
fn next(&mut self) -> Option<&'a str> {
use self::WordBoundsState::*;
use self::FormatExtendType::*;
if self.string.is_empty() {
return None;
}
// `take_curr`: whether the character at `idx` (where the scan stopped) belongs to the
// returned segment.
let mut take_curr = true;
// `take_cat`: when the current character is not taken, whether its category is cached in
// `self.cat` for the following call.
let mut take_cat = true;
// Byte offset of the character currently being examined.
let mut idx = 0;
// Position and category of a provisionally accepted mid-word punctuation character, used
// to rewind if the character required after it does not show up.
let mut saveidx = 0;
let mut state = Start;
let mut cat = WB::Other;
let mut savecat = WB::Other;
// Whether the previously examined character had category ZWJ. ZWJ characters are skipped
// without changing `state`, so this is tracked separately from the state machine.
let mut prev_zwj;
for (curr, ch) in self.string.char_indices() {
idx = curr;
// `cat` still holds the category of the previous character at this point.
prev_zwj = cat == WB::ZWJ;
// Use the cached category if there is one (consuming it), otherwise look it up.
cat = match self.cat {
None => WB::of(ch),
_ => self.cat.take().unwrap(),
};
take_cat = true;
// Once a segment has started, Extend/Format/ZWJ characters are absorbed into it without
// changing the state (UAX #29 rule WB4). At `Start` they go through the match below.
if state != Start {
match cat {
WB::Extend | WB::Format | WB::ZWJ => continue,
_ => {}
}
}
// Directly after a ZWJ, GlueAfterZwj and EBaseGAZ characters are absorbed as well; an
// EBaseGAZ additionally switches to the `Emoji` state so a following modifier can attach.
if prev_zwj {
match cat {
WB::GlueAfterZwj => continue,
WB::EBaseGAZ => {
state = Emoji;
continue;
}
_ => (),
}
}
// Main transition table. Arms either produce the next state or `break` to end the
// segment; `take_curr = false` means the segment ends before the current character.
state = match state {
// CR at the start: the segment is the CR, plus a directly following LF if present.
// Moving `idx` onto the LF makes the `take_curr` step after the loop consume it too.
Start if cat == WB::CR => {
idx += match self.get_next_cat(idx) {
Some(ncat) if ncat == WB::LF => 1,
_ => 0,
};
break;
}
Start => match cat {
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::Numeric => Numeric,
WB::Katakana => Katakana,
WB::ExtendNumLet => ExtendNumLet,
WB::RegionalIndicator => Regional(RegionalState::Half),
// A lone LF or other newline character is a segment by itself.
WB::LF | WB::Newline => break,
WB::ZWJ => Zwj,
WB::EBase | WB::EBaseGAZ => Emoji,
// Any other category: a one-character segment, unless Format/Extend/ZWJ characters
// follow, in which case the loop keeps running to absorb them. The looked-up category
// of the next character is cached so the next iteration reuses it.
_ => {
if let Some(ncat) = self.get_next_cat(idx) {
if ncat == WB::Format || ncat == WB::Extend || ncat == WB::ZWJ {
state = FormatExtend(AcceptNone);
self.cat = Some(ncat);
continue;
}
}
break;
}
},
// The segment started with a ZWJ and the current character was not absorbed by the
// checks above: end the segment before it.
Zwj => {
take_curr = false;
break;
}
Letter | HLetter => match cat {
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::Numeric => Numeric,
WB::ExtendNumLet => ExtendNumLet,
// Hebrew letter followed by a double quote: provisionally continue, but remember the
// quote so the scan can rewind unless another Hebrew letter follows.
WB::DoubleQuote if state == HLetter => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireHLetter)
}
// Hebrew letter followed by a single quote: no rewind position is saved for this
// case; a following letter continues the word (see `AcceptQLetter` below).
WB::SingleQuote if state == HLetter => {
FormatExtend(AcceptQLetter)
}
// Letter followed by mid-word punctuation: provisionally continue, remembering the
// punctuation so the scan can rewind unless a letter follows.
WB::MidLetter | WB::MidNumLet | WB::SingleQuote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireLetter)
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
WB::Numeric => Numeric,
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::ExtendNumLet => ExtendNumLet,
// Digit followed by mid-number punctuation: provisionally continue, remembering the
// punctuation so the scan can rewind unless another digit follows.
WB::MidNum | WB::MidNumLet | WB::SingleQuote => {
savecat = cat;
saveidx = idx;
FormatExtend(RequireNumeric)
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
WB::Katakana => Katakana,
WB::ExtendNumLet => ExtendNumLet,
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
WB::ExtendNumLet => ExtendNumLet,
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::Numeric => Numeric,
WB::Katakana => Katakana,
_ => {
take_curr = false;
break;
}
},
// A complete pair of regional indicators never extends to a further character
// (Extend/Format/ZWJ were already absorbed before reaching this match).
Regional(RegionalState::Full) => {
take_curr = false;
break;
}
Regional(RegionalState::Half) => match cat {
WB::RegionalIndicator => Regional(RegionalState::Full),
_ => {
take_curr = false;
break;
}
},
Regional(_) => {
unreachable!("RegionalState::Unknown should not occur on forward iteration")
}
// An emoji base keeps absorbing emoji modifiers; anything else ends the segment.
Emoji => match cat {
WB::EModifier => state,
_ => {
take_curr = false;
break;
}
},
FormatExtend(t) => match t {
// The required/accepted character showed up: the provisional punctuation becomes part
// of the word and scanning continues in the matching state.
RequireNumeric if cat == WB::Numeric => Numeric,
RequireLetter | AcceptQLetter if cat == WB::ALetter => Letter,
RequireLetter | AcceptQLetter if cat == WB::HebrewLetter => HLetter,
RequireHLetter if cat == WB::HebrewLetter => HLetter,
// Everything absorbed so far is kept; the segment ends before the current character
// and its category is not cached.
AcceptNone | AcceptQLetter => {
take_curr = false;
take_cat = false;
break;
}
// A `Require*` expectation was not met: the rewind is done after the loop.
_ => break,
},
}
}
// If the scan ended (by `break` or by running out of input) while a required character
// was still outstanding, rewind to the saved punctuation character: the segment ends
// before it and its category is what gets cached.
if let FormatExtend(t) = state {
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric {
idx = saveidx;
cat = savecat;
take_curr = false;
}
}
// Decide where to cut and what to cache for the next call.
self.cat = if take_curr {
// Include the current character: move `idx` past it.
idx += self.string[idx..].chars().next().unwrap().len_utf8();
None
} else if take_cat {
// The character at `idx` starts the next segment; remember its category.
Some(cat)
} else {
None
};
// Split the remaining string at `idx`: the front part is returned, the rest is kept.
let retstr = &self.string[..idx];
self.string = &self.string[idx..];
Some(retstr)
}
}
// Backward iteration: every call to `next_back` splits one segment off the end of the
// remaining string. The scan walks the characters in reverse, so "next" in the comments
// below means the character to the left of the current one.
impl<'a> DoubleEndedIterator for WordBounds<'a> {
/// Returns the last segment of the remaining string, or `None` when the string is empty.
#[inline]
fn next_back(&mut self) -> Option<&'a str> {
use self::WordBoundsState::*;
use self::FormatExtendType::*;
if self.string.is_empty() {
return None;
}
// `take_curr`: whether the character at `idx` (where the scan stopped) belongs to the
// returned segment.
let mut take_curr = true;
// `take_cat`: when the current character is not taken, whether its category is cached in
// `self.catb` for the following call.
let mut take_cat = true;
// `idx` starts at the byte offset of the last character.
let mut idx = self.string.len();
idx -= self.string.chars().next_back().unwrap().len_utf8();
// `previdx`: offset of the character examined just before the current one, i.e. its
// right-hand neighbour; this is where the segment starts if the current one is not taken.
let mut previdx = idx;
// `saveidx`/`savestate`: boundary candidate and state saved before entering a
// provisional `FormatExtend` state, so they can be restored or rewound to.
let mut saveidx = idx;
let mut state = Start;
let mut savestate = Start;
let mut cat = WB::Other;
for (curr, ch) in self.string.char_indices().rev() {
previdx = idx;
idx = curr;
// Use the cached category if there is one (consuming it), otherwise look it up.
cat = match self.catb {
None => WB::of(ch),
_ => self.catb.take().unwrap(),
};
take_cat = true;
// Handling of ignorable characters (Extend, Format, and ZWJ unless the `Zwj` state is
// explicitly waiting for one). Going backwards they are met before the character they
// attach to, so the current state is parked in `savestate` while the run is skipped.
if cat == WB::Extend || cat == WB::Format || (cat == WB::ZWJ && state != Zwj) {
// Park the state only on the first ignorable character of a run, and never at `Start`.
if match state {
FormatExtend(_) | Start => false,
_ => true,
} {
saveidx = previdx;
savestate = state;
state = FormatExtend(AcceptNone);
}
// At `Start` the character falls through to the transition table below.
if state != Start {
continue;
}
} else if state == FormatExtend(AcceptNone) {
// The run of ignorable characters ended at a regular character: restore the parked
// state and boundary candidate and let the table below decide about this character.
state = savestate;
previdx = saveidx;
take_cat = false;
}
// Main transition table. Arms either produce the next state or `break` to end the
// segment; `take_curr = false` means the segment starts after the current character.
state = match state {
Start | FormatExtend(AcceptAny) => match cat {
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::Numeric => Numeric,
WB::Katakana => Katakana,
WB::ExtendNumLet => ExtendNumLet,
// Whether this indicator is the first or second of a pair is resolved lazily in the
// `Regional` arm below.
WB::RegionalIndicator => Regional(RegionalState::Unknown),
WB::GlueAfterZwj | WB::EBaseGAZ => Zwj,
WB::Extend | WB::Format | WB::ZWJ => FormatExtend(AcceptAny),
// A single quote examined first: it joins the preceding text only if that is a Hebrew
// letter (see the `FormatExtend` arm); otherwise the scan rewinds to `saveidx`.
WB::SingleQuote => {
saveidx = idx;
FormatExtend(AcceptQLetter)
}
WB::EModifier => Emoji,
WB::CR | WB::LF | WB::Newline => {
if state == Start {
// A newline examined first is a segment by itself; an LF directly preceded by a CR
// takes the CR along.
if cat == WB::LF {
idx -= match self.get_prev_cat(idx) {
Some(pcat) if pcat == WB::CR => 1,
_ => 0,
};
}
} else {
// Reached through `AcceptAny`: the newline is not part of this segment.
take_curr = false;
}
break;
}
_ => break,
},
// Waiting for the ZWJ that may precede the character(s) already taken.
Zwj => match cat {
WB::ZWJ => FormatExtend(AcceptAny),
_ => {
take_curr = false;
break;
}
},
Letter | HLetter => match cat {
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::Numeric => Numeric,
WB::ExtendNumLet => ExtendNumLet,
// Double quote before a Hebrew letter: provisionally continue; rewind to `saveidx`
// unless a Hebrew letter precedes the quote.
WB::DoubleQuote if state == HLetter => {
saveidx = previdx;
FormatExtend(RequireHLetter)
}
// Mid-word punctuation before a letter: provisionally continue; rewind to `saveidx`
// unless a letter precedes it.
WB::MidLetter | WB::MidNumLet | WB::SingleQuote => {
saveidx = previdx;
FormatExtend(RequireLetter)
}
_ => {
take_curr = false;
break;
}
},
Numeric => match cat {
WB::Numeric => Numeric,
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::ExtendNumLet => ExtendNumLet,
// Mid-number punctuation before a digit: provisionally continue; rewind to `saveidx`
// unless a digit precedes it.
WB::MidNum | WB::MidNumLet | WB::SingleQuote => {
saveidx = previdx;
FormatExtend(RequireNumeric)
}
_ => {
take_curr = false;
break;
}
},
Katakana => match cat {
WB::Katakana => Katakana,
WB::ExtendNumLet => ExtendNumLet,
_ => {
take_curr = false;
break;
}
},
ExtendNumLet => match cat {
WB::ExtendNumLet => ExtendNumLet,
WB::ALetter => Letter,
WB::HebrewLetter => HLetter,
WB::Numeric => Numeric,
WB::Katakana => Katakana,
_ => {
take_curr = false;
break;
}
},
Regional(mut regional_state) => match cat {
WB::RegionalIndicator => {
if regional_state == RegionalState::Unknown {
// Count the contiguous regional indicators in `string[..previdx]` (the current
// character included), skipping ZWJ/Extend/Format. An even count means those
// indicators pair up among themselves, so the indicator already taken stands alone.
let count = self.string[..previdx]
.chars()
.rev()
.map(WB::of)
.filter(|&c| !(c == WB::ZWJ || c == WB::Extend || c == WB::Format))
.take_while(|&c| c == WB::RegionalIndicator)
.count();
regional_state = if count % 2 == 0 {
RegionalState::Full
} else {
RegionalState::Half
};
}
if regional_state == RegionalState::Full {
take_curr = false;
break;
} else {
Regional(RegionalState::Full)
}
}
_ => {
take_curr = false;
break;
}
},
// An emoji modifier was taken; only an emoji base may precede it within the segment.
Emoji => match cat {
WB::EBase | WB::EBaseGAZ => Zwj,
_ => {
take_curr = false;
break;
}
},
FormatExtend(t) => match t {
// The required/accepted character showed up: the provisional punctuation stays part
// of the word and scanning continues in the matching state.
RequireNumeric if cat == WB::Numeric => Numeric,
RequireLetter if cat == WB::ALetter => Letter,
RequireLetter if cat == WB::HebrewLetter => HLetter,
AcceptQLetter if cat == WB::HebrewLetter => HLetter,
RequireHLetter if cat == WB::HebrewLetter => HLetter,
// Expectation not met: the rewind is done after the loop.
_ => break,
},
}
}
// If the scan ended (by `break` or by running out of input) in a provisional state other
// than `AcceptAny`, rewind: the segment starts at the saved position and no category is
// cached.
if let FormatExtend(t) = state {
if t == RequireLetter || t == RequireHLetter || t == RequireNumeric || t == AcceptNone
|| t == AcceptQLetter
{
previdx = saveidx;
take_cat = false;
take_curr = false;
}
}
// Decide where to cut and what to cache for the next call.
self.catb = if take_curr {
// The current character is included: the segment starts at `idx`.
None
} else {
// The segment starts after the current character.
idx = previdx;
if take_cat {
Some(cat)
} else {
None
}
};
// Split the remaining string at `idx`: the back part is returned, the rest is kept.
let retstr = &self.string[idx..];
self.string = &self.string[..idx];
Some(retstr)
}
}
impl<'a> WordBounds<'a> {
    /// Creates a segment iterator over `s` with no cached categories.
    #[inline]
    pub fn new(s: &str) -> WordBounds {
        WordBounds {
            cat: None,
            catb: None,
            string: s,
        }
    }

    /// Returns the portion of the original string that has not been yielded yet.
    #[inline]
    pub fn as_str(&self) -> &'a str {
        let remaining: &'a str = self.string;
        remaining
    }

    /// Returns the word-break category of the character that follows the character
    /// starting at byte offset `idx`, or `None` if that character is the last one.
    ///
    /// Panics if no character starts at `idx`.
    #[inline]
    fn get_next_cat(&self, idx: usize) -> Option<WB> {
        let mut following = self.string[idx..].chars();
        // Step over the character at `idx` itself; it has to exist.
        let _current = following.next().unwrap();
        following.next().map(WB::of)
    }

    /// Returns the word-break category of the character that ends right before byte
    /// offset `idx`, or `None` if `idx` is the start of the string.
    #[inline]
    fn get_prev_cat(&self, idx: usize) -> Option<WB> {
        // An empty prefix (`idx == 0`) has no last character, which yields `None`.
        self.string[..idx].chars().next_back().map(WB::of)
    }
}
#[cfg(test)]
mod tests {
    use super::{WordBounds, Words};
    use unic_ucd_common::is_alphanumeric;

    /// `WordBounds` returns every segment, including whitespace and punctuation.
    #[test]
    fn test_word_bounds() {
        // The input contains TWO spaces before "fox"; each space is its own segment, which
        // is why the expected list ends with `" ", " ", "fox"`.
        assert_eq!(
            WordBounds::new("The quick (\"brown\")  fox").collect::<Vec<&str>>(),
            &[
                "The", " ", "quick", " ", "(", "\"", "brown", "\"", ")", " ", " ", "fox"
            ]
        );
    }

    /// `Words` keeps only the segments accepted by the predicate, here those containing at
    /// least one alphanumeric character.
    #[test]
    fn test_words() {
        assert_eq!(
            Words::new(
                "The quick (\"brown\") fox can't jump 32.3 feet, right?",
                |s: &&str| s.chars().any(is_alphanumeric),
            ).collect::<Vec<&str>>(),
            &[
                "The", "quick", "brown", "fox", "can't", "jump", "32.3", "feet", "right"
            ]
        );
    }
}