The roe from artichoke

Implement titlecase iterator

Implement a Titlecase iterator similar to the Lowercase and Uppercase iterators already in this crate.

Unlike lowercasing and uppercasing, which can build on char::to_lowercase and char::to_uppercase, titlecasing has no counterpart in std (there is no char::to_titlecase), so we'll have to generate mappings for the titlecase transforms.

One way we might do that is to roll some Ruby scripts like focaccia does for generating case folding mappings:

Another way we might tackle this is with codegen from crates in the Rust ecosystem:

https://crates.io/crates/ucd-generate

Use the source layout of the Lowercase iterator as a guide:

https://github.com/artichoke/roe/blob/acdab1e6b2b1eea475e8eea8a0a7c616ce7bf850/src/lowercase.rs
https://github.com/artichoke/roe/tree/acdab1e6b2b1eea475e8eea8a0a7c616ce7bf850/src/lowercase

roe/src/lib.rs

Lines 180 to 320 in acdab1e

 /// Options to configure the behavior of [`lowercase`].
 ///
 /// Which letters exactly are replaced, and by which other letters, depends on
 /// the given options.
 ///
 /// See individual variants for a description of the available behaviors.
 ///
 /// If you're not sure which mode to choose, [`LowercaseMode::Full`] is a good
 /// default.
 ///
 /// [`lowercase`]: crate::lowercase()
 // NOTE: variant order is significant; the derived `PartialOrd`/`Ord` compare
 // variants by their declaration order.
 #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
 pub enum LowercaseMode {
 /// Full Unicode case mapping, suitable for most languages.
 ///
 /// See the [Turkic] and [Lithuanian] variants for exceptions.
 ///
 /// Context-dependent case mapping as described in Table 3-14 of the Unicode
 /// standard is currently not supported.
 ///
 /// [Turkic]: Self::Turkic
 /// [Lithuanian]: Self::Lithuanian
 Full,
 /// Only the ASCII region, i.e. the characters `'A'..='Z'` and `'a'..='z'`,
 /// are affected.
 ///
 /// This option cannot be combined with any other option.
 Ascii,
 /// Full Unicode case mapping, adapted for Turkic languages (Turkish,
 /// Azerbaijani, …).
 ///
 /// This means that upper case I is mapped to lower case dotless i, and so
 /// on.
 Turkic,
 /// Currently, just [full Unicode case mapping].
 ///
 /// In the future, full Unicode case mapping adapted for Lithuanian (keeping
 /// the dot on the lower case i even if there is an accent on top).
 ///
 /// [full Unicode case mapping]: Self::Full
 Lithuanian,
 /// Unicode case **folding**, which is more far-reaching than Unicode case
 /// mapping.
 ///
 /// This option currently cannot be combined with any other option (i.e.
 /// there is currently no variant for Turkic languages).
 Fold,
 }

 impl Default for LowercaseMode { 

 fn default() -> Self { 

 Self::Full 

 } 

 } 

 impl TryFrom<&str> for LowercaseMode { 

 type Error = InvalidCaseMappingMode; 

 #[inline] 

 fn try_from(value: &str) -> Result<Self, Self::Error> { 

 value.as_bytes().try_into() 

 } 

 } 

 impl TryFrom<Option<&str>> for LowercaseMode { 

 type Error = InvalidCaseMappingMode; 

 #[inline] 

 fn try_from(value: Option<&str>) -> Result<Self, Self::Error> { 

 value.map(str::as_bytes).try_into() 

 } 

 } 

 impl TryFrom<&[u8]> for LowercaseMode {
     type Error = InvalidCaseMappingMode;

     /// Parses a mode name given as a byte slice.
     ///
     /// Only the exact byte strings `ascii`, `turkic`, `lithuanian` and `fold`
     /// are recognized; this conversion has no spelling for
     /// [`LowercaseMode::Full`].
     ///
     /// # Errors
     ///
     /// Returns [`InvalidCaseMappingMode`] for any other input.
     #[inline]
     fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
         let mode = match value {
             b"fold" => Self::Fold,
             b"lithuanian" => Self::Lithuanian,
             b"turkic" => Self::Turkic,
             b"ascii" => Self::Ascii,
             _ => return Err(InvalidCaseMappingMode::new()),
         };
         Ok(mode)
     }
 }

 impl TryFrom<Option<&[u8]>> for LowercaseMode {
     type Error = InvalidCaseMappingMode;

     /// Parses an optional mode name given as a byte slice.
     ///
     /// `None` selects [`LowercaseMode::Full`]. A present name must be exactly
     /// one of `ascii`, `turkic`, `lithuanian` or `fold`.
     ///
     /// # Errors
     ///
     /// Returns [`InvalidCaseMappingMode`] when a name is present but is not
     /// one of the recognized byte strings.
     #[inline]
     fn try_from(value: Option<&[u8]>) -> Result<Self, Self::Error> {
         // An absent name means "use the default full mapping".
         let name = match value {
             Some(name) => name,
             None => return Ok(Self::Full),
         };
         match name {
             b"fold" => Ok(Self::Fold),
             b"lithuanian" => Ok(Self::Lithuanian),
             b"turkic" => Ok(Self::Turkic),
             b"ascii" => Ok(Self::Ascii),
             _ => Err(InvalidCaseMappingMode::new()),
         }
     }
 }

 impl FromStr for LowercaseMode {
     type Err = InvalidCaseMappingMode;

     /// Parses a mode name, enabling `"ascii".parse::<LowercaseMode>()`.
     ///
     /// This is a thin wrapper around the `TryFrom<&str>` conversion.
     #[inline]
     fn from_str(s: &str) -> Result<Self, Self::Err> {
         Self::try_from(s)
     }
 }

 /// Creates an iterator over a lowercased copy of the bytes in `slice`.
 ///
 /// The slice is treated as a [conventionally UTF-8 string]: UTF-8 byte
 /// sequences are converted to their Unicode lowercase equivalents, while
 /// invalid UTF-8 byte sequences are yielded as is.
 ///
 /// `options` selects the case mapping mode; see [`LowercaseMode`] for the
 /// available modes. [`LowercaseMode::Lithuanian`] currently uses the same
 /// iterator as [`LowercaseMode::Full`].
 ///
 /// # Panics
 ///
 /// Not every [`LowercaseMode`] is implemented yet. This function panics when
 /// called with the [Turkic] or [case folding] mode.
 ///
 /// [conventionally UTF-8 string]: https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings
 /// [Turkic]: LowercaseMode::Turkic
 /// [case folding]: LowercaseMode::Fold
 // TODO: make this const once we're no longer panicking.
 pub fn lowercase(slice: &[u8], options: LowercaseMode) -> Lowercase<'_> {
     match options {
         LowercaseMode::Ascii => Lowercase::with_ascii_slice(slice),
         LowercaseMode::Full => Lowercase::with_slice(slice),
         LowercaseMode::Lithuanian => Lowercase::with_slice(slice),
         // TODO: implement `turkic` and `fold` modes.
         LowercaseMode::Fold => panic!("lowercase case folding mode is not yet implemented"),
         LowercaseMode::Turkic => panic!("lowercase Turkic mode is not yet implemented"),
     }
 }

Implementation steps

Implement TitlecaseMode.
Implement ASCII iterator.
Implement Full iterator.
Wire everything up in a free function entrypoint in lib.rs.

artichoke / roe Goto Github PK

roe's People

Contributors

Stargazers

Watchers

roe's Issues

Add cargo-spellcheck config

Implement titlecase iterator

Implementation steps

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent

	/// Options to configure the behavior of [`lowercase`].
	///
	/// Which letters exactly are replaced, and by which other letters, depends on
	/// the given options.
	///
	/// See individual variants for a description of the available behaviors.
	///
	/// If you're not sure which mode to choose, [`LowercaseMode::Full`] is a good
	/// default.
	///
	/// [`lowercase`]: crate::lowercase()
	// NOTE: variant order is significant; the derived `PartialOrd`/`Ord` compare
	// variants by their declaration order.
	#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
	pub enum LowercaseMode {
	/// Full Unicode case mapping, suitable for most languages.
	///
	/// See the [Turkic] and [Lithuanian] variants for exceptions.
	///
	/// Context-dependent case mapping as described in Table 3-14 of the Unicode
	/// standard is currently not supported.
	///
	/// [Turkic]: Self::Turkic
	/// [Lithuanian]: Self::Lithuanian
	Full,
	/// Only the ASCII region, i.e. the characters `'A'..='Z'` and `'a'..='z'`,
	/// are affected.
	///
	/// This option cannot be combined with any other option.
	Ascii,
	/// Full Unicode case mapping, adapted for Turkic languages (Turkish,
	/// Azerbaijani, …).
	///
	/// This means that upper case I is mapped to lower case dotless i, and so
	/// on.
	Turkic,
	/// Currently, just [full Unicode case mapping].
	///
	/// In the future, full Unicode case mapping adapted for Lithuanian (keeping
	/// the dot on the lower case i even if there is an accent on top).
	///
	/// [full Unicode case mapping]: Self::Full
	Lithuanian,
	/// Unicode case folding, which is more far-reaching than Unicode case
	/// mapping.
	///
	/// This option currently cannot be combined with any other option (i.e.
	/// there is currently no variant for Turkic languages).
	Fold,
	}

	impl Default for LowercaseMode {
	fn default() -> Self {
	Self::Full
	}
	}

	impl TryFrom<&str> for LowercaseMode {
	type Error = InvalidCaseMappingMode;

	#[inline]
	fn try_from(value: &str) -> Result<Self, Self::Error> {
	value.as_bytes().try_into()
	}
	}

	impl TryFrom<Option<&str>> for LowercaseMode {
	type Error = InvalidCaseMappingMode;

	#[inline]
	fn try_from(value: Option<&str>) -> Result<Self, Self::Error> {
	value.map(str::as_bytes).try_into()
	}
	}

	impl TryFrom<&[u8]> for LowercaseMode {
	    type Error = InvalidCaseMappingMode;

	    /// Parses a mode name given as a byte slice.
	    ///
	    /// Only the exact byte strings `ascii`, `turkic`, `lithuanian` and `fold`
	    /// are recognized; this conversion has no spelling for
	    /// [`LowercaseMode::Full`].
	    ///
	    /// # Errors
	    ///
	    /// Returns [`InvalidCaseMappingMode`] for any other input.
	    #[inline]
	    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
	        let mode = match value {
	            b"fold" => Self::Fold,
	            b"lithuanian" => Self::Lithuanian,
	            b"turkic" => Self::Turkic,
	            b"ascii" => Self::Ascii,
	            _ => return Err(InvalidCaseMappingMode::new()),
	        };
	        Ok(mode)
	    }
	}

	impl TryFrom<Option<&[u8]>> for LowercaseMode {
	    type Error = InvalidCaseMappingMode;

	    /// Parses an optional mode name given as a byte slice.
	    ///
	    /// `None` selects [`LowercaseMode::Full`]. A present name must be exactly
	    /// one of `ascii`, `turkic`, `lithuanian` or `fold`.
	    ///
	    /// # Errors
	    ///
	    /// Returns [`InvalidCaseMappingMode`] when a name is present but is not
	    /// one of the recognized byte strings.
	    #[inline]
	    fn try_from(value: Option<&[u8]>) -> Result<Self, Self::Error> {
	        // An absent name means "use the default full mapping".
	        let name = match value {
	            Some(name) => name,
	            None => return Ok(Self::Full),
	        };
	        match name {
	            b"fold" => Ok(Self::Fold),
	            b"lithuanian" => Ok(Self::Lithuanian),
	            b"turkic" => Ok(Self::Turkic),
	            b"ascii" => Ok(Self::Ascii),
	            _ => Err(InvalidCaseMappingMode::new()),
	        }
	    }
	}

	impl FromStr for LowercaseMode {
	    type Err = InvalidCaseMappingMode;

	    /// Parses a mode name, enabling `"ascii".parse::<LowercaseMode>()`.
	    ///
	    /// This is a thin wrapper around the `TryFrom<&str>` conversion.
	    #[inline]
	    fn from_str(s: &str) -> Result<Self, Self::Err> {
	        Self::try_from(s)
	    }
	}

	/// Returns an iterator that yields a copy of the bytes in the given slice with
	/// all uppercase letters replaced with their lowercase counterparts.
	///
	/// This function treats the given slice as a [conventionally UTF-8 string].
	/// UTF-8 byte sequences are converted to their Unicode lowercase equivalents.
	/// Invalid UTF-8 byte sequences are yielded as is.
	///
	/// The case mapping mode is determined by the given [`LowercaseMode`]. See its
	/// documentation for details on the available case mapping modes.
	/// [`LowercaseMode::Lithuanian`] currently uses the same iterator as
	/// [`LowercaseMode::Full`].
	///
	/// # Panics
	///
	/// Not all [`LowercaseMode`]s are currently implemented. This function will
	/// panic if the caller supplies [Turkic] or [case folding] lowercasing mode.
	///
	/// [conventionally UTF-8 string]: https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings
	/// [Turkic]: LowercaseMode::Turkic
	/// [case folding]: LowercaseMode::Fold
	// TODO: make this const once we're no longer panicking.
	pub fn lowercase(slice: &[u8], options: LowercaseMode) -> Lowercase<'_> {
	    match options {
	        // The or-pattern separator is a plain `|`; a backslash-escaped pipe is
	        // not valid Rust.
	        LowercaseMode::Full | LowercaseMode::Lithuanian => Lowercase::with_slice(slice),
	        LowercaseMode::Ascii => Lowercase::with_ascii_slice(slice),
	        // TODO: implement `turkic` and `fold` modes.
	        LowercaseMode::Turkic => panic!("lowercase Turkic mode is not yet implemented"),
	        LowercaseMode::Fold => panic!("lowercase case folding mode is not yet implemented"),
	    }
	}