ascii_domain

Domains whose labels are only ASCII.
git clone https://git.philomathiclife.com/repos/ascii_domain
Log | Files | Refs | README

commit 4fa6b9277beeff563a195796b4d0a6f016234b36
parent ab96e3ccfcf8a47def57d892f73603e96639a6c2
Author: Zack Newman <zack@philomathiclife.com>
Date:   Sun,  4 Feb 2024 15:20:38 -0700

more impls. remove contains_trailing_dot field

Diffstat:
M Cargo.toml | 2 +-
M src/dom.rs | 272 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
2 files changed, 225 insertions(+), 49 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -9,7 +9,7 @@ license = "MIT OR Apache-2.0" name = "ascii_domain" readme = "README.md" repository = "https://git.philomathiclife.com/repos/ascii_domain/" -version = "0.1.0" +version = "0.2.0" [lib] name = "ascii_domain" diff --git a/src/dom.rs b/src/dom.rs @@ -8,9 +8,9 @@ use core::{ iter::FusedIterator, num::NonZeroU8, ops::Deref, - str::{self, FromStr}, + str, }; -use std::{error::Error, net::Ipv4Addr}; +use std::error::Error; /// Returned by [`Domain::cmp_by_domain_ordering`]. It is more informative than [`Ordering`] in that it /// distinguishes between a `Domain` that is greater than another `Domain` due to a [`Label`] being greater /// from a `Domain` that has the same `Label`s as another but simply more. @@ -60,7 +60,6 @@ impl From<DomainOrdering> for Ordering { /// The reason `b'\\'`, `b']'`, `b'^'`, `b'_'`, and `` b'`' `` need to be tracked in addition to letters /// is to ensure uppercase letters are considered greater since lowercase letters are. As the documentation /// of `Domain` states, "uppercase letters are treated as lowercase". -#[allow(clippy::exhaustive_enums)] #[repr(u8)] #[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)] enum CharFlag { @@ -146,18 +145,118 @@ impl Display for CharFlag { /// contributes one byte due to each label being preceded by the octet that represents its length. #[derive(Clone, Debug)] pub struct Domain<'a, const ALLOWED_COUNT: usize, T> { - /// The domain value. `value.as_ref().len()` is guaranteed to have length between 1 and 254. + /// The domain value. `value.as_ref().len()` is guaranteed to have length between 1 and 253 when the last `u8` + /// is not `b'.'`; otherwise the length is between 2 and 254. /// Guaranteed to only contain `b'.'` and the ASCII `u8`s in `allowed_ascii`. value: T, /// The allowed ASCII `u8`s a `Label` can have. allowed_ascii: &'a AllowedAscii<ALLOWED_COUNT>, /// The lengths of each label. 
Guaranteed to have length between 1 and 127 with each value being /// between 1 and 63. + /// The sum of each value plus the length is guaranteed to be 1 greater than `value.as_ref().len()` when + /// the last `u8` in `value` is not `b'.'`; otherwise it will be the same. label_lens: Vec<NonZeroU8>, /// Flag that contains information about the kind of ASCII `u8`s in `value`. flag: CharFlag, - /// Indicates if the domain has a trailing `b'.'`. - contains_trailing_dot: bool, +} +impl<'a, 'b, const ALLOWED_COUNT: usize> Domain<'a, ALLOWED_COUNT, &'b [u8]> { + /// Same as [`Self::as_bytes`] except the lifetime is tied to the slice and not `self`. + /// + /// # Example + /// + /// ``` + /// use ascii_domain::{dom::Domain, char_set::ASCII_LOWERCASE}; + /// assert!(Domain::try_from_bytes(b"example.com.".as_slice(), &ASCII_LOWERCASE).unwrap().domain_without_trailing_dot() == b"example.com"); + /// ``` + #[allow(clippy::as_conversions, clippy::indexing_slicing)] + #[inline] + #[must_use] + pub fn domain_without_trailing_dot(&self) -> &'b [u8] { + // `self.len().get() as usize` is fine since it's a positive `u8`. + // Indexing won't `panic` since `self.len()` is at most as long as `self.value`. + &self.value[..self.len().get() as usize] + } +} +impl<'a, 'b, const ALLOWED_COUNT: usize> Domain<'a, ALLOWED_COUNT, &'b str> { + /// Same as [`Self::as_str`] except the lifetime is tied to the `str` and not `self`. + /// + /// # Example + /// + /// ``` + /// use ascii_domain::{dom::Domain, char_set::ASCII_LOWERCASE}; + /// assert!(Domain::try_from_bytes("example.com.", &ASCII_LOWERCASE).unwrap().domain_without_trailing_dot() == "example.com"); + /// ``` + #[allow(unsafe_code, clippy::as_conversions, clippy::indexing_slicing)] + #[inline] + #[must_use] + pub fn domain_without_trailing_dot(&self) -> &'b str { + // `self.len().get() as usize` is fine since it's a positive `u8`. + // Indexing won't `panic` since `self.len()` is at most as long as `self.value`. 
+ let utf8 = &self.value.as_bytes()[..self.len().get() as usize]; + // SAFETY: + // Only ASCII is allowed, so this is fine. + unsafe { str::from_utf8_unchecked(utf8) } + } +} +impl<'a: 'b, 'b, const ALLOWED_COUNT: usize> From<Domain<'a, ALLOWED_COUNT, Vec<u8>>> + for Domain<'b, ALLOWED_COUNT, String> +{ + #[allow(unsafe_code)] + #[inline] + fn from(value: Domain<'a, ALLOWED_COUNT, Vec<u8>>) -> Self { + // SAFETY: + // We only allow ASCII, so this is fine. + let val = unsafe { String::from_utf8_unchecked(value.value) }; + Self { + value: val, + allowed_ascii: value.allowed_ascii, + label_lens: value.label_lens, + flag: value.flag, + } + } +} +impl<'a: 'b, 'b, const ALLOWED_COUNT: usize> From<Domain<'a, ALLOWED_COUNT, String>> + for Domain<'b, ALLOWED_COUNT, Vec<u8>> +{ + #[inline] + fn from(value: Domain<'a, ALLOWED_COUNT, String>) -> Self { + Self { + value: value.value.into_bytes(), + allowed_ascii: value.allowed_ascii, + label_lens: value.label_lens, + flag: value.flag, + } + } +} +impl<'a: 'b, 'b, 'c: 'd, 'd, const ALLOWED_COUNT: usize> From<Domain<'a, ALLOWED_COUNT, &'c [u8]>> + for Domain<'b, ALLOWED_COUNT, &'d str> +{ + #[allow(unsafe_code)] + #[inline] + fn from(value: Domain<'a, ALLOWED_COUNT, &'c [u8]>) -> Self { + // SAFETY: + // We only allow ASCII, so this is fine. + let val = unsafe { str::from_utf8_unchecked(value.value) }; + Self { + value: val, + allowed_ascii: value.allowed_ascii, + label_lens: value.label_lens, + flag: value.flag, + } + } +} +impl<'a: 'b, 'b, 'c: 'd, 'd, const ALLOWED_COUNT: usize> From<Domain<'a, ALLOWED_COUNT, &'c str>> + for Domain<'b, ALLOWED_COUNT, &'d [u8]> +{ + #[inline] + fn from(value: Domain<'a, ALLOWED_COUNT, &'c str>) -> Self { + Self { + value: value.value.as_bytes(), + allowed_ascii: value.allowed_ascii, + label_lens: value.label_lens, + flag: value.flag, + } + } } impl<'a, const ALLOWED_COUNT: usize, T> Domain<'a, ALLOWED_COUNT, T> { /// The maximum length of a `Domain` which is 253. 
@@ -222,7 +321,9 @@ impl<'a, const ALLOWED_COUNT: usize, T> Domain<'a, ALLOWED_COUNT, T> { pub const fn allowed_ascii(&self) -> &'a AllowedAscii<ALLOWED_COUNT> { self.allowed_ascii } - /// Returns `true` iff the domain contained a trailing `b'.'`. +} +impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T> { + /// Returns `true` iff the domain contains a trailing `b'.'`. /// /// # Example /// @@ -230,12 +331,13 @@ impl<'a, const ALLOWED_COUNT: usize, T> Domain<'a, ALLOWED_COUNT, T> { /// use ascii_domain::{dom::Domain, char_set::ASCII_LOWERCASE}; /// assert!(Domain::try_from_bytes("example.com.", &ASCII_LOWERCASE).unwrap().contains_trailing_dot()); /// ``` + #[allow(clippy::arithmetic_side_effects, clippy::indexing_slicing)] #[inline] - pub const fn contains_trailing_dot(&self) -> bool { - self.contains_trailing_dot + pub fn contains_trailing_dot(&self) -> bool { + let bytes = self.value.as_ref(); + // This won't underflow or `panic` since `Domain`s are not empty. + bytes[bytes.len() - 1] == b'.' } -} -impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T> { /// The domain without a trailing `b'.'` if there was one. /// /// # Example @@ -290,7 +392,7 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T // `true as usize` is guaranteed to be 1 and `false as usize` is guaranteed to be 0. // No fear of truncation either since the length is guaranteed to be less than 255. // `Domain` is immutable ensuring such invariants are kept. - let len = (self.value.as_ref().len() - self.contains_trailing_dot as usize) as u8; + let len = (self.value.as_ref().len() - self.contains_trailing_dot() as usize) as u8; // SAFETY: // The only way to construct a `Domain` is via `try_from_bytes` which ensures `len` is // is at least 1. 
@@ -327,7 +429,7 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T allowed_ascii: &'b AllowedAscii<ALLOWED_COUNT>, ) -> Result<Self, DomainErr> { let val = v.as_ref(); - let (value, contains_trailing_dot) = match val.last() { + let value = match val.last() { None => return Err(DomainErr::Empty), Some(byt) => { if *byt == b'.' { @@ -337,9 +439,9 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T } // We know `val.len` is at least 1; otherwise `last` would have returned `None`. // Therefore this won't underflow and indexing won't `panic`. - (&val[..val.len() - 1], true) + &val[..val.len() - 1] } else { - (val, false) + val } } }; @@ -402,7 +504,6 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T 7 => CharFlag::All, _ => unreachable!("there is a bug in Domain::try_from_bytes"), }, - contains_trailing_dot, } }) }) @@ -460,15 +561,20 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T Err(()) } } - let f = if self.flag.eq_ignore_case(right.flag) { - eq_ignore + // Faster to check the values as bytes and not iterate each `Label`. 
+ if self == right { + true } else { - eq - }; - self.into_iter() - .zip(right) - .try_fold((), |(), (label, label2)| f(label.value, label2.value)) - .map_or(false, |()| true) + let f = if self.flag.eq_ignore_case(right.flag) { + eq_ignore + } else { + eq + }; + self.into_iter() + .zip(right) + .try_fold((), |(), (label, label2)| f(label.value, label2.value)) + .map_or(false, |()| true) + } } /// Same as [`Self::cmp_doms`] except returns [`DomainOrdering::Longer`] iff `self > right` due solely /// to having more [`Label`]s and [`DomainOrdering::Shorter`] iff `self < right` due solely to having @@ -501,7 +607,7 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T &self, right: &Domain<ALLOWED_COUNT2, T2>, ) -> DomainOrdering { - // Faster to compare the entire value when we can instead of each label. + // Faster to compare the entire value when we can. if self == right { return DomainOrdering::Equal; } @@ -509,6 +615,11 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T let right_input; let left_dom; let right_dom; + // We try to avoid needless converting to lowercase or uppercase. + // Note that `CharFlag` does not need to be perfect. It only needs to be correct "enough" for comparisons. + // For example, if we convert a `Domain` that contains uppercase letters to lowercase; and we compare + // said `Domain` to one that only contains lowercase letters, `CharFlag::Lower`, `CharFlag::Between`, + // and `CharFlag::LowerBetween` all will lead to a correct comparison. 
let (left_ref, right_ref) = match (self.flag, right.flag) { (CharFlag::None, _) | (_, CharFlag::None) @@ -521,15 +632,13 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T value: self.value.as_ref(), allowed_ascii: self.allowed_ascii, label_lens: self.label_lens.clone(), - flag: CharFlag::LowerBetween, - contains_trailing_dot: self.contains_trailing_dot, + flag: self.flag, }; right_dom = Domain { value: right.value.as_ref(), allowed_ascii: right.allowed_ascii, label_lens: right.label_lens.clone(), - flag: CharFlag::LowerBetween, - contains_trailing_dot: right.contains_trailing_dot, + flag: right.flag, }; (&left_dom, &right_dom) } @@ -538,8 +647,7 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T value: self.value.as_ref(), allowed_ascii: self.allowed_ascii, label_lens: self.label_lens.clone(), - flag: CharFlag::LowerBetween, - contains_trailing_dot: self.contains_trailing_dot, + flag: self.flag, }; right_input = right.value.as_ref().to_ascii_lowercase(); right_dom = Domain { @@ -547,7 +655,6 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T allowed_ascii: right.allowed_ascii, label_lens: right.label_lens.clone(), flag: CharFlag::LowerBetween, - contains_trailing_dot: right.contains_trailing_dot, }; (&left_dom, &right_dom) } @@ -556,8 +663,7 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T value: self.value.as_ref(), allowed_ascii: self.allowed_ascii, label_lens: self.label_lens.clone(), - flag: CharFlag::LowerBetween, - contains_trailing_dot: self.contains_trailing_dot, + flag: self.flag, }; right_input = right.value.as_ref().to_ascii_uppercase(); right_dom = Domain { @@ -565,7 +671,6 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T allowed_ascii: right.allowed_ascii, label_lens: right.label_lens.clone(), flag: CharFlag::Upper, - contains_trailing_dot: right.contains_trailing_dot, }; (&left_dom, 
&right_dom) } @@ -576,14 +681,12 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T allowed_ascii: self.allowed_ascii, label_lens: self.label_lens.clone(), flag: CharFlag::LowerBetween, - contains_trailing_dot: self.contains_trailing_dot, }; right_dom = Domain { value: right.value.as_ref(), allowed_ascii: right.allowed_ascii, label_lens: right.label_lens.clone(), - flag: CharFlag::LowerBetween, - contains_trailing_dot: right.contains_trailing_dot, + flag: right.flag, }; (&left_dom, &right_dom) } @@ -594,14 +697,12 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T allowed_ascii: self.allowed_ascii, label_lens: self.label_lens.clone(), flag: CharFlag::Upper, - contains_trailing_dot: self.contains_trailing_dot, }; right_dom = Domain { value: right.value.as_ref(), allowed_ascii: right.allowed_ascii, label_lens: right.label_lens.clone(), - flag: CharFlag::LowerBetween, - contains_trailing_dot: right.contains_trailing_dot, + flag: right.flag, }; (&left_dom, &right_dom) } @@ -612,7 +713,6 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T allowed_ascii: self.allowed_ascii, label_lens: self.label_lens.clone(), flag: CharFlag::LowerBetween, - contains_trailing_dot: self.contains_trailing_dot, }; right_input = right.value.as_ref().to_ascii_lowercase(); right_dom = Domain { @@ -620,7 +720,6 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T allowed_ascii: right.allowed_ascii, label_lens: right.label_lens.clone(), flag: CharFlag::LowerBetween, - contains_trailing_dot: right.contains_trailing_dot, }; (&left_dom, &right_dom) } @@ -629,6 +728,9 @@ impl<'a, const ALLOWED_COUNT: usize, T: AsRef<[u8]>> Domain<'a, ALLOWED_COUNT, T .into_iter() .zip(right_ref) .try_fold((), |(), (label, label2)| { + // We don't want to use `Label::cmp` since that always converts both `Label`s to + // lowercase since it is unaware of `CharFlag`s. 
This is fine due to above where + // we did the conversion already. match label.value.cmp(label2.value) { Ordering::Less => Err(DomainOrdering::Less), Ordering::Equal => Ok(()), @@ -1173,8 +1275,9 @@ impl<const ALLOWED_COUNT: usize, T: AsRef<[u8]>> DoubleEndedIterator self.idx_back += 1; // `len` is always below 64 so adding 1 won't cause overflow. // `start_back` is initialized to 0 and is only increased here. - // This means this will only ever be as large as the total length - // of the domain plus 1 which is less than 255. + // This means this will only ever be as large as the domain plus 1 + // when there is no trailing `b'.'` or as large as the domain when there is + // a trailing `b'.'` both of which are less than 255. self.start_back += len.get() + 1; } label @@ -1317,7 +1420,7 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<'_, T> { let tld = self.dom.tld(); (2..4).contains(&tld.len()) && tld.is_alphabetic() } - /// Returns `true` iff the domain has the same format as an [`Ipv4Addr`]. + /// Returns `true` iff the domain has the same format as an IPv4 address. /// /// Note that due to the most relaxed interpretation of RFC 1123 mentioned in [`Rfc1123Domain`], it is possible /// for the domain to be an IPv4 address unlike the strictest, strict, literal, and possibly relaxed @@ -1328,9 +1431,47 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<'_, T> { /// use ascii_domain::{dom::{Domain, Rfc1123Domain}, char_set::ASCII_HYPHEN_DIGITS_LETTERS}; /// assert!(Rfc1123Domain::try_from(Domain::try_from_bytes("1.2.3.4", &ASCII_HYPHEN_DIGITS_LETTERS).unwrap()).unwrap().is_ipv4()); /// ``` + #[allow( + clippy::arithmetic_side_effects, + clippy::as_conversions, + clippy::cast_lossless, + clippy::into_iter_on_ref + )] #[inline] pub fn is_ipv4(&self) -> bool { - Ipv4Addr::from_str(&self.dom).is_ok() + // Faster to check metadata first to hopefully avoid re-parsing the domain as an IPv4 address. 
+ self.dom.flag == CharFlag::None + && self.as_bytes().len() < 16 + && self.label_count().get() == 4 + // We don't use `std::net::Ipv4Addr::from_str` since that does not consider octets with leading + // 0s as valid. This means something like `0.0.0.01` is not considered an IPv4 address, but we + // want to consider that as an IP. + && self + .dom + .into_iter() + .try_fold((), |(), label| { + if label.len() < 4 { + label + .as_bytes() + .into_iter() + .try_fold(0u16, |val, byt| { + if byt.is_ascii_digit() { + // We already verified the length is at most 3, and we only perform + // this arithmetic on integers between 0 and 9. This means the max value + // of these operations is 999 which is smaller than `u16::MAX`. We verified + // `byt` is an ASCII digit so we know `byt - b'0'` will be inclusively between + // 0 and 9. So no overflow, underflow, or truncation will occur. + Ok(val * 10 + (byt - b'0') as u16) + } else { + Err(()) + } + }) + .and_then(|int| u8::try_from(int).map_or(Err(()), |_| Ok(()))) + } else { + Err(()) + } + }) + .is_ok() } } impl<'a, 'b, T: AsRef<[u8]>, T2: AsRef<[u8]>> PartialEq<Rfc1123Domain<'a, T>> @@ -1462,7 +1603,7 @@ impl<'a, 'b: 'a, T: AsRef<[u8]>> TryFrom<Domain<'b, 63, T>> for Rfc1123Domain<'a fn try_from(value: Domain<'b, 63, T>) -> Result<Self, Self::Error> { if *value.allowed_ascii != ASCII_HYPHEN_DIGITS_LETTERS { Err(Rfc1123Err::InvalidAllowedAscii) - } else if value.contains_trailing_dot { + } else if value.contains_trailing_dot() { Err(Rfc1123Err::ContainsTrailingDot) } else { value @@ -1869,6 +2010,41 @@ mod tests { .map_or(false, |dom| Rfc1123Domain::try_from(dom) .map_or(false, |rfc| rfc.is_ipv4())) ); + assert!( + Domain::try_from_bytes("001.001.001.001", &ASCII_HYPHEN_DIGITS_LETTERS) + .map_or(false, |dom| Rfc1123Domain::try_from(dom) + .map_or(false, |rfc| rfc.is_ipv4())) + ); + assert!( + Domain::try_from_bytes("256.0.0.0", &ASCII_HYPHEN_DIGITS_LETTERS) + .map_or(false, |dom| Rfc1123Domain::try_from(dom) + .map_or(false, 
|rfc| !rfc.is_ipv4())) + ); + assert!( + Domain::try_from_bytes("0.0.0.0", &ASCII_HYPHEN_DIGITS_LETTERS) + .map_or(false, |dom| Rfc1123Domain::try_from(dom) + .map_or(false, |rfc| rfc.is_ipv4())) + ); + assert!( + Domain::try_from_bytes("255.255.255.255", &ASCII_HYPHEN_DIGITS_LETTERS) + .map_or(false, |dom| Rfc1123Domain::try_from(dom) + .map_or(false, |rfc| rfc.is_ipv4())) + ); + assert!( + Domain::try_from_bytes("255.255.255.256", &ASCII_HYPHEN_DIGITS_LETTERS) + .map_or(false, |dom| Rfc1123Domain::try_from(dom) + .map_or(false, |rfc| !rfc.is_ipv4())) + ); + assert!( + Domain::try_from_bytes("0.0.0.256", &ASCII_HYPHEN_DIGITS_LETTERS) + .map_or(false, |dom| Rfc1123Domain::try_from(dom) + .map_or(false, |rfc| !rfc.is_ipv4())) + ); + assert!( + Domain::try_from_bytes("1.1.1.0001", &ASCII_HYPHEN_DIGITS_LETTERS) + .map_or(false, |dom| Rfc1123Domain::try_from(dom) + .map_or(false, |rfc| !rfc.is_ipv4())) + ); } #[test] fn test_tld() {