ascii_domain

Domains whose labels are only ASCII.
git clone https://git.philomathiclife.com/repos/ascii_domain
Log | Files | Refs | README

commit dfa4309dccf129e3002209e5f41fd037abe3c605
parent e0a00a4da03c17329c5a87755c3d9e5b7571a343
Author: Zack Newman <zack@philomathiclife.com>
Date:   Tue, 13 Feb 2024 18:27:37 -0700

require tld to be alphabetic or an a-label in rfc1123domain

Diffstat:
M Cargo.toml | 2 +-
M src/dom.rs | 229 ++++++++++++++++++++++++++++++-------------------------------------------------
M src/serde.rs | 3 ++-
3 files changed, 90 insertions(+), 144 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -9,7 +9,7 @@ license = "MIT OR Apache-2.0" name = "ascii_domain" readme = "README.md" repository = "https://git.philomathiclife.com/repos/ascii_domain/" -version = "0.5.0" +version = "0.6.0" [lib] name = "ascii_domain" diff --git a/src/dom.rs b/src/dom.rs @@ -1005,6 +1005,9 @@ pub enum Rfc1123Err { LabelStartsWithAHyphen, /// A [`Label`] of [`Domain`] ends with an ASCII hyphen. LabelEndsWithAHyphen, + /// The last [`Label`] (i.e., TLD) was invalid which means it was not all ASCII letters nor + /// had length of at least five with the first 4 characters being `xn--`. + InvalidTld, } impl Display for Rfc1123Err { #[inline] @@ -1015,6 +1018,7 @@ impl Display for Rfc1123Err { f.write_str("a label in the domain starts with a hyphen") } Self::LabelEndsWithAHyphen => f.write_str("a label in the domain ends with a hyphen"), + Self::InvalidTld => f.write_str("the TLD in the domain was not all letters nor had length of at least five with the first 4 characters being 'xn--'") } } } @@ -1025,6 +1029,8 @@ impl Error for Rfc1123Err {} /// /// * Each [`Label`] must only contain ASCII digits, letters, or hyphen. /// * Each `Label` must not begin or end with a hyphen. +/// * The last `Label` (i.e., TLD) must either contain only ASCII letters or have length of at least five and +/// begin with `xn--`. /// --- /// Unsurprisingly, RFC 1123 is not super precise as it uses "host name" to mean label and also domain: /// "Host software MUST handle host names \[labels\] of up to 63 characters and SHOULD handle host @@ -1045,6 +1051,7 @@ impl Error for Rfc1123Err {} /// * Literal: enforce the TLD is alphabetic regardless of the lack of justification. /// * Relaxed: enforce the "spirit" that the TLD must exist. /// * More relaxed: enforce the "spirit" that the TLD must have the same format of a valid TLD. +/// * Much more relaxed: enforce the "spirit" that the domain cannot have the form of an IPv4 address. 
/// * Most relaxed: treat TLDs no differently than other labels (i.e., don't make assumptions about what will be /// a valid TLD in the future). /// @@ -1057,12 +1064,21 @@ impl Error for Rfc1123Err {} /// Assuming TLDs are static is absurd, and relying on some dynamic list of TLDs is undesirable. For that reason /// the relaxed interpretation is rejected. /// -/// To enforce that the TLD is of the correct format would require RFC 5891 semantics to ensure the TLD is -/// either a valid A-label or NR-LDH label. This is not a cheap operation, and it treats TLDs inconsistently -/// (specifically more strictly validated) than other labels. RFC 1123 should be treated as the foundation -/// until it is made obsolete. In the future there may be other RFCs that further restrict valid TLDs, and treating -/// RFC 1123 as if it were written presciently does not make sense. For that reason the more relaxed interpretation -/// is rejected. Consequently we use the most relaxed interpretation. +/// Enforcing that domains do not have the form of an IPv4 address opens up the question of what is an IPv4 +/// address? Should leading 0s be allowed? What about hexadecimal? Should there be length limits for each octet? +/// It also has the undesirable effect where subdomains that are all numeric exist but their parent domain does +/// not which goes against the hierarchical nature of DNS. For those reasons the much more relaxed interpretation +/// is rejected. +/// +/// Treating TLDs no differently than other labels is nice from a consistency perspective, but it suffers from +/// the fact that domains that have the form of an IPv4 address are now allowed. For that reason the most +/// relaxed interpretation is rejected. +/// +/// [ICANN](https://newgtlds.icann.org/sites/default/files/guidebook-full-04jun12-en.pdf) requires TLDs to either +/// be alphabetic or a valid A-label per RFC 5891. Verifying a label is a valid A-label is not a cheap operation +/// though. 
For that reason the more relaxed interpretation is accepted but with a twist: fake and valid A-labels +/// are allowed in addition to entirely alphabetic labels. More specifically the TLD must either contain only +/// letters or must be at least five characters long with the first 4 characters being `xn--`. /// /// If one wants to enforce the literal interpretation, one can use [`Self::is_literal_interpretation`]. Similarly, /// if one wants to enforce the strict interpretation, one can use [`Self::is_strict_interpretation`]. @@ -1100,7 +1116,8 @@ impl<T> Rfc1123Domain<T> { impl<T: AsRef<[u8]>> Rfc1123Domain<T> { /// Function that transforms `v` into an `Rfc1123Domain` by only allowing [`Label`]s to contain the ASCII `u8`s /// in [`ASCII_HYPHEN_DIGITS_LETTERS`] with each `Label` not starting or ending with a `b'-'`. A trailing `b'.'` - /// is ignored. + /// is ignored. The last `Label` (i.e., TLD) must either only contain ASCII letters or must have length of at + /// least five with the first 4 bytes being `b"xn--"`. /// /// Unliked calling [`Domain::try_from_bytes`] then [`Rfc1123Domain::try_from`] which performs two traversals /// of `v`, this performs a single traversal of `v`. @@ -1110,6 +1127,7 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> { /// ``` /// use ascii_domain::dom::{Rfc1123Domain, Rfc1123Err}; /// assert!(Rfc1123Domain::try_from_bytes("example.com").is_ok()); + /// assert!(Rfc1123Domain::try_from_bytes("example.xn--abc").is_ok()); /// assert!(Rfc1123Domain::try_from_bytes("a-.com").map_or_else(|err| err == Rfc1123Err::LabelEndsWithAHyphen, |_| false)); /// ``` /// @@ -1118,6 +1136,7 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> { /// Returns [`Rfc1123Err`] iff `v.as_ref()` is an invalid `Rfc1123Domain`. 
#[allow( clippy::arithmetic_side_effects, + clippy::as_conversions, clippy::indexing_slicing, clippy::into_iter_on_ref, clippy::redundant_else @@ -1188,7 +1207,27 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> { Ok(label_len + 1) } }) - .map(|_| Self { + .and_then(|tld_len| { + // `tld_len <= value.len()`. + let tld = &value[value.len() - tld_len as usize..]; + if (tld_len > 4 && tld[..4] == *b"xn--".as_slice()) + || tld + .into_iter() + .try_fold((), |(), byt| { + if byt.is_ascii_alphabetic() { + Ok(()) + } else { + Err(()) + } + }) + .is_ok() + { + Ok(()) + } else { + Err(Rfc1123Err::InvalidTld) + } + }) + .map(|()| Self { dom: Domain { value: v }, }) } @@ -1201,7 +1240,7 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> { /// ``` /// use ascii_domain::dom::Rfc1123Domain; /// assert!(Rfc1123Domain::try_from_bytes("example.commmm").unwrap().is_literal_interpretation()); - /// assert!(!Rfc1123Domain::try_from_bytes("example.c1m").unwrap().is_literal_interpretation()); + /// assert!(!Rfc1123Domain::try_from_bytes("example.xn--abc").unwrap().is_literal_interpretation()); /// ``` #[inline] pub fn is_literal_interpretation(&self) -> bool { @@ -1222,42 +1261,6 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> { let tld = self.dom.tld(); (2..4).contains(&tld.len().get()) && tld.is_alphabetic() } - /// Returns `true` iff the domain has the same format as an IPv4 address. - /// - /// Note that due to the most relaxed interpretation of RFC 1123 mentioned in [`Rfc1123Domain`], it is possible - /// for the domain to be an IPv4 address unlike the strictest, strict, literal, and possibly relaxed - /// interpretations. - /// - /// # Example - /// - /// ``` - /// use ascii_domain::dom::Rfc1123Domain; - /// assert!(Rfc1123Domain::try_from_bytes("1.2.3.4").unwrap().is_ipv4()); - /// ``` - #[allow(clippy::arithmetic_side_effects, clippy::into_iter_on_ref)] - #[inline] - pub fn is_ipv4(&self) -> bool { - // The min length of an IPv4 address is 7 and the max length is 15. 
- (7..=15).contains(&self.dom.len().get()) - // We don't use `std::net::Ipv4Addr::from_str` since that does not consider octets with leading - // 0s as valid. This means something like `0.0.0.01` is not considered an IPv4 address, but we - // want to consider that as an IP. - && self - .dom - .into_iter() - .try_fold(0u8, |count, label| { - // If we have more than 4 `Label`s, it's not an IPv4 address. Similarly if a `Label` has - // length greater than 3, it's not a valid IPv4 address octet. - if count < 4 && label.len().get() < 4 && label.as_str().parse::<u8>().is_ok() { - // Overflow is not possible since we know `count < 4`. - Ok(count + 1) - } else { - Err(()) - } - }) - // We must have exactly 4 `Label`s. - .map_or(false, |count| count == 4) - } } impl<T: AsRef<[u8]>, T2: AsRef<[u8]>> PartialEq<Rfc1123Domain<T>> for Rfc1123Domain<T2> { #[inline] @@ -1410,28 +1413,38 @@ impl<T: AsRef<[u8]>> TryFrom<Domain<T>> for Rfc1123Domain<T> { #[allow( clippy::arithmetic_side_effects, clippy::indexing_slicing, - clippy::into_iter_on_ref + clippy::into_iter_on_ref, + clippy::unreachable )] #[inline] fn try_from(value: Domain<T>) -> Result<Self, Self::Error> { - value - .into_iter() - .try_fold((), |(), label| { - let bytes = label.value.as_bytes(); - // `Label`s are never empty, so the below indexing is fine. - // Underflow won't occur for the same reason. 
- if bytes[0] == b'-' { - Err(Rfc1123Err::LabelStartsWithAHyphen) - } else if bytes[bytes.len() - 1] == b'-' { - Err(Rfc1123Err::LabelEndsWithAHyphen) - } else { - bytes.into_iter().try_fold((), |(), byt| match *byt { - b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => Ok(()), - val => Err(Rfc1123Err::DomainErr(DomainErr::InvalidByte(val))), - }) - } - }) - .map(|()| Self { dom: value }) + let mut labels = value.into_iter(); + let tld = labels + .next() + .unwrap_or_else(|| unreachable!("there is a bug in Domain::try_from_bytes")); + if tld.is_alphabetic() + || (tld.len().get() > 4 && tld.as_bytes()[..4] == *b"xn--".as_slice()) + { + labels + .try_fold((), |(), label| { + let bytes = label.value.as_bytes(); + // `Label`s are never empty, so the below indexing is fine. + // Underflow won't occur for the same reason. + if bytes[0] == b'-' { + Err(Rfc1123Err::LabelStartsWithAHyphen) + } else if bytes[bytes.len() - 1] == b'-' { + Err(Rfc1123Err::LabelEndsWithAHyphen) + } else { + bytes.into_iter().try_fold((), |(), byt| match *byt { + b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => Ok(()), + val => Err(Rfc1123Err::DomainErr(DomainErr::InvalidByte(val))), + }) + } + }) + .map(|()| Self { dom: value }) + } else { + Err(Rfc1123Err::InvalidTld) + } } } impl<T: AsRef<[u8]>> Display for Rfc1123Domain<T> { @@ -1677,6 +1690,13 @@ mod tests { ) ); assert!( + Domain::try_from_bytes("example.c1m", &ASCII_HYPHEN_DIGITS_LETTERS).map_or( + false, + |dom| Rfc1123Domain::try_from(dom) + .map_or_else(|e| e == Rfc1123Err::InvalidTld, |_| false) + ) + ); + assert!( Domain::try_from_bytes("example.commm", &ASCII_HYPHEN_DIGITS_LETTERS).map_or( false, |dom| Rfc1123Domain::try_from(dom) @@ -1684,7 +1704,7 @@ mod tests { ) ); assert!( - Domain::try_from_bytes("example.c1m", &ASCII_HYPHEN_DIGITS_LETTERS).map_or( + Domain::try_from_bytes("example.xn--abc", &ASCII_HYPHEN_DIGITS_LETTERS).map_or( false, |dom| Rfc1123Domain::try_from(dom) .map_or(false, |rfc| !rfc.is_literal_interpretation()) @@ 
-1704,54 +1724,6 @@ mod tests { .map_or(false, |rfc| !rfc.is_strict_interpretation()) ) ); - assert!( - Domain::try_from_bytes("1.2.3.4", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| rfc.is_ipv4())) - ); - assert!( - Domain::try_from_bytes("001.001.001.001", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| rfc.is_ipv4())) - ); - assert!(Domain::try_from_bytes("1", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| !rfc.is_ipv4()))); - assert!( - Domain::try_from_bytes("1.1.1.1.1", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| !rfc.is_ipv4())) - ); - assert!( - Domain::try_from_bytes("256.0.0.0", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| !rfc.is_ipv4())) - ); - assert!( - Domain::try_from_bytes("0.0.0.0", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| rfc.is_ipv4())) - ); - assert!( - Domain::try_from_bytes("255.255.255.255", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| rfc.is_ipv4())) - ); - assert!( - Domain::try_from_bytes("255.255.255.256", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| !rfc.is_ipv4())) - ); - assert!( - Domain::try_from_bytes("0.0.0.256", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| !rfc.is_ipv4())) - ); - assert!( - Domain::try_from_bytes("1.1.1.0001", &ASCII_HYPHEN_DIGITS_LETTERS) - .map_or(false, |dom| Rfc1123Domain::try_from(dom) - .map_or(false, |rfc| !rfc.is_ipv4())) - ); } #[test] fn test_tld() { @@ -1835,34 +1807,7 @@ mod tests { .map_or(false, |d2| d == d2 && d.cmp(&d2) == Ordering::Equal) }) ); - // Test valid bytes 
- let mut input; - let mut counter = 0; - for i in 0..=127 { - input = [i]; - match i { - b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => { - counter += 1; - assert!(Rfc1123Domain::try_from_bytes(input) - .map_or(false, |d| d.value.len() == 1 && d.value == input)) - } - b'-' => { - counter += 1; - let input2 = b"a-a"; - assert!(Rfc1123Domain::try_from_bytes(input2) - .map_or(false, |d| d.len().get() == 3 && d.value == input2)) - } - b'.' => { - let input2 = b"a."; - assert!(Rfc1123Domain::try_from_bytes(input2) - .map_or(false, |d| d.len().get() == 1 && d.value == input2)) - } - _ => assert!(Rfc1123Domain::try_from_bytes(input).map_or_else( - |e| e == Rfc1123Err::DomainErr(DomainErr::InvalidByte(i)), - |_| false - )), - } - } - assert!(counter == 63); + assert!(Rfc1123Domain::try_from_bytes("1.1.1.1") + .map_or_else(|err| err == Rfc1123Err::InvalidTld, |_| false)); } } diff --git a/src/serde.rs b/src/serde.rs @@ -169,8 +169,9 @@ fn rfc_err_to_serde<E: de::Error>(value: Rfc1123Err) -> E { match value { Rfc1123Err::DomainErr(err) => dom_err_to_serde(err), Rfc1123Err::LabelStartsWithAHyphen | Rfc1123Err::LabelEndsWithAHyphen => { - E::invalid_value(Unexpected::Str("-"), &"a valid domain conforming to RFC 1123 which mean all labels don't being or end with a '-'") + E::invalid_value(Unexpected::Str("-"), &"a valid domain conforming to RFC 1123 which requires all labels to not begin or end with a '-'") } + Rfc1123Err::InvalidTld => E::invalid_value(Unexpected::Str("tld that is not all letters nor begins with 'xn--' and has length of at least five"), &"a valid domain conforming to RFC 1123 which requires the last label (i.e., TLD) to either be all letters or have length of at least five and begins with 'xn--'") } } /// Serde [`Visitor`] that deserializes a string into an [`Rfc1123Domain`].