commit dfa4309dccf129e3002209e5f41fd037abe3c605
parent e0a00a4da03c17329c5a87755c3d9e5b7571a343
Author: Zack Newman <zack@philomathiclife.com>
Date: Tue, 13 Feb 2024 18:27:37 -0700
require tld to be alphabetic or an a-label in rfc1123domain
Diffstat:
3 files changed, 90 insertions(+), 144 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,7 +9,7 @@ license = "MIT OR Apache-2.0"
name = "ascii_domain"
readme = "README.md"
repository = "https://git.philomathiclife.com/repos/ascii_domain/"
-version = "0.5.0"
+version = "0.6.0"
[lib]
name = "ascii_domain"
diff --git a/src/dom.rs b/src/dom.rs
@@ -1005,6 +1005,9 @@ pub enum Rfc1123Err {
LabelStartsWithAHyphen,
/// A [`Label`] of [`Domain`] ends with an ASCII hyphen.
LabelEndsWithAHyphen,
+ /// The last [`Label`] (i.e., TLD) is invalid: it neither consists solely of ASCII letters nor has a
+ /// length of at least five with the first four characters being `xn--`.
+ InvalidTld,
}
impl Display for Rfc1123Err {
#[inline]
@@ -1015,6 +1018,7 @@ impl Display for Rfc1123Err {
f.write_str("a label in the domain starts with a hyphen")
}
Self::LabelEndsWithAHyphen => f.write_str("a label in the domain ends with a hyphen"),
+ Self::InvalidTld => f.write_str("the TLD in the domain was not all letters nor had length of at least five with the first 4 characters being 'xn--'")
}
}
}
@@ -1025,6 +1029,8 @@ impl Error for Rfc1123Err {}
///
/// * Each [`Label`] must only contain ASCII digits, letters, or hyphen.
/// * Each `Label` must not begin or end with a hyphen.
+/// * The last `Label` (i.e., TLD) must either contain only ASCII letters or have length of at least five and
+/// begin with `xn--`.
/// ---
/// Unsurprisingly, RFC 1123 is not super precise as it uses "host name" to mean label and also domain:
/// "Host software MUST handle host names \[labels\] of up to 63 characters and SHOULD handle host
@@ -1045,6 +1051,7 @@ impl Error for Rfc1123Err {}
/// * Literal: enforce the TLD is alphabetic regardless of the lack of justification.
/// * Relaxed: enforce the "spirit" that the TLD must exist.
/// * More relaxed: enforce the "spirit" that the TLD must have the same format of a valid TLD.
+/// * Much more relaxed: enforce the "spirit" that the domain cannot have the form of an IPv4 address.
/// * Most relaxed: treat TLDs no differently than other labels (i.e., don't make assumptions about what will be
/// a valid TLD in the future).
///
@@ -1057,12 +1064,21 @@ impl Error for Rfc1123Err {}
/// Assuming TLDs are static is absurd, and relying on some dynamic list of TLDs is undesirable. For that reason
/// the relaxed interpretation is rejected.
///
-/// To enforce that the TLD is of the correct format would require RFC 5891 semantics to ensure the TLD is
-/// either a valid A-label or NR-LDH label. This is not a cheap operation, and it treats TLDs inconsistently
-/// (specifically more strictly validated) than other labels. RFC 1123 should be treated as the foundation
-/// until it is made obsolete. In the future there may be other RFCs that further restrict valid TLDs, and treating
-/// RFC 1123 as if it were written presciently does not make sense. For that reason the more relaxed interpretation
-/// is rejected. Consequently we use the most relaxed interpretation.
+/// Enforcing that domains do not have the form of an IPv4 address opens up the question of what is an IPv4
+/// address? Should leading 0s be allowed? What about hexadecimal? Should there be length limits for each octet?
+/// It also has the undesirable effect where subdomains that are all numeric exist but their parent domain does
+/// not which goes against the hierarchical nature of DNS. For those reasons the much more relaxed interpretation
+/// is rejected.
+///
+/// Treating TLDs no differently than other labels is nice from a consistency perspective, but it suffers from
+/// the fact that domains that have the form of an IPv4 address are now allowed. For that reason the most
+/// relaxed interpretation is rejected.
+///
+/// [ICANN](https://newgtlds.icann.org/sites/default/files/guidebook-full-04jun12-en.pdf) requires TLDs to either
+/// be alphabetic or a valid A-label per RFC 5891. Verifying a label is a valid A-label is not a cheap operation
+/// though. For that reason the more relaxed interpretation is accepted but with a twist: fake and valid A-labels
+/// are allowed in addition to entirely alphabetic labels. More specifically the TLD must either contain only
+/// letters or must be at least five characters long with the first 4 characters being `xn--`.
///
/// If one wants to enforce the literal interpretation, one can use [`Self::is_literal_interpretation`]. Similarly,
/// if one wants to enforce the strict interpretation, one can use [`Self::is_strict_interpretation`].
@@ -1100,7 +1116,8 @@ impl<T> Rfc1123Domain<T> {
impl<T: AsRef<[u8]>> Rfc1123Domain<T> {
/// Function that transforms `v` into an `Rfc1123Domain` by only allowing [`Label`]s to contain the ASCII `u8`s
/// in [`ASCII_HYPHEN_DIGITS_LETTERS`] with each `Label` not starting or ending with a `b'-'`. A trailing `b'.'`
- /// is ignored.
+ /// is ignored. The last `Label` (i.e., TLD) must either only contain ASCII letters or must have length of at
+ /// least five with the first 4 bytes being `b"xn--"`.
///
/// Unliked calling [`Domain::try_from_bytes`] then [`Rfc1123Domain::try_from`] which performs two traversals
/// of `v`, this performs a single traversal of `v`.
@@ -1110,6 +1127,7 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> {
/// ```
/// use ascii_domain::dom::{Rfc1123Domain, Rfc1123Err};
/// assert!(Rfc1123Domain::try_from_bytes("example.com").is_ok());
+ /// assert!(Rfc1123Domain::try_from_bytes("example.xn--abc").is_ok());
/// assert!(Rfc1123Domain::try_from_bytes("a-.com").map_or_else(|err| err == Rfc1123Err::LabelEndsWithAHyphen, |_| false));
/// ```
///
@@ -1118,6 +1136,7 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> {
/// Returns [`Rfc1123Err`] iff `v.as_ref()` is an invalid `Rfc1123Domain`.
#[allow(
clippy::arithmetic_side_effects,
+ clippy::as_conversions,
clippy::indexing_slicing,
clippy::into_iter_on_ref,
clippy::redundant_else
@@ -1188,7 +1207,27 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> {
Ok(label_len + 1)
}
})
- .map(|_| Self {
+ .and_then(|tld_len| {
+ // `tld_len <= value.len()`.
+ let tld = &value[value.len() - tld_len as usize..];
+ if (tld_len > 4 && tld[..4] == *b"xn--".as_slice())
+ || tld
+ .into_iter()
+ .try_fold((), |(), byt| {
+ if byt.is_ascii_alphabetic() {
+ Ok(())
+ } else {
+ Err(())
+ }
+ })
+ .is_ok()
+ {
+ Ok(())
+ } else {
+ Err(Rfc1123Err::InvalidTld)
+ }
+ })
+ .map(|()| Self {
dom: Domain { value: v },
})
}
@@ -1201,7 +1240,7 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> {
/// ```
/// use ascii_domain::dom::Rfc1123Domain;
/// assert!(Rfc1123Domain::try_from_bytes("example.commmm").unwrap().is_literal_interpretation());
- /// assert!(!Rfc1123Domain::try_from_bytes("example.c1m").unwrap().is_literal_interpretation());
+ /// assert!(!Rfc1123Domain::try_from_bytes("example.xn--abc").unwrap().is_literal_interpretation());
/// ```
#[inline]
pub fn is_literal_interpretation(&self) -> bool {
@@ -1222,42 +1261,6 @@ impl<T: AsRef<[u8]>> Rfc1123Domain<T> {
let tld = self.dom.tld();
(2..4).contains(&tld.len().get()) && tld.is_alphabetic()
}
- /// Returns `true` iff the domain has the same format as an IPv4 address.
- ///
- /// Note that due to the most relaxed interpretation of RFC 1123 mentioned in [`Rfc1123Domain`], it is possible
- /// for the domain to be an IPv4 address unlike the strictest, strict, literal, and possibly relaxed
- /// interpretations.
- ///
- /// # Example
- ///
- /// ```
- /// use ascii_domain::dom::Rfc1123Domain;
- /// assert!(Rfc1123Domain::try_from_bytes("1.2.3.4").unwrap().is_ipv4());
- /// ```
- #[allow(clippy::arithmetic_side_effects, clippy::into_iter_on_ref)]
- #[inline]
- pub fn is_ipv4(&self) -> bool {
- // The min length of an IPv4 address is 7 and the max length is 15.
- (7..=15).contains(&self.dom.len().get())
- // We don't use `std::net::Ipv4Addr::from_str` since that does not consider octets with leading
- // 0s as valid. This means something like `0.0.0.01` is not considered an IPv4 address, but we
- // want to consider that as an IP.
- && self
- .dom
- .into_iter()
- .try_fold(0u8, |count, label| {
- // If we have more than 4 `Label`s, it's not an IPv4 address. Similarly if a `Label` has
- // length greater than 3, it's not a valid IPv4 address octet.
- if count < 4 && label.len().get() < 4 && label.as_str().parse::<u8>().is_ok() {
- // Overflow is not possible since we know `count < 4`.
- Ok(count + 1)
- } else {
- Err(())
- }
- })
- // We must have exactly 4 `Label`s.
- .map_or(false, |count| count == 4)
- }
}
impl<T: AsRef<[u8]>, T2: AsRef<[u8]>> PartialEq<Rfc1123Domain<T>> for Rfc1123Domain<T2> {
#[inline]
@@ -1410,28 +1413,38 @@ impl<T: AsRef<[u8]>> TryFrom<Domain<T>> for Rfc1123Domain<T> {
#[allow(
clippy::arithmetic_side_effects,
clippy::indexing_slicing,
- clippy::into_iter_on_ref
+ clippy::into_iter_on_ref,
+ clippy::unreachable
)]
#[inline]
fn try_from(value: Domain<T>) -> Result<Self, Self::Error> {
- value
- .into_iter()
- .try_fold((), |(), label| {
- let bytes = label.value.as_bytes();
- // `Label`s are never empty, so the below indexing is fine.
- // Underflow won't occur for the same reason.
- if bytes[0] == b'-' {
- Err(Rfc1123Err::LabelStartsWithAHyphen)
- } else if bytes[bytes.len() - 1] == b'-' {
- Err(Rfc1123Err::LabelEndsWithAHyphen)
- } else {
- bytes.into_iter().try_fold((), |(), byt| match *byt {
- b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => Ok(()),
- val => Err(Rfc1123Err::DomainErr(DomainErr::InvalidByte(val))),
- })
- }
- })
- .map(|()| Self { dom: value })
+ let mut labels = value.into_iter();
+ let tld = labels
+ .next()
+ .unwrap_or_else(|| unreachable!("there is a bug in Domain::try_from_bytes"));
+ if !tld.is_alphabetic()
+ && !(tld.len().get() > 4 && tld.as_bytes()[..4] == *b"xn--".as_slice())
+ {
+ return Err(Rfc1123Err::InvalidTld);
+ }
+ // The TLD is validated with the other labels since an `xn--` TLD can still end with a hyphen or contain an invalid byte.
+ core::iter::once(tld)
+ .chain(labels)
+ .try_fold((), |(), label| {
+ let bytes = label.value.as_bytes();
+ // `Label`s are never empty, so the indexing below is fine and `bytes.len() - 1` won't underflow.
+ if bytes[0] == b'-' {
+ Err(Rfc1123Err::LabelStartsWithAHyphen)
+ } else if bytes[bytes.len() - 1] == b'-' {
+ Err(Rfc1123Err::LabelEndsWithAHyphen)
+ } else {
+ bytes.into_iter().try_fold((), |(), byt| match *byt {
+ b'-' | b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => Ok(()),
+ val => Err(Rfc1123Err::DomainErr(DomainErr::InvalidByte(val))),
+ })
+ }
+ })
+ .map(|()| Self { dom: value })
}
}
impl<T: AsRef<[u8]>> Display for Rfc1123Domain<T> {
@@ -1677,6 +1690,13 @@ mod tests {
)
);
assert!(
+ Domain::try_from_bytes("example.c1m", &ASCII_HYPHEN_DIGITS_LETTERS).map_or(
+ false,
+ |dom| Rfc1123Domain::try_from(dom)
+ .map_or_else(|e| e == Rfc1123Err::InvalidTld, |_| false)
+ )
+ );
+ assert!(
Domain::try_from_bytes("example.commm", &ASCII_HYPHEN_DIGITS_LETTERS).map_or(
false,
|dom| Rfc1123Domain::try_from(dom)
@@ -1684,7 +1704,7 @@ mod tests {
)
);
assert!(
- Domain::try_from_bytes("example.c1m", &ASCII_HYPHEN_DIGITS_LETTERS).map_or(
+ Domain::try_from_bytes("example.xn--abc", &ASCII_HYPHEN_DIGITS_LETTERS).map_or(
false,
|dom| Rfc1123Domain::try_from(dom)
.map_or(false, |rfc| !rfc.is_literal_interpretation())
@@ -1704,54 +1724,6 @@ mod tests {
.map_or(false, |rfc| !rfc.is_strict_interpretation())
)
);
- assert!(
- Domain::try_from_bytes("1.2.3.4", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| rfc.is_ipv4()))
- );
- assert!(
- Domain::try_from_bytes("001.001.001.001", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| rfc.is_ipv4()))
- );
- assert!(Domain::try_from_bytes("1", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| !rfc.is_ipv4())));
- assert!(
- Domain::try_from_bytes("1.1.1.1.1", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| !rfc.is_ipv4()))
- );
- assert!(
- Domain::try_from_bytes("256.0.0.0", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| !rfc.is_ipv4()))
- );
- assert!(
- Domain::try_from_bytes("0.0.0.0", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| rfc.is_ipv4()))
- );
- assert!(
- Domain::try_from_bytes("255.255.255.255", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| rfc.is_ipv4()))
- );
- assert!(
- Domain::try_from_bytes("255.255.255.256", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| !rfc.is_ipv4()))
- );
- assert!(
- Domain::try_from_bytes("0.0.0.256", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| !rfc.is_ipv4()))
- );
- assert!(
- Domain::try_from_bytes("1.1.1.0001", &ASCII_HYPHEN_DIGITS_LETTERS)
- .map_or(false, |dom| Rfc1123Domain::try_from(dom)
- .map_or(false, |rfc| !rfc.is_ipv4()))
- );
}
#[test]
fn test_tld() {
@@ -1835,34 +1807,7 @@ mod tests {
.map_or(false, |d2| d == d2 && d.cmp(&d2) == Ordering::Equal)
})
);
- // Test valid bytes
- let mut input;
- let mut counter = 0;
- for i in 0..=127 {
- input = [i];
- match i {
- b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => {
- counter += 1;
- assert!(Rfc1123Domain::try_from_bytes(input)
- .map_or(false, |d| d.value.len() == 1 && d.value == input))
- }
- b'-' => {
- counter += 1;
- let input2 = b"a-a";
- assert!(Rfc1123Domain::try_from_bytes(input2)
- .map_or(false, |d| d.len().get() == 3 && d.value == input2))
- }
- b'.' => {
- let input2 = b"a.";
- assert!(Rfc1123Domain::try_from_bytes(input2)
- .map_or(false, |d| d.len().get() == 1 && d.value == input2))
- }
- _ => assert!(Rfc1123Domain::try_from_bytes(input).map_or_else(
- |e| e == Rfc1123Err::DomainErr(DomainErr::InvalidByte(i)),
- |_| false
- )),
- }
- }
- assert!(counter == 63);
+ assert!(Rfc1123Domain::try_from_bytes("1.1.1.1")
+ .map_or_else(|err| err == Rfc1123Err::InvalidTld, |_| false));
}
}
diff --git a/src/serde.rs b/src/serde.rs
@@ -169,8 +169,9 @@ fn rfc_err_to_serde<E: de::Error>(value: Rfc1123Err) -> E {
match value {
Rfc1123Err::DomainErr(err) => dom_err_to_serde(err),
Rfc1123Err::LabelStartsWithAHyphen | Rfc1123Err::LabelEndsWithAHyphen => {
- E::invalid_value(Unexpected::Str("-"), &"a valid domain conforming to RFC 1123 which mean all labels don't being or end with a '-'")
+ E::invalid_value(Unexpected::Str("-"), &"a valid domain conforming to RFC 1123 which requires all labels to not begin or end with a '-'")
}
+ Rfc1123Err::InvalidTld => E::invalid_value(Unexpected::Str("tld that is not all letters nor begins with 'xn--' and has length of at least five"), &"a valid domain conforming to RFC 1123 which requires the last label (i.e., TLD) to either be all letters or have length of at least five and begins with 'xn--'")
}
}
/// Serde [`Visitor`] that deserializes a string into an [`Rfc1123Domain`].