commit fdd04912f5008d319ca595580ac4805a20c9f945
parent 451d988ac338a68d3448a3762c43471a2666da3e
Author: Zack Newman <zack@philomathiclife.com>
Date: Tue, 13 Feb 2024 19:50:37 -0700
change domains such that tlds are alphabetic or a-labels
Diffstat:
4 files changed, 49 insertions(+), 99 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
@@ -9,7 +9,7 @@ license = "MIT OR Apache-2.0"
name = "rpz"
readme = "README.md"
repository = "https://git.philomathiclife.com/repos/rpz/"
-version = "0.5.0"
+version = "0.6.0"
[lib]
name = "rpz"
@@ -20,7 +20,7 @@ name = "rpz"
path = "src/main.rs"
[dependencies]
-ascii_domain = { version = "0.5.0", default-features = false }
+ascii_domain = { version = "0.6.0", default-features = false }
num-bigint = { version = "0.4.4", default-features = false }
reqwest = { version = "0.11.24", default-features = false, features = ["brotli", "deflate", "gzip", "rustls-tls-native-roots", "trust-dns"] }
serde = { version = "1.0.196", default-features = false }
diff --git a/README.md b/README.md
@@ -73,7 +73,7 @@ with the requirement that the rule conforms to the following extended regex:
where `<domain>` conforms to a valid [`Domain`](https://docs.rs/ascii_domain/latest/ascii_domain/dom/struct.Domain.html) based on
[`ASCII_FIREFOX`](https://docs.rs/ascii_domain/latest/ascii_domain/char_set/constant.ASCII_FIREFOX.html) with the added requirements
-that it does not have the form of an IPv4 address and does not contain `$`, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace).
+that the TLD is either all letters or at least length five and begins with `xn--`, and that the domain does not contain `$`; `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace).
Lines that begin with `||` cause all subdomains to be blocked (i.e., the domain itself and all proper subdomains); without
`||`, only the specific domain is blocked.
@@ -90,7 +90,7 @@ with the requirement that the rule conforms to the following regex:
`^<ws>*<domain><ws>*(#.*)?$`
-where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX` but does not have the form of an IPv4 address, and `<ws>` is any sequence of ASCII whitespace.
+where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX`, the TLD is either all letters or at least length five and begins with `xn--`, and `<ws>` is any sequence of ASCII whitespace.
Domains only represent themselves (i.e., proper subdomains will not be blocked).
@@ -101,7 +101,7 @@ with the requirement that the rule conforms to the following extended regex:
`^<ws>*<ip><ws>+<domain><ws>*(#.*)?$`
-where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX` but does not have the form of an IPv4 address, `<ws>` is any sequence of ASCII whitespace, and `<ip>` is one of the following:
+where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX`, the TLD is either all letters or at least length five and begins with `xn--`, `<ws>` is any sequence of ASCII whitespace, and `<ip>` is one of the following:
`::`, `::1`, `0.0.0.0`, or `127.0.0.1`.
@@ -114,7 +114,7 @@ with the requirement that the rule conforms to the following extended regex:
`^<ws>*(\*\.)?<domain><ws>*(#.*)?$`
-where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX` but does not have the form of an IPv4 address, and `<ws>` is any sequence of ASCII whitespace.
+where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX`, the TLD is either all letters or at least length five and begins with `xn--`, and `<ws>` is any sequence of ASCII whitespace.
If `domain` begins with `*.`, then `domain` must have length less than 252 and all proper subdomains are blocked—this
does _not_ include the domain itself; otherwise, only the `domain` is blocked.
diff --git a/src/dom.rs b/src/dom.rs
@@ -27,8 +27,8 @@ use zfc::{BoundedCardinality, Cardinality, Set};
pub enum FirefoxDomainErr {
/// The domain is invalid based on [`Domain`] using [`ASCII_FIREFOX`].
InvalidDomain(DomainErr),
- /// The domain was an IPv4 address.
- Ipv4,
+ /// The domain had a TLD that was neither all letters nor of length at least five beginning with `b"xn--"`.
+ InvalidTld,
/// The string passed to [`Adblock::parse_value`] contained `$`.
InvalidAdblockDomain,
/// The string passed to [`Hosts::parse_value`] did not conform
@@ -44,7 +44,7 @@ impl Display for FirefoxDomainErr {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
match *self {
Self::InvalidDomain(err) => err.fmt(f),
- Self::Ipv4 => f.write_str("domain was an IPv4 address"),
+ Self::InvalidTld => f.write_str("domain had a TLD that was not all letters nor at least five characters long starting with 'xn--'"),
Self::InvalidAdblockDomain => f.write_str("Adblock-style domain contained a '$'"),
Self::InvalidHostsIP => f.write_str("hosts-style domain does not begin with the IP '::', '::1', '0.0.0.0', or '127.0.0.1' followed by at least one space or tab"),
Self::InvalidWildcardDomain => f.write_str("non-wildcard portion of a wildcard domain had length of at least 252 which means there are 0 proper subdomains"),
@@ -54,38 +54,18 @@ impl Display for FirefoxDomainErr {
impl error::Error for FirefoxDomainErr {}
/// The ASCII we allow domains to have.
const CHARS: &AllowedAscii<[u8; 78]> = &ASCII_FIREFOX;
-/// Parses a `[u8]` into a `Domain` using `CHARS` with the added restriction that the `Domain` does not
-/// have the format of an IPv4 address.
-#[allow(clippy::arithmetic_side_effects, clippy::into_iter_on_ref)]
+/// Parses a `[u8]` into a `Domain` using `CHARS` with the added restriction that the `Domain` has a TLD
+/// that is either all letters or has length of at least five and begins with `b"xn--"`.
#[inline]
-fn domain_no_ip<'a: 'b, 'b>(val: &'a [u8]) -> Result<Domain<&'b str>, FirefoxDomainErr> {
+fn domain_icann_tld<'a: 'b, 'b>(val: &'a [u8]) -> Result<Domain<&'b str>, FirefoxDomainErr> {
Domain::try_from_bytes(val, CHARS)
.map_err(FirefoxDomainErr::InvalidDomain)
.and_then(|dom| {
- // We don't use `std::net::Ipv4Addr::from_str` since that does not consider octets with leading
- // 0s as valid. This means something like `0.0.0.01` is not considered an IPv4 address, but we
- // want to consider that as an IP.
- if (7..=15).contains(&dom.len().get())
- && dom
- .into_iter()
- .try_fold(0u8, |count, label| {
- // If there are more than 4 `Label`s, it's not an IPv4 address. Similarly if there is more
- // than 3 characters in the the `Label`, then it's not a valid IPv4 octet.
- if count < 4
- && label.len().get() < 4
- && label.as_str().parse::<u8>().is_ok()
- {
- Ok(count + 1)
- } else {
- Err(())
- }
- })
- // There must be exactly 4 `Label`s.
- .map_or(false, |count| count == 4)
- {
- Err(FirefoxDomainErr::Ipv4)
- } else {
+ let tld = dom.tld();
+ if tld.is_alphabetic() || (tld.len().get() > 4 && tld.as_bytes()[..4] == *b"xn--") {
Ok(dom.into())
+ } else {
+ Err(FirefoxDomainErr::InvalidTld)
}
})
}
@@ -269,7 +249,7 @@ pub trait ParsedDomain<'a>: Sized {
/// `^<ws>*(\|\|)?<ws>*<domain><ws>*\^?<ws>*$`
///
/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`] with the added requirement that it
-/// does not contain `$`, is not of the form of an IPv4 address, and `<ws>` is any sequence of
+/// does not contain `$`, the TLD is either all letters or at least length five and begins with `xn--`, and `<ws>` is any sequence of
/// [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace).
///
/// Comments are any lines that start with `!` or `#` (ignoring whitespace). Any in-line comments after a valid
@@ -701,7 +681,7 @@ impl<'a> ParsedDomain<'a> for Adblock<'a> {
}
})
.and_then(|()| {
- domain_no_ip(val2).map(|domain| {
+ domain_icann_tld(val2).map(|domain| {
// A domain of length 252 or 253 can't have subdomains due to there not being enough
// characters.
Value::Domain(Self {
@@ -739,8 +719,8 @@ impl<'a> ParsedDomain<'a> for Adblock<'a> {
///
/// `^<ws>*<domain><ws>*(#.*)?$`
///
-/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], is not of the form of an IPv4
-/// address, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace).
+/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], the TLD is either all letters
+/// or at least length five and begins with `xn--`, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace).
///
/// Comments are any lines that start with `#` (ignoring whitespace). Any in-line comments after a valid domain
/// are ignored and will be parsed into a [`Value::Domain`].
@@ -963,7 +943,7 @@ impl<'a> ParsedDomain<'a> for DomainOnly<'a> {
let comment = unsafe { str::from_utf8_unchecked(&value[1..]) };
Ok(Value::Comment(comment))
} else {
- domain_no_ip(
+ domain_icann_tld(
value[..value
.into_iter()
.try_fold(0, |i, byt2| if *byt2 == b'#' { Err(i) } else { Ok(i + 1) })
@@ -990,9 +970,9 @@ impl<'a> ParsedDomain<'a> for DomainOnly<'a> {
///
/// `^<ws>*<ip><ws>+<domain><ws>*(#.*)?$`
///
-/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], is not of the form of an IPv4
-/// address, `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace), and
-/// `<ip>` is one of the following:
+/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], the TLD is either all letters
+/// or at least length five and begins with `xn--`, `<ws>` is any sequence of
+/// [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace), and `<ip>` is one of the following:
///
/// `::`, `::1`, `0.0.0.0`, or `127.0.0.1`.
///
@@ -1229,7 +1209,7 @@ impl<'a> ParsedDomain<'a> for Hosts<'a> {
// There has to be at least one space or tab between the IP and domain.
Err(FirefoxDomainErr::InvalidHostsIP)
} else {
- domain_no_ip(
+ domain_icann_tld(
value[..value
.into_iter()
.try_fold(
@@ -1260,8 +1240,9 @@ impl<'a> ParsedDomain<'a> for Hosts<'a> {
///
/// `^<ws>*(\*\.)?<domain><ws>*(#.*)?$`
///
-/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], is not of the form of an IPv4
-/// address, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace).
+/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], the TLD is either all letters
+/// or at least length five and begins with `xn--`, and `<ws>` is any sequence of
+/// [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace).
///
/// If `domain` begins with `*.`, then `domain` must have length less than 252.
///
@@ -1548,7 +1529,7 @@ impl<'a> ParsedDomain<'a> for Wildcard<'a> {
}
},
);
- domain_no_ip(
+ domain_icann_tld(
val2[..val2
.into_iter()
.try_fold(0, |i, byt2| if *byt2 == b'#' { Err(i) } else { Ok(i + 1) })
@@ -2054,6 +2035,12 @@ mod tests {
// Test blank.
assert!(DomainOnly::parse_value(" \t\t \t\t \t ")
.map_or(false, |val| matches!(val, Value::Blank)));
+ // Test valid TLD that begins with `xn--`.
+ assert!(DomainOnly::parse_value("example.xn--abc")
+ .map_or(false, |val| matches!(val, Value::Domain(_))));
+ // Test invalid TLD.
+ assert!(DomainOnly::parse_value("www.c1m")
+ .map_or_else(|err| err == FirefoxDomainErr::InvalidTld, |_| false));
}
#[test]
fn test_hosts_parse_value() {
@@ -2329,20 +2316,18 @@ mod tests {
assert!(Adblock::parse_value("||a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a").map_or(false, |val| { let dom = val.unwrap_domain(); dom.domain.into_iter().count() == 127 && !dom.subdomains && dom.domain_count() == BigUint::new(vec![1]) }));
// Pre-calculated manually.
// This is the number of domains possible between 2 and 252 characters.
- // The other check is to ensure that IPv4 address subdomains are not counted.
assert!(Wildcard::parse_value("*.a").map_or(false, |val| {
- let val = val.unwrap_domain().domain_count();
- val == BigUint::new(vec![
- 375288404, 2460223985, 1334358771, 2543621408, 2519466280, 1133682239, 3589178618,
- 348125705, 1709233643, 958334503, 3780539710, 2181893897, 2457156833, 3204765645,
- 2728103430, 1817547150, 3102358416, 444185044, 3659003776, 10341713, 306326206,
- 1336386425, 3942332649, 2036577878, 2460939277, 3976861337, 2101094571, 2241770079,
- 2667853164, 3687350273, 109356153, 3455569358, 2333076459, 2433207896, 1553903141,
- 2621943843, 4223295645, 1753858368, 130924388, 965594304, 3942586845, 1573844087,
- 4237886128, 481383133, 56931017,
- ]) && Wildcard::parse_value("*.1").map_or(false, |val2| {
- val2.unwrap_domain().domain_count() == (val - BigUint::new(vec![366u32.pow(3)]))
- })
+ val.unwrap_domain().domain_count()
+ == BigUint::new(vec![
+ 375288404, 2460223985, 1334358771, 2543621408, 2519466280, 1133682239,
+ 3589178618, 348125705, 1709233643, 958334503, 3780539710, 2181893897,
+ 2457156833, 3204765645, 2728103430, 1817547150, 3102358416, 444185044,
+ 3659003776, 10341713, 306326206, 1336386425, 3942332649, 2036577878,
+ 2460939277, 3976861337, 2101094571, 2241770079, 2667853164, 3687350273,
+ 109356153, 3455569358, 2333076459, 2433207896, 1553903141, 2621943843,
+ 4223295645, 1753858368, 130924388, 965594304, 3942586845, 1573844087,
+ 4237886128, 481383133, 56931017,
+ ])
}));
}
}
diff --git a/src/dom_count_auto_gen.rs b/src/dom_count_auto_gen.rs
@@ -10,45 +10,11 @@ use num_bigint::BigUint;
clippy::unreadable_literal
)]
#[inline]
-// 10 + 90 + 156 = 256
-// 30 + 180 + 156 = 366
pub fn proper_subdomain_count(dom: &Domain<&str>) -> BigUint {
- /// Returns how many proper subdomains are IPv4 addresses. We need to calculate this so that we can
- /// subtract this value from the cached cardinalities. Note that we don't have to worry about the `Domain`
- /// itself since an IPv4 address can't be parsed into a `Domain` via `crate::dom::domain_no_ip`.
- #[allow(clippy::cast_lossless)]
- #[inline]
- fn ip_count(dom: &Domain<&str>) -> u32 {
- // `Domain`s that have 4 or more `Label`s can't be an IPv4 address. Also `Domain`s must have at least one
- // `Label`, so 0 < 4 - label count < 4 and `(10 * 3 + 90 * 2 + 156 * 1)^3 = (30 + 180 + 156)^3
- // = 366^3 <= u32::MAX`.
- dom.into_iter()
- .try_fold(0, |count, label| {
- // If this is the fourth `Label`, then it can't be an IPv4 address since it has too many
- // `Label`s.
- // Only a sequence of 1 to 3 digits whose value is between 0 and 255 is a valid octet
- // in an IPv4 address. For `Domain`s that have such `Label`s, the total number of IPv4
- // addresses is simply 366^(4- label count).
- // 366 comes from the fact that there are 3 distinct ways to represents integers < 10,
- // 2 distinct ways to represent integers inclusively between 10 and 99, and 1 way
- // to represent integers greater than 99 giving (3 * 10) + (2 * 90) + (1 * 156) = 366
- // ways a `Label` can be a valid octet for an IPv4 address.
- //
- // Note that `Label`s always have a length of at least 1 and any `Label` longer than 3
- // cannot be a valid octet in an IPv4 address.
- if count < 4 && label.len().get() < 4 && label.as_str().parse::<u8>().is_ok() {
- Ok(count + 1)
- } else {
- Err(())
- }
- })
- .map_or(0, |count| 366u32.pow(4 - count))
- }
// The commented out code at the end of the function was used to calculate the cardinalities
- // for each possible value of domain length allowing IPv4 addresses; however it takes
- // as much as 16 seconds to calculate for a give value of n, so we cache the results
- // instead.
- // The array is ordered based on the descending order of length of `Domain`s
+ // for each possible value of domain length; however it takes as much as 16 seconds to calculate
+ // for a given value of n, so we cache the results
+ // instead. The array is ordered based on the descending order of length of `Domain`s
// (e.g., index 0 corresponds to the number of subdomains for `Domain`s of length 251 which
// is the max length of a `Domain` with at least one proper subdomain).
[
@@ -1427,7 +1393,6 @@ pub fn proper_subdomain_count(dom: &Domain<&str>) -> BigUint {
]),
][251 - dom.len().get() as usize]
.clone()
- - BigUint::new(vec![ip_count(dom)])
// #![feature(int_roundings)]
// use num_bigint::BigUint;
// use std::fs::File;