rpz

Response policy zone (RPZ) file generator.
git clone https://git.philomathiclife.com/repos/rpz
Log | Files | Refs | README

commit fdd04912f5008d319ca595580ac4805a20c9f945
parent 451d988ac338a68d3448a3762c43471a2666da3e
Author: Zack Newman <zack@philomathiclife.com>
Date:   Tue, 13 Feb 2024 19:50:37 -0700

change domains such that tlds are alphabetic or a-labels

Diffstat:
M Cargo.toml | 4 ++--
M README.md | 8 ++++----
M src/dom.rs | 95 +++++++++++++++++++++++++++++++++----------------------------------------------
M src/dom_count_auto_gen.rs | 41 +++--------------------------------------
4 files changed, 49 insertions(+), 99 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml @@ -9,7 +9,7 @@ license = "MIT OR Apache-2.0" name = "rpz" readme = "README.md" repository = "https://git.philomathiclife.com/repos/rpz/" -version = "0.5.0" +version = "0.6.0" [lib] name = "rpz" @@ -20,7 +20,7 @@ name = "rpz" path = "src/main.rs" [dependencies] -ascii_domain = { version = "0.5.0", default-features = false } +ascii_domain = { version = "0.6.0", default-features = false } num-bigint = { version = "0.4.4", default-features = false } reqwest = { version = "0.11.24", default-features = false, features = ["brotli", "deflate", "gzip", "rustls-tls-native-roots", "trust-dns"] } serde = { version = "1.0.196", default-features = false } diff --git a/README.md b/README.md @@ -73,7 +73,7 @@ with the requirement that the rule conforms to the following extended regex: where `<domain>` conforms to a valid [`Domain`](https://docs.rs/ascii_domain/latest/ascii_domain/dom/struct.Domain.html) based on [`ASCII_FIREFOX`](https://docs.rs/ascii_domain/latest/ascii_domain/char_set/constant.ASCII_FIREFOX.html) with the added requirements -that it does not have the form of an IPv4 address and does not contain `$`, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace). +that the TLD is either all letters or at least length five and begins with `xn--` and does not contain `$`, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace). Lines that begin with `||` cause all subdomains to be blocked (i.e., the domain itself and all proper subdomains); without `||`, only the specific domain is blocked. @@ -90,7 +90,7 @@ with the requirement that the rule conforms to the following regex: `^<ws>*<domain><ws>*(#.*)?$` -where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX` but does not have the form of an IPv4 address, and `<ws>` is any sequence of ASCII whitespace. 
+where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX`, the TLD is either all letters or at least length five and begins with `xn--`, and `<ws>` is any sequence of ASCII whitespace. Domains only represent themselves (i.e., proper subdomains will not be blocked). @@ -101,7 +101,7 @@ with the requirement that the rule conforms to the following extended regex: `^<ws>*<ip><ws>+<domain><ws>*(#.*)?$` -where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX` but does not have the form of an IPv4 address, `<ws>` is any sequence of ASCII whitespace, and `<ip>` is one of the following: +where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX`, the TLD is either all letters or at least length five and begins with `xn--`, `<ws>` is any sequence of ASCII whitespace, and `<ip>` is one of the following: `::`, `::1`, `0.0.0.0`, or `127.0.0.1`. @@ -114,7 +114,7 @@ with the requirement that the rule conforms to the following extended regex: `^<ws>*(\*\.)?<domain><ws>*(#.*)?$` -where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX` but does not have the form of an IPv4 address, and `<ws>` is any sequence of ASCII whitespace. +where `<domain>` conforms to a valid `Domain` based on `ASCII_FIREFOX`, the TLD is either all letters or at least length five and begins with `xn--`, and `<ws>` is any sequence of ASCII whitespace. If `domain` begins with `*.`, then `domain` must have length less than 252 and all proper subdomains are blocked—this does _not_ include the domain itself; otherwise, only the `domain` is blocked. diff --git a/src/dom.rs b/src/dom.rs @@ -27,8 +27,8 @@ use zfc::{BoundedCardinality, Cardinality, Set}; pub enum FirefoxDomainErr { /// The domain is invalid based on [`Domain`] using [`ASCII_FIREFOX`]. InvalidDomain(DomainErr), - /// The domain was an IPv4 address. - Ipv4, + /// The domain had a TLD that was not all letters nor length of at least five beginning with `b"xn--"`. 
+ InvalidTld, /// The string passed to [`Adblock::parse_value`] contained `$`. InvalidAdblockDomain, /// The string passed to [`Hosts::parse_value`] did not conform @@ -44,7 +44,7 @@ impl Display for FirefoxDomainErr { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { match *self { Self::InvalidDomain(err) => err.fmt(f), - Self::Ipv4 => f.write_str("domain was an IPv4 address"), + Self::InvalidTld => f.write_str("domain had a TLD that was not all letters nor at least five characters long starting with 'xn--'"), Self::InvalidAdblockDomain => f.write_str("Adblock-style domain contained a '$'"), Self::InvalidHostsIP => f.write_str("hosts-style domain does not begin with the IP '::', '::1', '0.0.0.0', or '127.0.0.1' followed by at least one space or tab"), Self::InvalidWildcardDomain => f.write_str("non-wildcard portion of a wildcard domain had length of at least 252 which means there are 0 proper subdomains"), @@ -54,38 +54,18 @@ impl Display for FirefoxDomainErr { impl error::Error for FirefoxDomainErr {} /// The ASCII we allow domains to have. const CHARS: &AllowedAscii<[u8; 78]> = &ASCII_FIREFOX; -/// Parses a `[u8]` into a `Domain` using `CHARS` with the added restriction that the `Domain` does not -/// have the format of an IPv4 address. -#[allow(clippy::arithmetic_side_effects, clippy::into_iter_on_ref)] +/// Parses a `[u8]` into a `Domain` using `CHARS` with the added restriction that the `Domain` has a TLD +/// that is either all letters or has length of at least five and begins with `b"xn--"`. #[inline] -fn domain_no_ip<'a: 'b, 'b>(val: &'a [u8]) -> Result<Domain<&'b str>, FirefoxDomainErr> { +fn domain_icann_tld<'a: 'b, 'b>(val: &'a [u8]) -> Result<Domain<&'b str>, FirefoxDomainErr> { Domain::try_from_bytes(val, CHARS) .map_err(FirefoxDomainErr::InvalidDomain) .and_then(|dom| { - // We don't use `std::net::Ipv4Addr::from_str` since that does not consider octets with leading - // 0s as valid. 
This means something like `0.0.0.01` is not considered an IPv4 address, but we - // want to consider that as an IP. - if (7..=15).contains(&dom.len().get()) - && dom - .into_iter() - .try_fold(0u8, |count, label| { - // If there are more than 4 `Label`s, it's not an IPv4 address. Similarly if there is more - // than 3 characters in the the `Label`, then it's not a valid IPv4 octet. - if count < 4 - && label.len().get() < 4 - && label.as_str().parse::<u8>().is_ok() - { - Ok(count + 1) - } else { - Err(()) - } - }) - // There must be exactly 4 `Label`s. - .map_or(false, |count| count == 4) - { - Err(FirefoxDomainErr::Ipv4) - } else { + let tld = dom.tld(); + if tld.is_alphabetic() || (tld.len().get() > 4 && tld.as_bytes()[..4] == *b"xn--") { Ok(dom.into()) + } else { + Err(FirefoxDomainErr::InvalidTld) } }) } @@ -269,7 +249,7 @@ pub trait ParsedDomain<'a>: Sized { /// `^<ws>*(\|\|)?<ws>*<domain><ws>*\^?<ws>*$` /// /// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`] with the added requirement that it -/// does not contain `$`, is not of the form of an IPv4 address, and `<ws>` is any sequence of +/// does not contain `$`, the TLD is either all letters or at least length five and begins with `xn--`, and `<ws>` is any sequence of /// [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace). /// /// Comments are any lines that start with `!` or `#` (ignoring whitespace). Any in-line comments after a valid @@ -701,7 +681,7 @@ impl<'a> ParsedDomain<'a> for Adblock<'a> { } }) .and_then(|()| { - domain_no_ip(val2).map(|domain| { + domain_icann_tld(val2).map(|domain| { // A domain of length 252 or 253 can't have subdomains due to there not being enough // characters. 
Value::Domain(Self { @@ -739,8 +719,8 @@ impl<'a> ParsedDomain<'a> for Adblock<'a> { /// /// `^<ws>*<domain><ws>*(#.*)?$` /// -/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], is not of the form of an IPv4 -/// address, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace). +/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], the TLD is either all letters +/// or at least length five and begins with `xn--`, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace). /// /// Comments are any lines that start with `#` (ignoring whitespace). Any in-line comments after a valid domain /// are ignored and will be parsed into a [`Value::Domain`]. @@ -963,7 +943,7 @@ impl<'a> ParsedDomain<'a> for DomainOnly<'a> { let comment = unsafe { str::from_utf8_unchecked(&value[1..]) }; Ok(Value::Comment(comment)) } else { - domain_no_ip( + domain_icann_tld( value[..value .into_iter() .try_fold(0, |i, byt2| if *byt2 == b'#' { Err(i) } else { Ok(i + 1) }) @@ -990,9 +970,9 @@ impl<'a> ParsedDomain<'a> for DomainOnly<'a> { /// /// `^<ws>*<ip><ws>+<domain><ws>*(#.*)?$` /// -/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], is not of the form of an IPv4 -/// address, `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace), and -/// `<ip>` is one of the following: +/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], the TLD is either all letters +/// or at least length five and begins with `xn--`, `<ws>` is any sequence of +/// [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace), and `<ip>` is one of the following: /// /// `::`, `::1`, `0.0.0.0`, or `127.0.0.1`. /// @@ -1229,7 +1209,7 @@ impl<'a> ParsedDomain<'a> for Hosts<'a> { // There has to be at least one space or tab between the IP and domain. 
Err(FirefoxDomainErr::InvalidHostsIP) } else { - domain_no_ip( + domain_icann_tld( value[..value .into_iter() .try_fold( @@ -1260,8 +1240,9 @@ impl<'a> ParsedDomain<'a> for Hosts<'a> { /// /// `^<ws>*(\*\.)?<domain><ws>*(#.*)?$` /// -/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], is not of the form of an IPv4 -/// address, and `<ws>` is any sequence of [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace). +/// where `<domain>` conforms to a valid [`Domain`] based on [`ASCII_FIREFOX`], the TLD is either all letters +/// or at least length five and begins with `xn--`, and `<ws>` is any sequence of +/// [ASCII whitespace](https://infra.spec.whatwg.org/#ascii-whitespace). /// /// If `domain` begins with `*.`, then `domain` must have length less than 252. /// @@ -1548,7 +1529,7 @@ impl<'a> ParsedDomain<'a> for Wildcard<'a> { } }, ); - domain_no_ip( + domain_icann_tld( val2[..val2 .into_iter() .try_fold(0, |i, byt2| if *byt2 == b'#' { Err(i) } else { Ok(i + 1) }) @@ -2054,6 +2035,12 @@ mod tests { // Test blank. assert!(DomainOnly::parse_value(" \t\t \t\t \t ") .map_or(false, |val| matches!(val, Value::Blank))); + // Test blank. + assert!(DomainOnly::parse_value("example.xn--abc") + .map_or(false, |val| matches!(val, Value::Domain(_)))); + // Test invalid TLD. + assert!(DomainOnly::parse_value("www.c1m") + .map_or_else(|err| err == FirefoxDomainErr::InvalidTld, |_| false)); } #[test] fn test_hosts_parse_value() { @@ -2329,20 +2316,18 @@ mod tests { assert!(Adblock::parse_value("||a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a.a").map_or(false, |val| { let dom = val.unwrap_domain(); dom.domain.into_iter().count() == 127 && !dom.subdomains && dom.domain_count() == BigUint::new(vec![1]) })); // Pre-calculated manually. 
// This is the number of domains possible between 2 and 252 characters. - // The other check is to ensure that IPv4 address subdomains are not counted. assert!(Wildcard::parse_value("*.a").map_or(false, |val| { - let val = val.unwrap_domain().domain_count(); - val == BigUint::new(vec![ - 375288404, 2460223985, 1334358771, 2543621408, 2519466280, 1133682239, 3589178618, - 348125705, 1709233643, 958334503, 3780539710, 2181893897, 2457156833, 3204765645, - 2728103430, 1817547150, 3102358416, 444185044, 3659003776, 10341713, 306326206, - 1336386425, 3942332649, 2036577878, 2460939277, 3976861337, 2101094571, 2241770079, - 2667853164, 3687350273, 109356153, 3455569358, 2333076459, 2433207896, 1553903141, - 2621943843, 4223295645, 1753858368, 130924388, 965594304, 3942586845, 1573844087, - 4237886128, 481383133, 56931017, - ]) && Wildcard::parse_value("*.1").map_or(false, |val2| { - val2.unwrap_domain().domain_count() == (val - BigUint::new(vec![366u32.pow(3)])) - }) + val.unwrap_domain().domain_count() + == BigUint::new(vec![ + 375288404, 2460223985, 1334358771, 2543621408, 2519466280, 1133682239, + 3589178618, 348125705, 1709233643, 958334503, 3780539710, 2181893897, + 2457156833, 3204765645, 2728103430, 1817547150, 3102358416, 444185044, + 3659003776, 10341713, 306326206, 1336386425, 3942332649, 2036577878, + 2460939277, 3976861337, 2101094571, 2241770079, 2667853164, 3687350273, + 109356153, 3455569358, 2333076459, 2433207896, 1553903141, 2621943843, + 4223295645, 1753858368, 130924388, 965594304, 3942586845, 1573844087, + 4237886128, 481383133, 56931017, + ]) })); } } diff --git a/src/dom_count_auto_gen.rs b/src/dom_count_auto_gen.rs @@ -10,45 +10,11 @@ use num_bigint::BigUint; clippy::unreadable_literal )] #[inline] -// 10 + 90 + 156 = 256 -// 30 + 180 + 156 = 366 pub fn proper_subdomain_count(dom: &Domain<&str>) -> BigUint { - /// Returns how many proper subdomains are IPv4 addresses. 
We need to calculate this so that we can - /// subtract this value from the cached cardinalities. Note that we don't have to worry about the `Domain` - /// itself since an IPv4 address can't be parsed into a `Domain` via `crate::dom::domain_no_ip`. - #[allow(clippy::cast_lossless)] - #[inline] - fn ip_count(dom: &Domain<&str>) -> u32 { - // `Domain`s that have 4 or more `Label`s can't be an IPv4 address. Also `Domain`s must have at least one - // `Label`, so 0 < 4 - label count < 4 and `(10 * 3 + 90 * 2 + 156 * 1)^3 = (30 + 180 + 156)^3 - // = 366^3 <= u32::MAX`. - dom.into_iter() - .try_fold(0, |count, label| { - // If this is the fourth `Label`, then it can't be an IPv4 address since it has too many - // `Label`s. - // Only a sequence of 1 to 3 digits whose value is between 0 and 255 is a valid octet - // in an IPv4 address. For `Domain`s that have such `Label`s, the total number of IPv4 - // addresses is simply 366^(4- label count). - // 366 comes from the fact that there are 3 distinct ways to represents integers < 10, - // 2 distinct ways to represent integers inclusively between 10 and 99, and 1 way - // to represent integers greater than 99 giving (3 * 10) + (2 * 90) + (1 * 156) = 366 - // ways a `Label` can be a valid octet for an IPv4 address. - // - // Note that `Label`s always have a length of at least 1 and any `Label` longer than 3 - // cannot be a valid octet in an IPv4 address. - if count < 4 && label.len().get() < 4 && label.as_str().parse::<u8>().is_ok() { - Ok(count + 1) - } else { - Err(()) - } - }) - .map_or(0, |count| 366u32.pow(4 - count)) - } // The commented out code at the end of the function was used to calculate the cardinalities - // for each possible value of domain length allowing IPv4 addresses; however it takes - // as much as 16 seconds to calculate for a give value of n, so we cache the results - // instead. 
- // The array is ordered based on the descending order of length of `Domain`s + // for each possible value of domain length; however it takes as much as 16 seconds to calculate + // for a given value of n, so we cache the results + // instead. The array is ordered based on the descending order of length of `Domain`s // (e.g., index 0 corresponds to the number of subdomains for `Domain`s of length 251 which // is the max length of a `Domain` with at least one proper subdomain). [ @@ -1427,7 +1393,6 @@ pub fn proper_subdomain_count(dom: &Domain<&str>) -> BigUint { ]), ][251 - dom.len().get() as usize] .clone() - - BigUint::new(vec![ip_count(dom)]) // #![feature(int_roundings)] // use num_bigint::BigUint; // use std::fs::File;