commit c836f88ff2a7f94ee7427ae04b91e702a31ab52a
parent 8b660ae090179248544c9dc713a5ae2d896aad37
Author: Daniel García <dani-garcia@users.noreply.github.com>
Date: Sun, 7 Feb 2021 22:28:02 +0100
Remove soup and use a newer html5ever directly
Diffstat:
M | Cargo.lock | | | 209 | +++++++++++++++++++++++-------------------------------------------------------- |
M | Cargo.toml | | | 3 | ++- |
M | src/api/icons.rs | | | 73 | ++++++++++++++++++++++++++++++++++++++++++++++--------------------------- |
3 files changed, 108 insertions(+), 177 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
@@ -37,12 +37,6 @@ dependencies = [
[[package]]
name = "autocfg"
-version = "0.1.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2"
-
-[[package]]
-name = "autocfg"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a"
@@ -134,11 +128,13 @@ dependencies = [
"dotenv",
"fern",
"handlebars",
+ "html5ever",
"idna 0.2.1",
"jsonwebtoken",
"lettre",
"libsqlite3-sys",
"log 0.4.14",
+ "markup5ever_rcdom",
"multipart",
"newline-converter",
"num-derive",
@@ -159,7 +155,6 @@ dependencies = [
"rocket_contrib",
"serde",
"serde_json",
- "soup",
"syslog",
"time 0.2.25",
"u2f",
@@ -299,15 +294,6 @@ dependencies = [
]
[[package]]
-name = "cloudabi"
-version = "0.0.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
-dependencies = [
- "bitflags",
-]
-
-[[package]]
name = "const_fn"
version = "0.4.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -847,16 +833,16 @@ dependencies = [
[[package]]
name = "html5ever"
-version = "0.22.5"
+version = "0.25.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c213fa6a618dc1da552f54f85cba74b05d8e883c92ec4e89067736938084c26e"
+checksum = "aafcf38a1a36118242d29b92e1b08ef84e67e4a5ed06e0a80be20e6a32bfed6b"
dependencies = [
"log 0.4.14",
"mac",
"markup5ever",
- "proc-macro2 0.4.30",
- "quote 0.6.13",
- "syn 0.15.44",
+ "proc-macro2 1.0.24",
+ "quote 1.0.8",
+ "syn 1.0.60",
]
[[package]]
@@ -1005,7 +991,7 @@ version = "1.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1fa934250de4de8aef298d81c729a7d33d8c239daa3a7575e6b92bfc7313b"
dependencies = [
- "autocfg 1.0.1",
+ "autocfg",
"hashbrown",
]
@@ -1170,10 +1156,11 @@ checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
[[package]]
name = "markup5ever"
-version = "0.7.5"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "897636f9850c3eef4905a5540683ed53dc9393860f0846cab2c2ddf9939862ff"
+checksum = "aae38d669396ca9b707bfc3db254bc382ddb94f57cc5c235f34623a669a01dab"
dependencies = [
+ "log 0.4.14",
"phf",
"phf_codegen",
"serde",
@@ -1185,6 +1172,18 @@ dependencies = [
]
[[package]]
+name = "markup5ever_rcdom"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f015da43bcd8d4f144559a3423f4591d69b8ce0652c905374da7205df336ae2b"
+dependencies = [
+ "html5ever",
+ "markup5ever",
+ "tendril",
+ "xml5ever",
+]
+
+[[package]]
name = "match_cfg"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -1261,7 +1260,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0f2d26ec3309788e423cfbf68ad1800f061638098d76a83681af979dc4eda19d"
dependencies = [
"adler",
- "autocfg 1.0.1",
+ "autocfg",
]
[[package]]
@@ -1431,7 +1430,7 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "090c7f9998ee0ff65aa5b723e4009f7b217707f1fb5ea551329cc4d6231fb304"
dependencies = [
- "autocfg 1.0.1",
+ "autocfg",
"num-integer",
"num-traits",
]
@@ -1453,7 +1452,7 @@ version = "0.1.44"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d2cc698a63b549a70bc047073d2949cce27cd1c7b0a4a862d08a8031bc2801db"
dependencies = [
- "autocfg 1.0.1",
+ "autocfg",
"num-traits",
]
@@ -1463,7 +1462,7 @@ version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290"
dependencies = [
- "autocfg 1.0.1",
+ "autocfg",
]
[[package]]
@@ -1548,7 +1547,7 @@ version = "0.9.60"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "921fc71883267538946025deffb622905ecad223c28efbfdef9bb59a0175f3e6"
dependencies = [
- "autocfg 1.0.1",
+ "autocfg",
"cc",
"libc",
"openssl-src",
@@ -1735,18 +1734,18 @@ dependencies = [
[[package]]
name = "phf"
-version = "0.7.24"
+version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b3da44b85f8e8dfaec21adae67f95d93244b2ecf6ad2a692320598dcc8e6dd18"
+checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_shared",
]
[[package]]
name = "phf_codegen"
-version = "0.7.24"
+version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b03e85129e324ad4166b06b2c7491ae27fe3ec353af72e72cd1654c7225d517e"
+checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator",
"phf_shared",
@@ -1754,19 +1753,19 @@ dependencies = [
[[package]]
name = "phf_generator"
-version = "0.7.24"
+version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "09364cc93c159b8b06b1f4dd8a4398984503483891b0c26b867cf431fb132662"
+checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared",
- "rand 0.6.5",
+ "rand 0.7.3",
]
[[package]]
name = "phf_shared"
-version = "0.7.24"
+version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "234f71a15de2288bcb7e3b6515828d22af7ec8598ee6d24c3b526fa0a80b67a0"
+checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
@@ -1954,25 +1953,6 @@ dependencies = [
[[package]]
name = "rand"
-version = "0.6.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca"
-dependencies = [
- "autocfg 0.1.7",
- "libc",
- "rand_chacha 0.1.1",
- "rand_core 0.4.2",
- "rand_hc 0.1.0",
- "rand_isaac",
- "rand_jitter",
- "rand_os",
- "rand_pcg",
- "rand_xorshift",
- "winapi 0.3.9",
-]
-
-[[package]]
-name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
@@ -1982,6 +1962,7 @@ dependencies = [
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc 0.2.0",
+ "rand_pcg",
]
[[package]]
@@ -1998,16 +1979,6 @@ dependencies = [
[[package]]
name = "rand_chacha"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "556d3a1ca6600bfcbab7c7c91ccb085ac7fbbcd70e008a98742e7847f4f7bcef"
-dependencies = [
- "autocfg 0.1.7",
- "rand_core 0.3.1",
-]
-
-[[package]]
-name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
@@ -2061,15 +2032,6 @@ dependencies = [
[[package]]
name = "rand_hc"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b40677c7be09ae76218dc623efbf7b18e34bced3f38883af07bb75630a21bc4"
-dependencies = [
- "rand_core 0.3.1",
-]
-
-[[package]]
-name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
@@ -2087,56 +2049,12 @@ dependencies = [
]
[[package]]
-name = "rand_isaac"
-version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08"
-dependencies = [
- "rand_core 0.3.1",
-]
-
-[[package]]
-name = "rand_jitter"
-version = "0.1.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b"
-dependencies = [
- "libc",
- "rand_core 0.4.2",
- "winapi 0.3.9",
-]
-
-[[package]]
-name = "rand_os"
-version = "0.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071"
-dependencies = [
- "cloudabi",
- "fuchsia-cprng",
- "libc",
- "rand_core 0.4.2",
- "rdrand",
- "winapi 0.3.9",
-]
-
-[[package]]
name = "rand_pcg"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
-dependencies = [
- "autocfg 0.1.7",
- "rand_core 0.4.2",
-]
-
-[[package]]
-name = "rand_xorshift"
-version = "0.1.1"
+version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
+checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
- "rand_core 0.3.1",
+ "rand_core 0.5.1",
]
[[package]]
@@ -2564,9 +2482,9 @@ dependencies = [
[[package]]
name = "siphasher"
-version = "0.2.3"
+version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0b8de496cf83d4ed58b6be86c3a275b8602f6ffe98d3024a869e124147a9a3ac"
+checksum = "fa8f3741c7372e75519bd9346068370c9cdaabcc1f9599cbcf2a2719352286b7"
[[package]]
name = "slab"
@@ -2601,16 +2519,6 @@ dependencies = [
]
[[package]]
-name = "soup"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee42b8c117ede655c8ffe18dafcd239b23eb3bb7a2c71b1f01237587736f139f"
-dependencies = [
- "html5ever",
- "regex",
-]
-
-[[package]]
name = "spin"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2688,39 +2596,30 @@ checksum = "213701ba3370744dcd1a12960caa4843b3d68b4d1c0a5d575e0d65b2ee9d16c0"
[[package]]
name = "string_cache"
-version = "0.7.5"
+version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89c058a82f9fd69b1becf8c274f412281038877c553182f1d02eb027045a2d67"
+checksum = "8ddb1139b5353f96e429e1a5e19fbaf663bddedaa06d1dbd49f82e352601209a"
dependencies = [
"lazy_static",
"new_debug_unreachable",
"phf_shared",
"precomputed-hash",
"serde",
- "string_cache_codegen",
- "string_cache_shared",
]
[[package]]
name = "string_cache_codegen"
-version = "0.4.4"
+version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f0f45ed1b65bf9a4bf2f7b7dc59212d1926e9eaf00fa998988e420fd124467c6"
+checksum = "f24c8e5e19d22a726626f1a5e16fe15b132dcf21d10177fa5a45ce7962996b97"
dependencies = [
"phf_generator",
"phf_shared",
"proc-macro2 1.0.24",
"quote 1.0.8",
- "string_cache_shared",
]
[[package]]
-name = "string_cache_shared"
-version = "0.3.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1884d1bc09741d466d9b14e6d37ac89d6909cbcac41dd9ae982d4d063bbedfc"
-
-[[package]]
name = "subtle"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -2879,7 +2778,7 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e8190d04c665ea9e6b6a0dc45523ade572c088d2e6566244c1122671dbf4ae3a"
dependencies = [
- "autocfg 1.0.1",
+ "autocfg",
"bytes 1.0.1",
"libc",
"memchr",
@@ -3322,6 +3221,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85e60b0d1b5f99db2556934e21937020776a5d31520bf169e851ac44e6420214"
[[package]]
+name = "xml5ever"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b1b52e6e8614d4a58b8e70cf51ec0cc21b256ad8206708bcff8139b5bbd6a59"
+dependencies = [
+ "log 0.4.14",
+ "mac",
+ "markup5ever",
+ "time 0.1.44",
+]
+
+[[package]]
name = "yansi"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
diff --git a/Cargo.toml b/Cargo.toml
@@ -106,7 +106,8 @@ newline-converter = "0.1.0"
handlebars = { version = "3.5.2", features = ["dir_source"] }
# For favicon extraction from main website
-soup = "0.5.0"
+html5ever = "0.25.1"
+markup5ever_rcdom = "0.1.0"
regex = { version = "1.4.3", features = ["std", "perf"], default-features = false }
data-url = "0.1.0"
diff --git a/src/api/icons.rs b/src/api/icons.rs
@@ -11,7 +11,6 @@ use once_cell::sync::Lazy;
use regex::Regex;
use reqwest::{blocking::Client, blocking::Response, header, Url};
use rocket::{http::ContentType, http::Cookie, response::Content, Route};
-use soup::prelude::*;
use crate::{error::Error, util::Cached, CONFIG};
@@ -332,6 +331,42 @@ impl Icon {
}
}
+/// Collects favicon candidates from every `<link>` element at or below `node`.
+///
+/// A `<link>` is accepted when its `rel` attribute matches `ICON_REL_REGEX` and it carries an
+/// `href` that can be joined onto `url`. Each accepted link is appended to `icons` with the
+/// priority computed by `get_icon_priority` from the resolved href and the optional `sizes`
+/// attribute. Links whose `href` cannot be resolved against `url` are skipped silently.
+///
+/// The tree is walked with an explicit stack instead of recursion, so the nesting depth of the
+/// parsed (remote, untrusted) document cannot exhaust the call stack. Nodes are visited in
+/// document order (pre-order), so `icons` receives its entries in document order.
+fn get_favicons_node(node: &std::rc::Rc<markup5ever_rcdom::Node>, icons: &mut Vec<Icon>, url: &Url) {
+    // Nodes still waiting to be visited; the next node to process sits at the end.
+    let mut pending = vec![std::rc::Rc::clone(node)];
+
+    while let Some(current) = pending.pop() {
+        if let markup5ever_rcdom::NodeData::Element { name, attrs, .. } = &current.data {
+            if name.local.as_ref() == "link" {
+                let mut has_rel = false;
+                let mut href: Option<&str> = None;
+                let mut sizes: Option<&str> = None;
+
+                let attrs = attrs.borrow();
+                for attr in attrs.iter() {
+                    let attr_name: &str = attr.name.local.as_ref();
+                    let attr_value: &str = attr.value.as_ref();
+
+                    match attr_name {
+                        // Only `rel` values describing an icon qualify the link.
+                        "rel" if ICON_REL_REGEX.is_match(attr_value) => has_rel = true,
+                        "href" => href = Some(attr_value),
+                        "sizes" => sizes = Some(attr_value),
+                        _ => {}
+                    }
+                }
+
+                if let (true, Some(href)) = (has_rel, href) {
+                    // Skip hrefs that do not resolve to a valid URL.
+                    if let Ok(full_href) = url.join(href).map(|h| h.into_string()) {
+                        let priority = get_icon_priority(&full_href, sizes);
+                        icons.push(Icon::new(priority, full_href));
+                    }
+                }
+            }
+        }
+
+        // Push children in reverse so the first child is popped (and visited) first.
+        pending.extend(current.children.borrow().iter().rev().cloned());
+    }
+}
+
+
struct IconUrlResult {
iconlist: Vec<Icon>,
cookies: String,
@@ -431,30 +466,14 @@ fn get_icon_url(domain: &str) -> Result<IconUrlResult, Error> {
// 512KB should be more than enough for the HTML, though as we only really need
// the HTML header, it could potentially be reduced even further
- let limited_reader = content.take(512 * 1024);
-
- let soup = Soup::from_reader(limited_reader)?;
- // Search for and filter
- let favicons = soup
- .tag("link")
- .attr("rel", ICON_REL_REGEX.clone()) // Only use icon rels
- .attr_name("href") // Make sure there is a href
- .find_all();
-
- // Loop through all the found icons and determine it's priority
- for favicon in favicons {
- let sizes = favicon.get("sizes");
- let href = favicon.get("href").unwrap();
- // Skip invalid url's
- let full_href = match url.join(&href) {
- Ok(h) => h.into_string(),
- _ => continue,
- };
-
- let priority = get_icon_priority(&full_href, sizes);
-
- iconlist.push(Icon::new(priority, full_href))
- }
+ let mut limited_reader = content.take(512 * 1024);
+
+ use html5ever::tendril::TendrilSink;
+ let dom = html5ever::parse_document(markup5ever_rcdom::RcDom::default(), Default::default())
+ .from_utf8()
+ .read_from(&mut limited_reader)?;
+
+ get_favicons_node(&dom.document, &mut iconlist, &url);
} else {
// Add the default favicon.ico to the list with just the given domain
iconlist.push(Icon::new(35, format!("{}/favicon.ico", ssldomain)));
@@ -506,7 +525,7 @@ fn get_page_with_cookies(url: &str, cookie_str: &str, referer: &str) -> Result<R
/// priority1 = get_icon_priority("http://example.com/path/to/a/favicon.png", "32x32");
/// priority2 = get_icon_priority("https://example.com/path/to/a/favicon.ico", "");
/// ```
-fn get_icon_priority(href: &str, sizes: Option<String>) -> u8 {
+fn get_icon_priority(href: &str, sizes: Option<&str>) -> u8 {
// Check if there is a dimension set
let (width, height) = parse_sizes(sizes);
@@ -554,7 +573,7 @@ fn get_icon_priority(href: &str, sizes: Option<String>) -> u8 {
/// let (width, height) = parse_sizes("x128x128"); // (128, 128)
/// let (width, height) = parse_sizes("32"); // (0, 0)
/// ```
-fn parse_sizes(sizes: Option<String>) -> (u16, u16) {
+fn parse_sizes(sizes: Option<&str>) -> (u16, u16) {
let mut width: u16 = 0;
let mut height: u16 = 0;