commit 3d2df6ce114366786d08bc21f52388753ab4991f
parent d1af468700021f6856f1fa6b991a84ed92727554
Author: Mathijs van Veluw <black.dex@gmail.com>
Date: Sun, 13 Aug 2023 19:31:43 +0200
Merge pull request #3751 from BlackDex/optimize-icon-fetching
Optimized Favicon downloading
Diffstat:
2 files changed, 116 insertions(+), 108 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
@@ -124,7 +124,7 @@ email_address = "0.2.4"
handlebars = { version = "4.3.7", features = ["dir_source"] }
# HTTP client (Used for favicons, version check, DUO and HIBP API)
-reqwest = { version = "0.11.18", features = ["stream", "json", "gzip", "brotli", "socks", "cookies", "trust-dns"] }
+reqwest = { version = "0.11.18", features = ["stream", "json", "deflate", "gzip", "brotli", "socks", "cookies", "trust-dns", "native-tls-alpn"] }
# Favicon extraction libraries
html5gum = "0.5.7"
diff --git a/src/api/icons.rs b/src/api/icons.rs
@@ -19,7 +19,7 @@ use tokio::{
net::lookup_host,
};
-use html5gum::{Emitter, EndTag, HtmlString, InfallibleTokenizer, Readable, StartTag, StringReader, Tokenizer};
+use html5gum::{Emitter, HtmlString, InfallibleTokenizer, Readable, StringReader, Tokenizer};
use crate::{
error::Error,
@@ -46,10 +46,15 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
// Generate the cookie store
let cookie_store = Arc::new(Jar::default());
+ let icon_download_timeout = Duration::from_secs(CONFIG.icon_download_timeout());
+ let pool_idle_timeout = Duration::from_secs(10);
// Reuse the client between requests
let client = get_reqwest_client_builder()
.cookie_provider(Arc::clone(&cookie_store))
- .timeout(Duration::from_secs(CONFIG.icon_download_timeout()))
+ .timeout(icon_download_timeout)
+ .pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections
+ .pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds
+ .trust_dns(true)
.default_headers(default_headers.clone());
match client.build() {
@@ -58,9 +63,11 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
error!("Possible trust-dns error, trying with trust-dns disabled: '{e}'");
get_reqwest_client_builder()
.cookie_provider(cookie_store)
- .timeout(Duration::from_secs(CONFIG.icon_download_timeout()))
- .default_headers(default_headers)
+ .timeout(icon_download_timeout)
+ .pool_max_idle_per_host(5) // Configure the Hyper Pool to only have max 5 idle connections
+ .pool_idle_timeout(pool_idle_timeout) // Configure the Hyper Pool to timeout after 10 seconds
.trust_dns(false)
+ .default_headers(default_headers)
.build()
.expect("Failed to build client")
}
@@ -258,7 +265,7 @@ mod tests {
}
}
-#[derive(Debug, Clone)]
+#[derive(Clone)]
enum DomainBlacklistReason {
Regex,
IP,
@@ -415,38 +422,34 @@ fn get_favicons_node(
const TAG_LINK: &[u8] = b"link";
const TAG_BASE: &[u8] = b"base";
const TAG_HEAD: &[u8] = b"head";
- const ATTR_REL: &[u8] = b"rel";
const ATTR_HREF: &[u8] = b"href";
const ATTR_SIZES: &[u8] = b"sizes";
let mut base_url = url.clone();
- let mut icon_tags: Vec<StartTag> = Vec::new();
+ let mut icon_tags: Vec<Tag> = Vec::new();
for token in dom {
- match token {
- FaviconToken::StartTag(tag) => {
- if *tag.name == TAG_LINK
- && tag.attributes.contains_key(ATTR_REL)
- && tag.attributes.contains_key(ATTR_HREF)
- {
- let rel_value = std::str::from_utf8(tag.attributes.get(ATTR_REL).unwrap())
- .unwrap_or_default()
- .to_ascii_lowercase();
- if rel_value.contains("icon") && !rel_value.contains("mask-icon") {
- icon_tags.push(tag);
- }
- } else if *tag.name == TAG_BASE && tag.attributes.contains_key(ATTR_HREF) {
- let href = std::str::from_utf8(tag.attributes.get(ATTR_HREF).unwrap()).unwrap_or_default();
+ let tag_name: &[u8] = &token.tag.name;
+ match tag_name {
+ TAG_LINK => {
+ icon_tags.push(token.tag);
+ }
+ TAG_BASE => {
+ base_url = if let Some(href) = token.tag.attributes.get(ATTR_HREF) {
+ let href = std::str::from_utf8(href).unwrap_or_default();
debug!("Found base href: {href}");
- base_url = match base_url.join(href) {
+ match base_url.join(href) {
Ok(inner_url) => inner_url,
- _ => url.clone(),
- };
- }
+ _ => continue,
+ }
+ } else {
+ continue;
+ };
}
- FaviconToken::EndTag(tag) => {
- if *tag.name == TAG_HEAD {
- break;
- }
+ TAG_HEAD if token.closing => {
+ break;
+ }
+ _ => {
+ continue;
}
}
}
@@ -820,43 +823,64 @@ impl reqwest::cookie::CookieStore for Jar {
}
/// Custom FaviconEmitter for the html5gum parser.
-/// The FaviconEmitter is using an almost 1:1 copy of the DefaultEmitter with some small changes.
+/// The FaviconEmitter is using an optimized version of the DefaultEmitter.
/// This prevents emitting tags like comments, doctype and also strings between the tags.
+/// But it will also only emit the tags we need and only if they have the correct attributes
/// Therefor parsing the HTML content is faster.
-use std::collections::{BTreeSet, VecDeque};
+use std::collections::BTreeMap;
+
+#[derive(Default)]
+pub struct Tag {
+ /// The tag's name, such as `"link"` or `"base"`.
+ pub name: HtmlString,
-#[derive(Debug)]
-enum FaviconToken {
- StartTag(StartTag),
- EndTag(EndTag),
+ /// A mapping for any HTML attributes this start tag may have.
+ ///
+ /// Duplicate attributes are ignored after the first one as per WHATWG spec.
+ pub attributes: BTreeMap<HtmlString, HtmlString>,
}
-#[derive(Default, Debug)]
+struct FaviconToken {
+ tag: Tag,
+ closing: bool,
+}
+
+#[derive(Default)]
struct FaviconEmitter {
current_token: Option<FaviconToken>,
last_start_tag: HtmlString,
current_attribute: Option<(HtmlString, HtmlString)>,
- seen_attributes: BTreeSet<HtmlString>,
- emitted_tokens: VecDeque<FaviconToken>,
+ emit_token: bool,
}
impl FaviconEmitter {
- fn emit_token(&mut self, token: FaviconToken) {
- self.emitted_tokens.push_front(token);
- }
+ fn flush_current_attribute(&mut self, emit_current_tag: bool) {
+ const ATTR_HREF: &[u8] = b"href";
+ const ATTR_REL: &[u8] = b"rel";
+ const TAG_LINK: &[u8] = b"link";
+ const TAG_BASE: &[u8] = b"base";
+ const TAG_HEAD: &[u8] = b"head";
+
+ if let Some(ref mut token) = self.current_token {
+ let tag_name: &[u8] = &token.tag.name;
+
+ if self.current_attribute.is_some() && (tag_name == TAG_BASE || tag_name == TAG_LINK) {
+ let (k, v) = self.current_attribute.take().unwrap();
+ token.tag.attributes.entry(k).and_modify(|_| {}).or_insert(v);
+ }
- fn flush_current_attribute(&mut self) {
- if let Some((k, v)) = self.current_attribute.take() {
- match self.current_token {
- Some(FaviconToken::StartTag(ref mut tag)) => {
- tag.attributes.entry(k).and_modify(|_| {}).or_insert(v);
- }
- Some(FaviconToken::EndTag(_)) => {
- self.seen_attributes.insert(k);
- }
- _ => {
- debug_assert!(false);
+ let tag_attr = &token.tag.attributes;
+ match tag_name {
+ TAG_HEAD if token.closing => self.emit_token = true,
+ TAG_BASE if tag_attr.contains_key(ATTR_HREF) => self.emit_token = true,
+ TAG_LINK if emit_current_tag && tag_attr.contains_key(ATTR_REL) && tag_attr.contains_key(ATTR_HREF) => {
+ let rel_value =
+ std::str::from_utf8(token.tag.attributes.get(ATTR_REL).unwrap()).unwrap_or_default();
+ if rel_value.contains("icon") && !rel_value.contains("mask-icon") {
+ self.emit_token = true
+ }
}
+ _ => (),
}
}
}
@@ -871,87 +895,71 @@ impl Emitter for FaviconEmitter {
}
fn pop_token(&mut self) -> Option<Self::Token> {
- self.emitted_tokens.pop_back()
+ if self.emit_token {
+ self.emit_token = false;
+ return self.current_token.take();
+ }
+ None
}
fn init_start_tag(&mut self) {
- self.current_token = Some(FaviconToken::StartTag(StartTag::default()));
+ self.current_token = Some(FaviconToken {
+ tag: Tag::default(),
+ closing: false,
+ });
}
fn init_end_tag(&mut self) {
- self.current_token = Some(FaviconToken::EndTag(EndTag::default()));
- self.seen_attributes.clear();
+ self.current_token = Some(FaviconToken {
+ tag: Tag::default(),
+ closing: true,
+ });
}
fn emit_current_tag(&mut self) -> Option<html5gum::State> {
- self.flush_current_attribute();
- let mut token = self.current_token.take().unwrap();
- let mut emit = false;
- match token {
- FaviconToken::EndTag(ref mut tag) => {
- // Always clean seen attributes
- self.seen_attributes.clear();
- self.set_last_start_tag(None);
-
- // Only trigger an emit for the </head> tag.
- // This is matched, and will break the for-loop.
- if *tag.name == b"head" {
- emit = true;
- }
- }
- FaviconToken::StartTag(ref mut tag) => {
- // Only trriger an emit for <link> and <base> tags.
- // These are the only tags we want to parse.
- if *tag.name == b"link" || *tag.name == b"base" {
- self.set_last_start_tag(Some(&tag.name));
- emit = true;
- } else {
- self.set_last_start_tag(None);
- }
- }
- }
-
- // Only emit the tags we want to parse.
- if emit {
- self.emit_token(token);
+ self.flush_current_attribute(true);
+ self.last_start_tag.clear();
+ if self.current_token.is_some() && !self.current_token.as_ref().unwrap().closing {
+ self.last_start_tag.extend(&*self.current_token.as_ref().unwrap().tag.name);
}
- None
+ html5gum::naive_next_state(&self.last_start_tag)
}
fn push_tag_name(&mut self, s: &[u8]) {
- match self.current_token {
- Some(
- FaviconToken::StartTag(StartTag {
- ref mut name,
- ..
- })
- | FaviconToken::EndTag(EndTag {
- ref mut name,
- ..
- }),
- ) => {
- name.extend(s);
- }
- _ => debug_assert!(false),
+ if let Some(ref mut token) = self.current_token {
+ token.tag.name.extend(s);
}
}
fn init_attribute(&mut self) {
- self.flush_current_attribute();
- self.current_attribute = Some(Default::default());
+ self.flush_current_attribute(false);
+ self.current_attribute = match &self.current_token {
+ Some(token) => {
+ let tag_name: &[u8] = &token.tag.name;
+ match tag_name {
+ b"link" | b"head" | b"base" => Some(Default::default()),
+ _ => None,
+ }
+ }
+ _ => None,
+ };
}
fn push_attribute_name(&mut self, s: &[u8]) {
- self.current_attribute.as_mut().unwrap().0.extend(s);
+ if let Some(attr) = &mut self.current_attribute {
+ attr.0.extend(s)
+ }
}
fn push_attribute_value(&mut self, s: &[u8]) {
- self.current_attribute.as_mut().unwrap().1.extend(s);
+ if let Some(attr) = &mut self.current_attribute {
+ attr.1.extend(s)
+ }
}
fn current_is_appropriate_end_tag_token(&mut self) -> bool {
- match self.current_token {
- Some(FaviconToken::EndTag(ref tag)) => !self.last_start_tag.is_empty() && self.last_start_tag == tag.name,
+ match &self.current_token {
+ Some(token) if token.closing => !self.last_start_tag.is_empty() && self.last_start_tag == token.tag.name,
_ => false,
}
}