From 661f97a073bf0aa4d17c8cb3c5a47cf603e5e910 Mon Sep 17 00:00:00 2001 From: Rens Groothuijsen Date: Thu, 6 Jan 2022 14:13:17 +0100 Subject: [PATCH] Use correct encoding when fetching non-UTF-8 site metadata (#2015) * Use correct encoding when fetching non-UTF-8 site metadata * Style fixes --- crates/utils/Cargo.toml | 1 + crates/utils/src/request.rs | 25 ++++++++++++++++++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/crates/utils/Cargo.toml b/crates/utils/Cargo.toml index a8c35a06..894ae0c1 100644 --- a/crates/utils/Cargo.toml +++ b/crates/utils/Cargo.toml @@ -45,3 +45,4 @@ webpage = { version = "1.4.0", default-features = false, features = ["serde"] } jsonwebtoken = "7.2.0" doku = "0.10.2" uuid = { version = "0.8.2", features = ["serde", "v4"] } +encoding = "0.2.33" \ No newline at end of file diff --git a/crates/utils/src/request.rs b/crates/utils/src/request.rs index 081eaa2b..6af360a5 100644 --- a/crates/utils/src/request.rs +++ b/crates/utils/src/request.rs @@ -1,5 +1,6 @@ use crate::{settings::structs::Settings, version::VERSION, LemmyError}; use anyhow::anyhow; +use encoding::{all::encodings, DecoderTrap}; use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest_middleware::ClientWithMiddleware; use serde::{Deserialize, Serialize}; @@ -75,16 +76,17 @@ pub async fn fetch_site_metadata( .map_err(|e| RecvError(e.to_string()))? .to_vec(); - let html = String::from_utf8_lossy(&html_bytes); - - let tags = html_to_site_metadata(&html)?; + let tags = html_to_site_metadata(&html_bytes)?; Ok(tags) } -fn html_to_site_metadata(html: &str) -> Result { +fn html_to_site_metadata(html_bytes: &[u8]) -> Result { + let html = String::from_utf8_lossy(html_bytes); + // Make sure the first line is doctype html let first_line = html + .trim_start() .lines() .into_iter() .next() @@ -97,7 +99,20 @@ fn html_to_site_metadata(html: &str) -> Result { )); } - let page = HTML::from_string(html.to_string(), None)?; + let mut page = HTML::from_string(html.to_string(), None)?; + + // If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the + // proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8 + // version. + if let Some(charset) = page.meta.get("charset") { + if charset.to_lowercase() != "utf-8" { + if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) { + if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) { + page = HTML::from_string(html_with_encoding, None)?; + } + } + } + } let page_title = page.title; let page_description = page.description; -- 2.44.1