1 use crate::post::SiteMetadata;
2 use encoding::{all::encodings, DecoderTrap};
3 use lemmy_db_schema::newtypes::DbUrl;
5 error::{LemmyError, LemmyErrorType},
6 settings::structs::Settings,
10 use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
11 use reqwest_middleware::ClientWithMiddleware;
12 use serde::Deserialize;
17 /// Fetches the post link html tags (like title, description, image, etc)
///
/// Sends a GET request for `url` through `client`, reads the whole response body as raw bytes
/// and passes those bytes to `html_to_site_metadata`.
///
/// # Errors
/// Propagates (via `?`) a failure to send the request, a failure to read the response body, and
/// any error returned by `html_to_site_metadata`.
18 #[tracing::instrument(skip_all)]
19 pub async fn fetch_site_metadata(
20 client: &ClientWithMiddleware,
22 ) -> Result<SiteMetadata, LemmyError> {
23 info!("Fetching site metadata for url: {}", url);
24 let response = client.get(url.as_str()).send().await?;
26 // Can't use .text() here, because it only checks the content header, not the actual bytes
27 // https://github.com/LemmyNet/lemmy/issues/1964
28 let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
// The parser receives the undecoded bytes so that it can decode them again itself when the
// page declares a charset other than UTF-8.
30 let tags = html_to_site_metadata(&html_bytes, url)?;
/// Extracts title, description, image and embed video URL from an HTML document.
///
/// `html_bytes` is first decoded as lossy UTF-8 and parsed. When the parsed page carries a
/// `charset` meta value other than `utf-8`, the bytes are decoded again with the matching entry
/// of `encodings()` and the page is parsed a second time. The `og_title` / `og_description`
/// values win over the plain page title / description when both exist. The image and embed
/// video URLs are resolved against `url` with `url.join(..)`.
///
/// # Errors
/// - `LemmyErrorType::NoLinesInHtml` when no first line can be obtained.
/// - `LemmyErrorType::SiteMetadataPageIsNotDoctypeHtml` when the first line does not start with
///   `<!doctype html>`.
/// - Any error propagated from `HTML::from_string`.
35 fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> {
// Lossy decode: invalid UTF-8 sequences are replaced instead of aborting the parse.
36 let html = String::from_utf8_lossy(html_bytes);
38 // Make sure the first line is doctype html
43 .ok_or(LemmyErrorType::NoLinesInHtml)?
// NOTE(review): the unit tests feed an upper-case `<!DOCTYPE html>` and expect success, so
// `first_line` is presumably lower-cased before this comparison; that step is not visible
// in this view - confirm.
46 if !first_line.starts_with("<!doctype html>") {
47 Err(LemmyErrorType::SiteMetadataPageIsNotDoctypeHtml)?;
50 let mut page = HTML::from_string(html.to_string(), None)?;
52 // If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the
53 // proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8
55 if let Some(charset) = page.meta.get("charset") {
56 if charset.to_lowercase() != "utf-8" {
// NOTE(review): the UTF-8 check above is case-insensitive, but this lookup compares
// `e.name()` with the charset exactly as the page wrote it. A charset written in a different
// case would presumably find no encoding and keep the UTF-8 parse - confirm against the
// names returned by the `encoding` crate.
57 if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) {
// Only a successful decode replaces `page`; otherwise the UTF-8 parse is kept.
58 if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) {
59 page = HTML::from_string(html_with_encoding, None)?;
65 let page_title = page.title;
66 let page_description = page.description;
68 let og_description = page
72 .map(std::string::ToString::to_string);
77 .map(std::string::ToString::to_string);
82 // join also works if the target URL is absolute
// A value that cannot be joined onto `url` is dropped (`.ok()`), not reported as an error.
83 .and_then(|ogo| url.join(&ogo.url).ok());
84 let og_embed_url = page
88 // join also works if the target URL is absolute
89 .and_then(|v| url.join(&v.url).ok());
// Prefer the `og_*` value and fall back to the plain page value.
92 title: og_title.or(page_title),
93 description: og_description.or(page_description),
94 image: og_image.map(Into::into),
95 embed_video_url: og_embed_url.map(Into::into),
/// JSON body that `fetch_pictrs` deserializes from the pict-rs download response.
/// `fetch_pictrs` treats the response as successful when its `msg` equals `"ok"`.
99 #[derive(Deserialize, Debug, Clone)]
100 pub(crate) struct PictrsResponse {
// Files reported by pict-rs; `fetch_site_data` reads the first entry.
101 files: Vec<PictrsFile>,
/// One element of `PictrsResponse::files`.
105 #[derive(Deserialize, Debug, Clone)]
106 pub(crate) struct PictrsFile {
// NOTE(review): presumably the token pict-rs expects when deleting this file; nothing in the
// visible code reads it - confirm.
109 delete_token: String,
/// JSON body that `purge_image_from_pictrs` deserializes from the pict-rs purge response; that
/// function compares its `msg` with `"ok"`.
112 #[derive(Deserialize, Debug, Clone)]
113 pub(crate) struct PictrsPurgeResponse {
/// Asks pict-rs to download `image_url` through its `image/download` endpoint.
///
/// The URL is first checked with `is_image_content_type`, then percent-encoded into the
/// `url` query parameter of the download request. The JSON answer is deserialized into a
/// `PictrsResponse`.
///
/// # Errors
/// Propagates a missing pict-rs config, a failed content-type check and a JSON decoding
/// failure. A response whose `msg` is not `"ok"` becomes
/// `LemmyErrorType::PictrsResponseError` carrying that message.
117 #[tracing::instrument(skip_all)]
118 pub(crate) async fn fetch_pictrs(
119 client: &ClientWithMiddleware,
122 ) -> Result<PictrsResponse, LemmyError> {
123 let pictrs_config = settings.pictrs_config()?;
// This check performs its own GET request against `image_url` before pict-rs is contacted.
124 is_image_content_type(client, image_url).await?;
// NOTE(review): no `/` separates the base from `image/download` here, while
// `purge_image_from_pictrs` builds `"{}/internal/purge?alias={}"` with one. If both use the
// same configured base URL, one of the two presumably yields a doubled or missing slash -
// confirm.
126 let fetch_url = format!(
127 "{}image/download?url={}",
129 utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed
132 let response = client
134 .timeout(REQWEST_TIMEOUT)
138 let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
// pict-rs signals success through the `msg` field of the body.
140 if response.msg == "ok" {
143 Err(LemmyErrorType::PictrsResponseError(response.msg))?
147 /// Purges an image from pictrs
148 /// Note: This should often be coerced from a Result to .ok() in order to fail softly, because:
149 /// - It might fail due to image being not local
150 /// - It might not be an image
151 /// - Pictrs might not be set up
///
/// # Errors
/// Propagates a missing pict-rs config, a failed `is_image_content_type` check and a JSON
/// decoding failure. Also returns `ImageUrlMissingPathSegments`,
/// `ImageUrlMissingLastPathSegment`, `PictrsApiKeyNotProvided`, or
/// `PictrsPurgeResponseError` (when the response `msg` is not `"ok"`).
152 pub async fn purge_image_from_pictrs(
153 client: &ClientWithMiddleware,
156 ) -> Result<(), LemmyError> {
157 let pictrs_config = settings.pictrs_config()?;
// This check performs its own GET request against `image_url` before pict-rs is contacted.
158 is_image_content_type(client, image_url).await?;
// The alias sent to pict-rs is taken from the path segments of `image_url`.
160 let alias = image_url
162 .ok_or(LemmyErrorType::ImageUrlMissingPathSegments)?
164 .ok_or(LemmyErrorType::ImageUrlMissingLastPathSegment)?;
// NOTE(review): a `/` follows the base URL here, while `fetch_pictrs` formats
// `"{}image/download?url={}"` without one; for a single configured base URL one of the two
// presumably produces a doubled or missing slash - confirm. `alias` is also inserted into the
// query string without percent-encoding.
166 let purge_url = format!("{}/internal/purge?alias={}", pictrs_config.url, alias);
168 let pictrs_api_key = pictrs_config
170 .ok_or(LemmyErrorType::PictrsApiKeyNotProvided)?;
171 let response = client
173 .timeout(REQWEST_TIMEOUT)
// The API key is sent in the `x-api-token` request header.
174 .header("x-api-token", pictrs_api_key)
178 let response: PictrsPurgeResponse = response.json().await.map_err(LemmyError::from)?;
180 if response.msg == "ok" {
183 Err(LemmyErrorType::PictrsPurgeResponseError(response.msg))?
187 /// Both are options, since the URL might be either an html page, or an image
188 /// Returns the SiteMetadata, and a Pictrs URL, if there is a picture associated
///
/// Never returns an error: a metadata fetch failure is turned into `None` with `.ok()`.
/// The thumbnail source is the metadata image when there is one, otherwise `url` itself.
189 #[tracing::instrument(skip_all)]
190 pub async fn fetch_site_data(
191 client: &ClientWithMiddleware,
195 ) -> (Option<SiteMetadata>, Option<DbUrl>) {
199 // Ignore errors, since it may be an image, or not have the data.
200 // Warning, this may ignore SSL errors
201 let metadata_option = fetch_site_metadata(client, url).await.ok();
// Early exit that hands back the metadata without any thumbnail.
// NOTE(review): the condition guarding this return is not visible in this view.
203 return (metadata_option, None);
// Picks the `file` value of the first entry in a pict-rs response.
// NOTE(review): `expect` panics when `files` is empty, although the rest of this function
// reports failures as `None` - consider whether an empty list should map to `None` instead.
206 let missing_pictrs_file =
207 |r: PictrsResponse| r.files.first().expect("missing pictrs file").file.clone();
209 // Fetch pictrs thumbnail
210 let pictrs_hash = match &metadata_option {
211 Some(metadata_res) => match &metadata_res.image {
212 // Metadata, with image
213 // Try to generate a small thumbnail if there's a full sized one from post-links
214 Some(metadata_image) => fetch_pictrs(client, settings, metadata_image)
216 .map(missing_pictrs_file),
217 // Metadata, but no image
218 None => fetch_pictrs(client, settings, url)
220 .map(missing_pictrs_file),
222 // No metadata, try to fetch the URL as an image
223 None => fetch_pictrs(client, settings, url)
225 .map(missing_pictrs_file),
228 // The full urls are necessary for federation
229 let pictrs_thumbnail = pictrs_hash
// The thumbnail URL is rooted at this instance's own protocol and hostname.
232 "{}/pictrs/image/{}",
233 settings.get_protocol_and_hostname(),
241 (metadata_option, pictrs_thumbnail.map(Into::into))
// NOTE(review): this arm belongs to an outer match whose scrutinee is not visible here; it
// yields neither metadata nor thumbnail.
243 None => (None, None),
/// Checks that `url` answers with a content type that starts with `image/`.
///
/// # Errors
/// Propagates a failure to send the request. Returns `LemmyErrorType::NoContentTypeHeader` when
/// the content type cannot be obtained and `LemmyErrorType::NotAnImageType` when the check
/// fails.
247 #[tracing::instrument(skip_all)]
248 async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
// NOTE(review): this is a GET request although only the response headers appear to be
// inspected in the visible code; a HEAD request would presumably suffice - confirm.
249 let response = client.get(url.as_str()).send().await?;
253 .ok_or(LemmyErrorType::NoContentTypeHeader)?
// Prefix match, so any `image/...` subtype is accepted.
255 .starts_with("image/")
259 Err(LemmyErrorType::NotAnImageType)?
/// Builds the user agent string for outgoing HTTP requests; the test module passes the result
/// to `reqwest::Client::builder().user_agent(..)`.
///
/// The visible code feeds `settings.get_protocol_and_hostname()` into the result; the
/// surrounding format is not shown in this view.
263 pub fn build_user_agent(settings: &Settings) -> String {
267 settings.get_protocol_and_hostname()
273 #![allow(clippy::unwrap_used)]
274 #![allow(clippy::indexing_slicing)]
276 use crate::request::{
279 html_to_site_metadata,
282 use lemmy_utils::settings::SETTINGS;
285 // These helped with testing
// Fetches a real GitLab wiki page through `fetch_site_metadata` and compares the result with a
// hard-coded title, description and image URL.
// NOTE(review): this presumably needs network access and breaks whenever the remote page
// changes its metadata - confirm whether it should run in CI.
287 async fn test_site_metadata() {
288 let settings = &SETTINGS.clone();
289 let client = reqwest::Client::builder()
290 .user_agent(build_user_agent(settings))
294 let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap();
295 let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
298 title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
300 "The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()
303 Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
307 embed_video_url: None,
314 // fn test_pictshare() {
315 // let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg");
316 // assert!(res.is_ok());
317 // let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
318 // assert!(res_other.is_err());
// Verifies how `html_to_site_metadata` resolves an `og:image` value against the page URL.
// Runs offline: every case parses an inline HTML byte string.
322 fn test_resolve_image_url() {
323 // url that lists the opengraph fields
324 let url = Url::parse("https://example.com/one/two.html").unwrap();
// Case: path starting with `/` resolves against the host root.
327 let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='/image.jpg'></head><body></body></html>";
328 let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
331 Some(Url::parse("https://example.com/image.jpg").unwrap().into())
// Case: bare file name resolves against the directory of the page (`/one/`).
335 let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='image.jpg'></head><body></body></html>";
336 let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
340 Url::parse("https://example.com/one/image.jpg")
// Case: absolute URL on another host is kept as written.
347 let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='https://cdn.host.com/image.jpg'></head><body></body></html>";
348 let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
351 Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into())
354 // protocol relative url
// The scheme (`https`) is taken from the page URL.
355 let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='//example.com/image.jpg'></head><body></body></html>";
356 let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
359 Some(Url::parse("https://example.com/image.jpg").unwrap().into())