X-Git-Url: http://these/git/?a=blobdiff_plain;f=crates%2Fapi_common%2Fsrc%2Frequest.rs;h=b62514c02f22b3139ec4e3c268ff0ce15ba1bd5e;hb=70fae9d68d65b1e4d153e30d3c065cc315b75eaf;hp=7e3b2152cbbdbf4618b2007522cd42f298b775d1;hpb=3aa3d75a1e04b6ed4bc7566f86f45e6883c5c39b;p=lemmy.git diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs index 7e3b2152..b62514c0 100644 --- a/crates/api_common/src/request.rs +++ b/crates/api_common/src/request.rs @@ -1,6 +1,12 @@ use crate::post::SiteMetadata; use encoding::{all::encodings, DecoderTrap}; -use lemmy_utils::{settings::structs::Settings, version::VERSION, LemmyError, REQWEST_TIMEOUT}; +use lemmy_db_schema::newtypes::DbUrl; +use lemmy_utils::{ + error::{LemmyError, LemmyErrorType}, + settings::structs::Settings, + version::VERSION, + REQWEST_TIMEOUT, +}; use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC}; use reqwest_middleware::ClientWithMiddleware; use serde::Deserialize; @@ -15,37 +21,30 @@ pub async fn fetch_site_metadata( url: &Url, ) -> Result { info!("Fetching site metadata for url: {}", url); - let response = client - .get(url.as_str()) - .timeout(REQWEST_TIMEOUT) - .send() - .await?; + let response = client.get(url.as_str()).send().await?; // Can't use .text() here, because it only checks the content header, not the actual bytes // https://github.com/LemmyNet/lemmy/issues/1964 let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec(); - let tags = html_to_site_metadata(&html_bytes)?; + let tags = html_to_site_metadata(&html_bytes, url)?; Ok(tags) } -fn html_to_site_metadata(html_bytes: &[u8]) -> Result { +fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result { let html = String::from_utf8_lossy(html_bytes); // Make sure the first line is doctype html let first_line = html .trim_start() .lines() - .into_iter() .next() - .ok_or_else(|| LemmyError::from_message("No lines in html"))? + .ok_or(LemmyErrorType::NoLinesInHtml)? .to_lowercase(); if !first_line.starts_with("") { - return Err(LemmyError::from_message( - "Site metadata page fetch is not DOCTYPE html", - )); + Err(LemmyErrorType::SiteMetadataPageIsNotDoctypeHtml)?; } let mut page = HTML::from_string(html.to_string(), None)?; @@ -70,27 +69,30 @@ fn html_to_site_metadata(html_bytes: &[u8]) -> Result .opengraph .properties .get("description") - .map(|t| t.to_string()); + .map(std::string::ToString::to_string); let og_title = page .opengraph .properties .get("title") - .map(|t| t.to_string()); + .map(std::string::ToString::to_string); let og_image = page .opengraph .images - .get(0) - .and_then(|ogo| Url::parse(&ogo.url).ok()); - - let title = og_title.or(page_title); - let description = og_description.or(page_description); - let image = og_image; + .first() + // join also works if the target URL is absolute + .and_then(|ogo| url.join(&ogo.url).ok()); + let og_embed_url = page + .opengraph + .videos + .first() + // join also works if the target URL is absolute + .and_then(|v| url.join(&v.url).ok()); Ok(SiteMetadata { - title, - description, - image, - html: None, + title: og_title.or(page_title), + description: og_description.or(page_description), + image: og_image.map(Into::into), + embed_video_url: og_embed_url.map(Into::into), }) } @@ -107,36 +109,78 @@ pub(crate) struct PictrsFile { delete_token: String, } +#[derive(Deserialize, Debug, Clone)] +pub(crate) struct PictrsPurgeResponse { + msg: String, +} + #[tracing::instrument(skip_all)] pub(crate) async fn fetch_pictrs( client: &ClientWithMiddleware, settings: &Settings, image_url: &Url, ) -> Result { - if let Some(pictrs_url) = settings.pictrs_url.to_owned() { - is_image_content_type(client, image_url).await?; + let pictrs_config = settings.pictrs_config()?; + is_image_content_type(client, image_url).await?; - let fetch_url = format!( - "{}/image/download?url={}", - pictrs_url, - utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed - ); + let fetch_url = format!( + "{}image/download?url={}", + pictrs_config.url, + utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed + ); - let response = client - .get(&fetch_url) - .timeout(REQWEST_TIMEOUT) - .send() - .await?; + let response = client + .get(&fetch_url) + .timeout(REQWEST_TIMEOUT) + .send() + .await?; - let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?; + let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?; - if response.msg == "ok" { - Ok(response) - } else { - Err(LemmyError::from_message(&response.msg)) - } + if response.msg == "ok" { + Ok(response) } else { - Err(LemmyError::from_message("pictrs_url not set up in config")) + Err(LemmyErrorType::PictrsResponseError(response.msg))? + } +} + +/// Purges an image from pictrs +/// Note: This should often be coerced from a Result to .ok() in order to fail softly, because: +/// - It might fail due to image being not local +/// - It might not be an image +/// - Pictrs might not be set up +pub async fn purge_image_from_pictrs( + client: &ClientWithMiddleware, + settings: &Settings, + image_url: &Url, +) -> Result<(), LemmyError> { + let pictrs_config = settings.pictrs_config()?; + is_image_content_type(client, image_url).await?; + + let alias = image_url + .path_segments() + .ok_or(LemmyErrorType::ImageUrlMissingPathSegments)? + .next_back() + .ok_or(LemmyErrorType::ImageUrlMissingLastPathSegment)?; + + let purge_url = format!("{}/internal/purge?alias={}", pictrs_config.url, alias); + + let pictrs_api_key = pictrs_config + .api_key + .ok_or(LemmyErrorType::PictrsApiKeyNotProvided)?; + let response = client + .post(&purge_url) + .timeout(REQWEST_TIMEOUT) + .header("x-api-token", pictrs_api_key) + .send() + .await?; + + let response: PictrsPurgeResponse = response.json().await.map_err(LemmyError::from)?; + + if response.msg == "ok" { + Ok(()) + } else { + Err(LemmyErrorType::PictrsPurgeResponseError(response.msg))? } } @@ -147,13 +191,20 @@ pub async fn fetch_site_data( client: &ClientWithMiddleware, settings: &Settings, url: Option<&Url>, -) -> (Option, Option) { + include_image: bool, +) -> (Option, Option) { match &url { Some(url) => { // Fetch metadata // Ignore errors, since it may be an image, or not have the data. // Warning, this may ignore SSL errors let metadata_option = fetch_site_metadata(client, url).await.ok(); + if !include_image { + return (metadata_option, None); + } + + let missing_pictrs_file = + |r: PictrsResponse| r.files.first().expect("missing pictrs file").file.clone(); // Fetch pictrs thumbnail let pictrs_hash = match &metadata_option { @@ -162,16 +213,16 @@ pub async fn fetch_site_data( // Try to generate a small thumbnail if there's a full sized one from post-links Some(metadata_image) => fetch_pictrs(client, settings, metadata_image) .await - .map(|r| r.files[0].file.to_owned()), + .map(missing_pictrs_file), // Metadata, but no image None => fetch_pictrs(client, settings, url) .await - .map(|r| r.files[0].file.to_owned()), + .map(missing_pictrs_file), }, // No metadata, try to fetch the URL as an image None => fetch_pictrs(client, settings, url) .await - .map(|r| r.files[0].file.to_owned()), + .map(missing_pictrs_file), }; // The full urls are necessary for federation @@ -187,7 +238,7 @@ pub async fn fetch_site_data( .ok() .flatten(); - (metadata_option, pictrs_thumbnail) + (metadata_option, pictrs_thumbnail.map(Into::into)) } None => (None, None), } @@ -195,21 +246,17 @@ pub async fn fetch_site_data( #[tracing::instrument(skip_all)] async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> { - let response = client - .get(url.as_str()) - .timeout(REQWEST_TIMEOUT) - .send() - .await?; + let response = client.get(url.as_str()).send().await?; if response .headers() .get("Content-Type") - .ok_or_else(|| LemmyError::from_message("No Content-Type header"))? + .ok_or(LemmyErrorType::NoContentTypeHeader)? .to_str()? .starts_with("image/") { Ok(()) } else { - Err(LemmyError::from_message("Not an image type.")) + Err(LemmyErrorType::NotAnImageType)? } } @@ -223,16 +270,24 @@ pub fn build_user_agent(settings: &Settings) -> String { #[cfg(test)] mod tests { - use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata}; - use lemmy_utils::settings::structs::Settings; + #![allow(clippy::unwrap_used)] + #![allow(clippy::indexing_slicing)] + + use crate::request::{ + build_user_agent, + fetch_site_metadata, + html_to_site_metadata, + SiteMetadata, + }; + use lemmy_utils::settings::SETTINGS; use url::Url; // These helped with testing - #[actix_rt::test] + #[tokio::test] async fn test_site_metadata() { - let settings = Settings::init().unwrap(); + let settings = &SETTINGS.clone(); let client = reqwest::Client::builder() - .user_agent(build_user_agent(&settings)) + .user_agent(build_user_agent(settings)) .build() .unwrap() .into(); @@ -247,21 +302,12 @@ mod tests { image: Some( Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png") .unwrap() + .into() ), - html: None, + embed_video_url: None, }, sample_res ); - - let youtube_url = Url::parse("https://www.youtube.com/watch?v=IquO_TcMZIQ").unwrap(); - let youtube_res = fetch_site_metadata(&client, &youtube_url).await.unwrap(); - assert_eq!( - SiteMetadata { - title: Some("A Hard Look at Rent and Rent Seeking with Michael Hudson & Pepe Escobar".to_string()), - description: Some("An interactive discussion on wealth inequality and the “Great Game” on the control of natural resources.In this webinar organized jointly by the Henry George...".to_string()), - image: Some(Url::parse("https://i.ytimg.com/vi/IquO_TcMZIQ/maxresdefault.jpg").unwrap()), - html: None, - }, youtube_res); } // #[test] @@ -271,4 +317,46 @@ mod tests { // let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu"); // assert!(res_other.is_err()); // } + + #[test] + fn test_resolve_image_url() { + // url that lists the opengraph fields + let url = Url::parse("https://example.com/one/two.html").unwrap(); + + // root relative url + let html_bytes = b""; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some(Url::parse("https://example.com/image.jpg").unwrap().into()) + ); + + // base relative url + let html_bytes = b""; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some( + Url::parse("https://example.com/one/image.jpg") + .unwrap() + .into() + ) + ); + + // absolute url + let html_bytes = b""; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into()) + ); + + // protocol relative url + let html_bytes = b""; + let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata"); + assert_eq!( + metadata.image, + Some(Url::parse("https://example.com/image.jpg").unwrap().into()) + ); + } }