Cache & Optimize Woodpecker CI (#3450)

[lemmy.git] / crates / api_common / src / request.rs
diff --git a/crates/api_common/src/request.rs b/crates/api_common/src/request.rs

index 7e3b2152cbbdbf4618b2007522cd42f298b775d1..82126887ab4c66cd4b4ed12346ff3ad6907364cc 100644 (file)
--- a/crates/api_common/src/request.rs
+++ b/crates/api_common/src/request.rs
@@ -1,6 +1,12 @@
  use crate::post::SiteMetadata;
  use encoding::{all::encodings, DecoderTrap};
-use lemmy_utils::{settings::structs::Settings, version::VERSION, LemmyError, REQWEST_TIMEOUT};
+use lemmy_db_schema::newtypes::DbUrl;
+use lemmy_utils::{
+  error::{LemmyError, LemmyErrorType},
+  settings::structs::Settings,
+  version::VERSION,
+  REQWEST_TIMEOUT,
+};
  use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
  use reqwest_middleware::ClientWithMiddleware;
  use serde::Deserialize;
@@ -15,37 +21,30 @@ pub async fn fetch_site_metadata(
    url: &Url,
  ) -> Result<SiteMetadata, LemmyError> {
    info!("Fetching site metadata for url: {}", url);
-  let response = client
-    .get(url.as_str())
-    .timeout(REQWEST_TIMEOUT)
-    .send()
-    .await?;
+  let response = client.get(url.as_str()).send().await?;
  
    // Can't use .text() here, because it only checks the content header, not the actual bytes
    // https://github.com/LemmyNet/lemmy/issues/1964
    let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
  
-  let tags = html_to_site_metadata(&html_bytes)?;
+  let tags = html_to_site_metadata(&html_bytes, url)?;
  
    Ok(tags)
  }
  
-fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> {
+fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> {
    let html = String::from_utf8_lossy(html_bytes);
  
    // Make sure the first line is doctype html
    let first_line = html
      .trim_start()
      .lines()
-    .into_iter()
      .next()
-    .ok_or_else(|| LemmyError::from_message("No lines in html"))?
+    .ok_or(LemmyErrorType::NoLinesInHtml)?
      .to_lowercase();
  
    if !first_line.starts_with("<!doctype html>") {
-    return Err(LemmyError::from_message(
-      "Site metadata page fetch is not DOCTYPE html",
-    ));
+    return Err(LemmyErrorType::SiteMetadataPageIsNotDoctypeHtml)?;
    }
  
    let mut page = HTML::from_string(html.to_string(), None)?;
@@ -70,27 +69,30 @@ fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError>
      .opengraph
      .properties
      .get("description")
-    .map(|t| t.to_string());
+    .map(std::string::ToString::to_string);
    let og_title = page
      .opengraph
      .properties
      .get("title")
-    .map(|t| t.to_string());
+    .map(std::string::ToString::to_string);
    let og_image = page
      .opengraph
      .images
-    .get(0)
-    .and_then(|ogo| Url::parse(&ogo.url).ok());
-
-  let title = og_title.or(page_title);
-  let description = og_description.or(page_description);
-  let image = og_image;
+    .first()
+    // join also works if the target URL is absolute
+    .and_then(|ogo| url.join(&ogo.url).ok());
+  let og_embed_url = page
+    .opengraph
+    .videos
+    .first()
+    // join also works if the target URL is absolute
+    .and_then(|v| url.join(&v.url).ok());
  
    Ok(SiteMetadata {
-    title,
-    description,
-    image,
-    html: None,
+    title: og_title.or(page_title),
+    description: og_description.or(page_description),
+    image: og_image.map(Into::into),
+    embed_video_url: og_embed_url.map(Into::into),
    })
  }
  
@@ -107,36 +109,78 @@ pub(crate) struct PictrsFile {
    delete_token: String,
  }
  
+#[derive(Deserialize, Debug, Clone)]
+pub(crate) struct PictrsPurgeResponse {
+  msg: String,
+}
+
  #[tracing::instrument(skip_all)]
  pub(crate) async fn fetch_pictrs(
    client: &ClientWithMiddleware,
    settings: &Settings,
    image_url: &Url,
  ) -> Result<PictrsResponse, LemmyError> {
-  if let Some(pictrs_url) = settings.pictrs_url.to_owned() {
-    is_image_content_type(client, image_url).await?;
+  let pictrs_config = settings.pictrs_config()?;
+  is_image_content_type(client, image_url).await?;
  
-    let fetch_url = format!(
-      "{}/image/download?url={}",
-      pictrs_url,
-      utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed
-    );
+  let fetch_url = format!(
+    "{}image/download?url={}",
+    pictrs_config.url,
+    utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed
+  );
  
-    let response = client
-      .get(&fetch_url)
-      .timeout(REQWEST_TIMEOUT)
-      .send()
-      .await?;
+  let response = client
+    .get(&fetch_url)
+    .timeout(REQWEST_TIMEOUT)
+    .send()
+    .await?;
  
-    let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
+  let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
  
-    if response.msg == "ok" {
-      Ok(response)
-    } else {
-      Err(LemmyError::from_message(&response.msg))
-    }
+  if response.msg == "ok" {
+    Ok(response)
    } else {
-    Err(LemmyError::from_message("pictrs_url not set up in config"))
+    Err(LemmyErrorType::PictrsResponseError(response.msg))?
+  }
+}
+
+/// Purges an image from pictrs
+/// Note: This should often be coerced from a Result to .ok() in order to fail softly, because:
+/// - It might fail due to image being not local
+/// - It might not be an image
+/// - Pictrs might not be set up
+pub async fn purge_image_from_pictrs(
+  client: &ClientWithMiddleware,
+  settings: &Settings,
+  image_url: &Url,
+) -> Result<(), LemmyError> {
+  let pictrs_config = settings.pictrs_config()?;
+  is_image_content_type(client, image_url).await?;
+
+  let alias = image_url
+    .path_segments()
+    .ok_or(LemmyErrorType::ImageUrlMissingPathSegments)?
+    .next_back()
+    .ok_or(LemmyErrorType::ImageUrlMissingLastPathSegment)?;
+
+  let purge_url = format!("{}/internal/purge?alias={}", pictrs_config.url, alias);
+
+  let pictrs_api_key = pictrs_config
+    .api_key
+    .ok_or(LemmyErrorType::PictrsApiKeyNotProvided)?;
+  let response = client
+    .post(&purge_url)
+    .timeout(REQWEST_TIMEOUT)
+    .header("x-api-token", pictrs_api_key)
+    .send()
+    .await?;
+
+  let response: PictrsPurgeResponse = response.json().await.map_err(LemmyError::from)?;
+
+  if response.msg == "ok" {
+    Ok(())
+  } else {
+    Err(LemmyErrorType::PictrsPurgeResponseError(response.msg))?
    }
  }
  
@@ -147,13 +191,20 @@ pub async fn fetch_site_data(
    client: &ClientWithMiddleware,
    settings: &Settings,
    url: Option<&Url>,
-) -> (Option<SiteMetadata>, Option<Url>) {
+  include_image: bool,
+) -> (Option<SiteMetadata>, Option<DbUrl>) {
    match &url {
      Some(url) => {
        // Fetch metadata
        // Ignore errors, since it may be an image, or not have the data.
        // Warning, this may ignore SSL errors
        let metadata_option = fetch_site_metadata(client, url).await.ok();
+      if !include_image {
+        return (metadata_option, None);
+      }
+
+      let missing_pictrs_file =
+        |r: PictrsResponse| r.files.first().expect("missing pictrs file").file.clone();
  
        // Fetch pictrs thumbnail
        let pictrs_hash = match &metadata_option {
@@ -162,16 +213,16 @@ pub async fn fetch_site_data(
            // Try to generate a small thumbnail if there's a full sized one from post-links
            Some(metadata_image) => fetch_pictrs(client, settings, metadata_image)
              .await
-            .map(|r| r.files[0].file.to_owned()),
+            .map(missing_pictrs_file),
            // Metadata, but no image
            None => fetch_pictrs(client, settings, url)
              .await
-            .map(|r| r.files[0].file.to_owned()),
+            .map(missing_pictrs_file),
          },
          // No metadata, try to fetch the URL as an image
          None => fetch_pictrs(client, settings, url)
            .await
-          .map(|r| r.files[0].file.to_owned()),
+          .map(missing_pictrs_file),
        };
  
        // The full urls are necessary for federation
@@ -187,7 +238,7 @@ pub async fn fetch_site_data(
          .ok()
          .flatten();
  
-      (metadata_option, pictrs_thumbnail)
+      (metadata_option, pictrs_thumbnail.map(Into::into))
      }
      None => (None, None),
    }
@@ -195,21 +246,17 @@ pub async fn fetch_site_data(
  
  #[tracing::instrument(skip_all)]
  async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
-  let response = client
-    .get(url.as_str())
-    .timeout(REQWEST_TIMEOUT)
-    .send()
-    .await?;
+  let response = client.get(url.as_str()).send().await?;
    if response
      .headers()
      .get("Content-Type")
-    .ok_or_else(|| LemmyError::from_message("No Content-Type header"))?
+    .ok_or(LemmyErrorType::NoContentTypeHeader)?
      .to_str()?
      .starts_with("image/")
    {
      Ok(())
    } else {
-    Err(LemmyError::from_message("Not an image type."))
+    Err(LemmyErrorType::NotAnImageType)?
    }
  }
  
@@ -223,16 +270,24 @@ pub fn build_user_agent(settings: &Settings) -> String {
  
  #[cfg(test)]
  mod tests {
-  use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata};
-  use lemmy_utils::settings::structs::Settings;
+  #![allow(clippy::unwrap_used)]
+  #![allow(clippy::indexing_slicing)]
+
+  use crate::request::{
+    build_user_agent,
+    fetch_site_metadata,
+    html_to_site_metadata,
+    SiteMetadata,
+  };
+  use lemmy_utils::settings::SETTINGS;
    use url::Url;
  
    // These helped with testing
-  #[actix_rt::test]
+  #[tokio::test]
    async fn test_site_metadata() {
-    let settings = Settings::init().unwrap();
+    let settings = &SETTINGS.clone();
      let client = reqwest::Client::builder()
-      .user_agent(build_user_agent(&settings))
+      .user_agent(build_user_agent(settings))
        .build()
        .unwrap()
        .into();
@@ -247,21 +302,12 @@ mod tests {
          image: Some(
            Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
              .unwrap()
+            .into()
          ),
-        html: None,
+        embed_video_url: None,
        },
        sample_res
      );
-
-    let youtube_url = Url::parse("https://www.youtube.com/watch?v=IquO_TcMZIQ").unwrap();
-    let youtube_res = fetch_site_metadata(&client, &youtube_url).await.unwrap();
-    assert_eq!(
-      SiteMetadata {
-        title: Some("A Hard Look at Rent and Rent Seeking with Michael Hudson & Pepe Escobar".to_string()),
-        description: Some("An interactive discussion on wealth inequality and the “Great Game” on the control of natural resources.In this webinar organized jointly by the Henry George...".to_string()),
-        image: Some(Url::parse("https://i.ytimg.com/vi/IquO_TcMZIQ/maxresdefault.jpg").unwrap()),
-        html: None,
-      }, youtube_res);
    }
  
    // #[test]
@@ -271,4 +317,46 @@ mod tests {
    //   let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
    //   assert!(res_other.is_err());
    // }
+
+  #[test]
+  fn test_resolve_image_url() {
+    // url that lists the opengraph fields
+    let url = Url::parse("https://example.com/one/two.html").unwrap();
+
+    // root relative url
+    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='/image.jpg'></head><body></body></html>";
+    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    assert_eq!(
+      metadata.image,
+      Some(Url::parse("https://example.com/image.jpg").unwrap().into())
+    );
+
+    // base relative url
+    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='image.jpg'></head><body></body></html>";
+    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    assert_eq!(
+      metadata.image,
+      Some(
+        Url::parse("https://example.com/one/image.jpg")
+          .unwrap()
+          .into()
+      )
+    );
+
+    // absolute url
+    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='https://cdn.host.com/image.jpg'></head><body></body></html>";
+    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    assert_eq!(
+      metadata.image,
+      Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into())
+    );
+
+    // protocol relative url
+    let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='//example.com/image.jpg'></head><body></body></html>";
+    let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
+    assert_eq!(
+      metadata.image,
+      Some(Url::parse("https://example.com/image.jpg").unwrap().into())
+    );
+  }
  }