crates/api_common/src/request.rs

   1 use crate::post::SiteMetadata;
   2 use encoding::{all::encodings, DecoderTrap};
   3 use lemmy_utils::{settings::structs::Settings, version::VERSION, LemmyError, REQWEST_TIMEOUT};
   4 use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
   5 use reqwest_middleware::ClientWithMiddleware;
   6 use serde::Deserialize;
   7 use tracing::info;
   8 use url::Url;
   9 use webpage::HTML;
  10
  11 /// Fetches the post link html tags (like title, description, image, etc)
  12 #[tracing::instrument(skip_all)]
  13 pub async fn fetch_site_metadata(
  14   client: &ClientWithMiddleware,
  15   url: &Url,
  16 ) -> Result<SiteMetadata, LemmyError> {
  17   info!("Fetching site metadata for url: {}", url);
  18   let response = client
  19     .get(url.as_str())
  20     .timeout(REQWEST_TIMEOUT)
  21     .send()
  22     .await?;
  23
  24   // Can't use .text() here, because it only checks the content header, not the actual bytes
  25   // https://github.com/LemmyNet/lemmy/issues/1964
  26   let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
  27
  28   let tags = html_to_site_metadata(&html_bytes)?;
  29
  30   Ok(tags)
  31 }
  32
  33 fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> {
  34   let html = String::from_utf8_lossy(html_bytes);
  35
  36   // Make sure the first line is doctype html
  37   let first_line = html
  38     .trim_start()
  39     .lines()
  40     .into_iter()
  41     .next()
  42     .ok_or_else(|| LemmyError::from_message("No lines in html"))?
  43     .to_lowercase();
  44
  45   if !first_line.starts_with("<!doctype html>") {
  46     return Err(LemmyError::from_message(
  47       "Site metadata page fetch is not DOCTYPE html",
  48     ));
  49   }
  50
  51   let mut page = HTML::from_string(html.to_string(), None)?;
  52
  53   // If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the
  54   // proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8
  55   // version.
  56   if let Some(charset) = page.meta.get("charset") {
  57     if charset.to_lowercase() != "utf-8" {
  58       if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) {
  59         if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) {
  60           page = HTML::from_string(html_with_encoding, None)?;
  61         }
  62       }
  63     }
  64   }
  65
  66   let page_title = page.title;
  67   let page_description = page.description;
  68
  69   let og_description = page
  70     .opengraph
  71     .properties
  72     .get("description")
  73     .map(|t| t.to_string());
  74   let og_title = page
  75     .opengraph
  76     .properties
  77     .get("title")
  78     .map(|t| t.to_string());
  79   let og_image = page
  80     .opengraph
  81     .images
  82     .get(0)
  83     .and_then(|ogo| Url::parse(&ogo.url).ok());
  84
  85   let title = og_title.or(page_title);
  86   let description = og_description.or(page_description);
  87   let image = og_image;
  88
  89   Ok(SiteMetadata {
  90     title,
  91     description,
  92     image,
  93     html: None,
  94   })
  95 }
  96
  97 #[derive(Deserialize, Debug, Clone)]
  98 pub(crate) struct PictrsResponse {
  99   files: Vec<PictrsFile>,
 100   msg: String,
 101 }
 102
 103 #[derive(Deserialize, Debug, Clone)]
 104 pub(crate) struct PictrsFile {
 105   file: String,
 106   #[allow(dead_code)]
 107   delete_token: String,
 108 }
 109
 110 #[tracing::instrument(skip_all)]
 111 pub(crate) async fn fetch_pictrs(
 112   client: &ClientWithMiddleware,
 113   settings: &Settings,
 114   image_url: &Url,
 115 ) -> Result<PictrsResponse, LemmyError> {
 116   if let Some(pictrs_url) = settings.pictrs_url.to_owned() {
 117     is_image_content_type(client, image_url).await?;
 118
 119     let fetch_url = format!(
 120       "{}/image/download?url={}",
 121       pictrs_url,
 122       utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed
 123     );
 124
 125     let response = client
 126       .get(&fetch_url)
 127       .timeout(REQWEST_TIMEOUT)
 128       .send()
 129       .await?;
 130
 131     let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
 132
 133     if response.msg == "ok" {
 134       Ok(response)
 135     } else {
 136       Err(LemmyError::from_message(&response.msg))
 137     }
 138   } else {
 139     Err(LemmyError::from_message("pictrs_url not set up in config"))
 140   }
 141 }
 142
 143 /// Both are options, since the URL might be either an html page, or an image
 144 /// Returns the SiteMetadata, and a Pictrs URL, if there is a picture associated
 145 #[tracing::instrument(skip_all)]
 146 pub async fn fetch_site_data(
 147   client: &ClientWithMiddleware,
 148   settings: &Settings,
 149   url: Option<&Url>,
 150 ) -> (Option<SiteMetadata>, Option<Url>) {
 151   match &url {
 152     Some(url) => {
 153       // Fetch metadata
 154       // Ignore errors, since it may be an image, or not have the data.
 155       // Warning, this may ignore SSL errors
 156       let metadata_option = fetch_site_metadata(client, url).await.ok();
 157
 158       // Fetch pictrs thumbnail
 159       let pictrs_hash = match &metadata_option {
 160         Some(metadata_res) => match &metadata_res.image {
 161           // Metadata, with image
 162           // Try to generate a small thumbnail if there's a full sized one from post-links
 163           Some(metadata_image) => fetch_pictrs(client, settings, metadata_image)
 164             .await
 165             .map(|r| r.files[0].file.to_owned()),
 166           // Metadata, but no image
 167           None => fetch_pictrs(client, settings, url)
 168             .await
 169             .map(|r| r.files[0].file.to_owned()),
 170         },
 171         // No metadata, try to fetch the URL as an image
 172         None => fetch_pictrs(client, settings, url)
 173           .await
 174           .map(|r| r.files[0].file.to_owned()),
 175       };
 176
 177       // The full urls are necessary for federation
 178       let pictrs_thumbnail = pictrs_hash
 179         .map(|p| {
 180           Url::parse(&format!(
 181             "{}/pictrs/image/{}",
 182             settings.get_protocol_and_hostname(),
 183             p
 184           ))
 185           .ok()
 186         })
 187         .ok()
 188         .flatten();
 189
 190       (metadata_option, pictrs_thumbnail)
 191     }
 192     None => (None, None),
 193   }
 194 }
 195
 196 #[tracing::instrument(skip_all)]
 197 async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
 198   let response = client
 199     .get(url.as_str())
 200     .timeout(REQWEST_TIMEOUT)
 201     .send()
 202     .await?;
 203   if response
 204     .headers()
 205     .get("Content-Type")
 206     .ok_or_else(|| LemmyError::from_message("No Content-Type header"))?
 207     .to_str()?
 208     .starts_with("image/")
 209   {
 210     Ok(())
 211   } else {
 212     Err(LemmyError::from_message("Not an image type."))
 213   }
 214 }
 215
 216 pub fn build_user_agent(settings: &Settings) -> String {
 217   format!(
 218     "Lemmy/{}; +{}",
 219     VERSION,
 220     settings.get_protocol_and_hostname()
 221   )
 222 }
 223
 224 #[cfg(test)]
 225 mod tests {
 226   use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata};
 227   use lemmy_utils::settings::structs::Settings;
 228   use url::Url;
 229
 230   // These helped with testing
 231   #[actix_rt::test]
 232   async fn test_site_metadata() {
 233     let settings = Settings::init().unwrap();
 234     let client = reqwest::Client::builder()
 235       .user_agent(build_user_agent(&settings))
 236       .build()
 237       .unwrap()
 238       .into();
 239     let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap();
 240     let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
 241     assert_eq!(
 242       SiteMetadata {
 243         title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
 244         description: Some(
 245           "The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()
 246         ),
 247         image: Some(
 248           Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
 249             .unwrap()
 250         ),
 251         html: None,
 252       },
 253       sample_res
 254     );
 255   }
 256
 257   // #[test]
 258   // fn test_pictshare() {
 259   //   let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg");
 260   //   assert!(res.is_ok());
 261   //   let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
 262   //   assert!(res_other.is_err());
 263   // }
 264 }