crates/api_common/src/request.rs

   1 use crate::post::SiteMetadata;
   2 use encoding::{all::encodings, DecoderTrap};
   3 use lemmy_utils::{error::LemmyError, settings::structs::Settings, version::VERSION};
   4 use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
   5 use reqwest_middleware::ClientWithMiddleware;
   6 use serde::Deserialize;
   7 use tracing::info;
   8 use url::Url;
   9 use webpage::HTML;
  10
  11 /// Fetches the post link html tags (like title, description, image, etc)
  12 #[tracing::instrument(skip_all)]
  13 pub async fn fetch_site_metadata(
  14   client: &ClientWithMiddleware,
  15   url: &Url,
  16 ) -> Result<SiteMetadata, LemmyError> {
  17   info!("Fetching site metadata for url: {}", url);
  18   let response = client.get(url.as_str()).send().await?;
  19
  20   // Can't use .text() here, because it only checks the content header, not the actual bytes
  21   // https://github.com/LemmyNet/lemmy/issues/1964
  22   let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
  23
  24   let tags = html_to_site_metadata(&html_bytes)?;
  25
  26   Ok(tags)
  27 }
  28
  29 fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> {
  30   let html = String::from_utf8_lossy(html_bytes);
  31
  32   // Make sure the first line is doctype html
  33   let first_line = html
  34     .trim_start()
  35     .lines()
  36     .into_iter()
  37     .next()
  38     .ok_or_else(|| LemmyError::from_message("No lines in html"))?
  39     .to_lowercase();
  40
  41   if !first_line.starts_with("<!doctype html>") {
  42     return Err(LemmyError::from_message(
  43       "Site metadata page fetch is not DOCTYPE html",
  44     ));
  45   }
  46
  47   let mut page = HTML::from_string(html.to_string(), None)?;
  48
  49   // If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the
  50   // proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8
  51   // version.
  52   if let Some(charset) = page.meta.get("charset") {
  53     if charset.to_lowercase() != "utf-8" {
  54       if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) {
  55         if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) {
  56           page = HTML::from_string(html_with_encoding, None)?;
  57         }
  58       }
  59     }
  60   }
  61
  62   let page_title = page.title;
  63   let page_description = page.description;
  64
  65   let og_description = page
  66     .opengraph
  67     .properties
  68     .get("description")
  69     .map(|t| t.to_string());
  70   let og_title = page
  71     .opengraph
  72     .properties
  73     .get("title")
  74     .map(|t| t.to_string());
  75   let og_image = page
  76     .opengraph
  77     .images
  78     .get(0)
  79     .and_then(|ogo| Url::parse(&ogo.url).ok());
  80
  81   let title = og_title.or(page_title);
  82   let description = og_description.or(page_description);
  83   let image = og_image;
  84
  85   Ok(SiteMetadata {
  86     title,
  87     description,
  88     image,
  89     html: None,
  90   })
  91 }
  92
/// JSON response body of the pict-rs `image/download` endpoint, as deserialized by
/// `fetch_pictrs`.
#[derive(Deserialize, Debug, Clone)]
pub(crate) struct PictrsResponse {
  /// Files reported by pict-rs; `fetch_site_data` uses the first entry to build the thumbnail
  /// URL.
  files: Vec<PictrsFile>,
  /// Status message. `fetch_pictrs` treats the literal `"ok"` as success and uses any other
  /// value as the error message.
  msg: String,
}
  98
/// A single file entry inside a `PictrsResponse`.
#[derive(Deserialize, Debug, Clone)]
pub(crate) struct PictrsFile {
  /// File name of the stored image; appended to `/pictrs/image/` by `fetch_site_data`.
  file: String,
  // Deserialized but never read in this file, hence the lint exemption.
  // NOTE(review): presumably the token pict-rs expects when deleting the file — confirm.
  #[allow(dead_code)]
  delete_token: String,
}
 105
 106 #[tracing::instrument(skip_all)]
 107 pub(crate) async fn fetch_pictrs(
 108   client: &ClientWithMiddleware,
 109   settings: &Settings,
 110   image_url: &Url,
 111 ) -> Result<PictrsResponse, LemmyError> {
 112   if let Some(pictrs_url) = settings.pictrs_url.to_owned() {
 113     is_image_content_type(client, image_url).await?;
 114
 115     let fetch_url = format!(
 116       "{}/image/download?url={}",
 117       pictrs_url,
 118       utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed
 119     );
 120
 121     let response = client.get(&fetch_url).send().await?;
 122
 123     let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
 124
 125     if response.msg == "ok" {
 126       Ok(response)
 127     } else {
 128       Err(LemmyError::from_message(&response.msg))
 129     }
 130   } else {
 131     Err(LemmyError::from_message("pictrs_url not set up in config"))
 132   }
 133 }
 134
 135 /// Both are options, since the URL might be either an html page, or an image
 136 /// Returns the SiteMetadata, and a Pictrs URL, if there is a picture associated
 137 #[tracing::instrument(skip_all)]
 138 pub async fn fetch_site_data(
 139   client: &ClientWithMiddleware,
 140   settings: &Settings,
 141   url: Option<&Url>,
 142 ) -> (Option<SiteMetadata>, Option<Url>) {
 143   match &url {
 144     Some(url) => {
 145       // Fetch metadata
 146       // Ignore errors, since it may be an image, or not have the data.
 147       // Warning, this may ignore SSL errors
 148       let metadata_option = fetch_site_metadata(client, url).await.ok();
 149
 150       // Fetch pictrs thumbnail
 151       let pictrs_hash = match &metadata_option {
 152         Some(metadata_res) => match &metadata_res.image {
 153           // Metadata, with image
 154           // Try to generate a small thumbnail if there's a full sized one from post-links
 155           Some(metadata_image) => fetch_pictrs(client, settings, metadata_image)
 156             .await
 157             .map(|r| r.files[0].file.to_owned()),
 158           // Metadata, but no image
 159           None => fetch_pictrs(client, settings, url)
 160             .await
 161             .map(|r| r.files[0].file.to_owned()),
 162         },
 163         // No metadata, try to fetch the URL as an image
 164         None => fetch_pictrs(client, settings, url)
 165           .await
 166           .map(|r| r.files[0].file.to_owned()),
 167       };
 168
 169       // The full urls are necessary for federation
 170       let pictrs_thumbnail = pictrs_hash
 171         .map(|p| {
 172           Url::parse(&format!(
 173             "{}/pictrs/image/{}",
 174             settings.get_protocol_and_hostname(),
 175             p
 176           ))
 177           .ok()
 178         })
 179         .ok()
 180         .flatten();
 181
 182       (metadata_option, pictrs_thumbnail)
 183     }
 184     None => (None, None),
 185   }
 186 }
 187
 188 #[tracing::instrument(skip_all)]
 189 async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
 190   let response = client.get(url.as_str()).send().await?;
 191   if response
 192     .headers()
 193     .get("Content-Type")
 194     .ok_or_else(|| LemmyError::from_message("No Content-Type header"))?
 195     .to_str()?
 196     .starts_with("image/")
 197   {
 198     Ok(())
 199   } else {
 200     Err(LemmyError::from_message("Not an image type."))
 201   }
 202 }
 203
 204 pub fn build_user_agent(settings: &Settings) -> String {
 205   format!(
 206     "Lemmy/{}; +{}",
 207     VERSION,
 208     settings.get_protocol_and_hostname()
 209   )
 210 }
 211
#[cfg(test)]
mod tests {
  use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata};
  use lemmy_utils::settings::structs::Settings;
  use url::Url;

  // These helped with testing
  //
  // Note: this test performs a real HTTP request against gitlab.com and compares the result with
  // hard-coded values, so it needs network access and breaks whenever the remote page changes.
  // It also needs `Settings::init()` to succeed in the test environment.
  #[actix_rt::test]
  async fn test_site_metadata() {
    let settings = Settings::init().unwrap();
    // Build a client that identifies itself with the same user agent format as the application.
    let client = reqwest::Client::builder()
      .user_agent(build_user_agent(&settings))
      .build()
      .unwrap()
      .into();
    let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap();
    let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
    assert_eq!(
      SiteMetadata {
        title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
        description: Some(
          "The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()
        ),
        image: Some(
          Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
            .unwrap()
        ),
        html: None,
      },
      sample_res
    );
  }

  // Disabled test. NOTE(review): `fetch_pictshare` is not defined anywhere in this file, so this
  // looks like a leftover from an earlier image backend — confirm before re-enabling or removing.
  // #[test]
  // fn test_pictshare() {
  //   let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg");
  //   assert!(res.is_ok());
  //   let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
  //   assert!(res_other.is_err());
  // }
}