crates/api_common/src/request.rs

   1 use crate::post::SiteMetadata;
   2 use encoding::{all::encodings, DecoderTrap};
   3 use lemmy_db_schema::newtypes::DbUrl;
   4 use lemmy_utils::{
   5   error::LemmyError,
   6   settings::structs::Settings,
   7   version::VERSION,
   8   REQWEST_TIMEOUT,
   9 };
  10 use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
  11 use reqwest_middleware::ClientWithMiddleware;
  12 use serde::Deserialize;
  13 use tracing::info;
  14 use url::Url;
  15 use webpage::HTML;
  16
  17 /// Fetches the post link html tags (like title, description, image, etc)
  18 #[tracing::instrument(skip_all)]
  19 pub async fn fetch_site_metadata(
  20   client: &ClientWithMiddleware,
  21   url: &Url,
  22 ) -> Result<SiteMetadata, LemmyError> {
  23   info!("Fetching site metadata for url: {}", url);
  24   let response = client.get(url.as_str()).send().await?;
  25
  26   // Can't use .text() here, because it only checks the content header, not the actual bytes
  27   // https://github.com/LemmyNet/lemmy/issues/1964
  28   let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
  29
  30   let tags = html_to_site_metadata(&html_bytes, url)?;
  31
  32   Ok(tags)
  33 }
  34
  35 fn html_to_site_metadata(html_bytes: &[u8], url: &Url) -> Result<SiteMetadata, LemmyError> {
  36   let html = String::from_utf8_lossy(html_bytes);
  37
  38   // Make sure the first line is doctype html
  39   let first_line = html
  40     .trim_start()
  41     .lines()
  42     .next()
  43     .ok_or_else(|| LemmyError::from_message("No lines in html"))?
  44     .to_lowercase();
  45
  46   if !first_line.starts_with("<!doctype html>") {
  47     return Err(LemmyError::from_message(
  48       "Site metadata page fetch is not DOCTYPE html",
  49     ));
  50   }
  51
  52   let mut page = HTML::from_string(html.to_string(), None)?;
  53
  54   // If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the
  55   // proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8
  56   // version.
  57   if let Some(charset) = page.meta.get("charset") {
  58     if charset.to_lowercase() != "utf-8" {
  59       if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) {
  60         if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) {
  61           page = HTML::from_string(html_with_encoding, None)?;
  62         }
  63       }
  64     }
  65   }
  66
  67   let page_title = page.title;
  68   let page_description = page.description;
  69
  70   let og_description = page
  71     .opengraph
  72     .properties
  73     .get("description")
  74     .map(std::string::ToString::to_string);
  75   let og_title = page
  76     .opengraph
  77     .properties
  78     .get("title")
  79     .map(std::string::ToString::to_string);
  80   let og_image = page
  81     .opengraph
  82     .images
  83     .first()
  84     // join also works if the target URL is absolute
  85     .and_then(|ogo| url.join(&ogo.url).ok());
  86   let og_embed_url = page
  87     .opengraph
  88     .videos
  89     .first()
  90     // join also works if the target URL is absolute
  91     .and_then(|v| url.join(&v.url).ok());
  92
  93   Ok(SiteMetadata {
  94     title: og_title.or(page_title),
  95     description: og_description.or(page_description),
  96     image: og_image.map(Into::into),
  97     embed_video_url: og_embed_url.map(Into::into),
  98   })
  99 }
 100
/// JSON body that `fetch_pictrs` deserializes from the pict-rs `image/download` response.
#[derive(Deserialize, Debug, Clone)]
pub(crate) struct PictrsResponse {
  /// Files reported by pict-rs; `fetch_site_data` uses the first entry to build the thumbnail
  /// URL.
  files: Vec<PictrsFile>,
  /// Status message; `fetch_pictrs` treats any value other than `"ok"` as an error.
  msg: String,
}
 106
/// A single entry of the `files` list in a [`PictrsResponse`].
#[derive(Deserialize, Debug, Clone)]
pub(crate) struct PictrsFile {
  /// Identifier of the stored file; appended to `/pictrs/image/` by `fetch_site_data`.
  file: String,
  // Deserialized from the response but never read in this file, hence the lint exemption.
  #[allow(dead_code)]
  delete_token: String,
}
 113
/// JSON body that `purge_image_from_pictrs` deserializes from the pict-rs purge response.
#[derive(Deserialize, Debug, Clone)]
pub(crate) struct PictrsPurgeResponse {
  /// Status message; `purge_image_from_pictrs` treats any value other than `"ok"` as an error.
  msg: String,
}
 118
 119 #[tracing::instrument(skip_all)]
 120 pub(crate) async fn fetch_pictrs(
 121   client: &ClientWithMiddleware,
 122   settings: &Settings,
 123   image_url: &Url,
 124 ) -> Result<PictrsResponse, LemmyError> {
 125   let pictrs_config = settings.pictrs_config()?;
 126   is_image_content_type(client, image_url).await?;
 127
 128   let fetch_url = format!(
 129     "{}image/download?url={}",
 130     pictrs_config.url,
 131     utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed
 132   );
 133
 134   let response = client
 135     .get(&fetch_url)
 136     .timeout(REQWEST_TIMEOUT)
 137     .send()
 138     .await?;
 139
 140   let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
 141
 142   if response.msg == "ok" {
 143     Ok(response)
 144   } else {
 145     Err(LemmyError::from_message(&response.msg))
 146   }
 147 }
 148
 149 /// Purges an image from pictrs
 150 /// Note: This should often be coerced from a Result to .ok() in order to fail softly, because:
 151 /// - It might fail due to image being not local
 152 /// - It might not be an image
 153 /// - Pictrs might not be set up
 154 pub async fn purge_image_from_pictrs(
 155   client: &ClientWithMiddleware,
 156   settings: &Settings,
 157   image_url: &Url,
 158 ) -> Result<(), LemmyError> {
 159   let pictrs_config = settings.pictrs_config()?;
 160   is_image_content_type(client, image_url).await?;
 161
 162   let alias = image_url
 163     .path_segments()
 164     .ok_or_else(|| LemmyError::from_message("Image URL missing path segments"))?
 165     .next_back()
 166     .ok_or_else(|| LemmyError::from_message("Image URL missing last path segment"))?;
 167
 168   let purge_url = format!("{}/internal/purge?alias={}", pictrs_config.url, alias);
 169
 170   let pictrs_api_key = pictrs_config
 171     .api_key
 172     .ok_or_else(|| LemmyError::from_message("pictrs_api_key_not_provided"))?;
 173   let response = client
 174     .post(&purge_url)
 175     .timeout(REQWEST_TIMEOUT)
 176     .header("x-api-token", pictrs_api_key)
 177     .send()
 178     .await?;
 179
 180   let response: PictrsPurgeResponse = response.json().await.map_err(LemmyError::from)?;
 181
 182   if response.msg == "ok" {
 183     Ok(())
 184   } else {
 185     Err(LemmyError::from_message(&response.msg))
 186   }
 187 }
 188
 189 /// Both are options, since the URL might be either an html page, or an image
 190 /// Returns the SiteMetadata, and a Pictrs URL, if there is a picture associated
 191 #[tracing::instrument(skip_all)]
 192 pub async fn fetch_site_data(
 193   client: &ClientWithMiddleware,
 194   settings: &Settings,
 195   url: Option<&Url>,
 196   include_image: bool,
 197 ) -> (Option<SiteMetadata>, Option<DbUrl>) {
 198   match &url {
 199     Some(url) => {
 200       // Fetch metadata
 201       // Ignore errors, since it may be an image, or not have the data.
 202       // Warning, this may ignore SSL errors
 203       let metadata_option = fetch_site_metadata(client, url).await.ok();
 204       if !include_image {
 205         return (metadata_option, None);
 206       }
 207
 208       let missing_pictrs_file =
 209         |r: PictrsResponse| r.files.first().expect("missing pictrs file").file.clone();
 210
 211       // Fetch pictrs thumbnail
 212       let pictrs_hash = match &metadata_option {
 213         Some(metadata_res) => match &metadata_res.image {
 214           // Metadata, with image
 215           // Try to generate a small thumbnail if there's a full sized one from post-links
 216           Some(metadata_image) => fetch_pictrs(client, settings, metadata_image)
 217             .await
 218             .map(missing_pictrs_file),
 219           // Metadata, but no image
 220           None => fetch_pictrs(client, settings, url)
 221             .await
 222             .map(missing_pictrs_file),
 223         },
 224         // No metadata, try to fetch the URL as an image
 225         None => fetch_pictrs(client, settings, url)
 226           .await
 227           .map(missing_pictrs_file),
 228       };
 229
 230       // The full urls are necessary for federation
 231       let pictrs_thumbnail = pictrs_hash
 232         .map(|p| {
 233           Url::parse(&format!(
 234             "{}/pictrs/image/{}",
 235             settings.get_protocol_and_hostname(),
 236             p
 237           ))
 238           .ok()
 239         })
 240         .ok()
 241         .flatten();
 242
 243       (metadata_option, pictrs_thumbnail.map(Into::into))
 244     }
 245     None => (None, None),
 246   }
 247 }
 248
 249 #[tracing::instrument(skip_all)]
 250 async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
 251   let response = client.get(url.as_str()).send().await?;
 252   if response
 253     .headers()
 254     .get("Content-Type")
 255     .ok_or_else(|| LemmyError::from_message("No Content-Type header"))?
 256     .to_str()?
 257     .starts_with("image/")
 258   {
 259     Ok(())
 260   } else {
 261     Err(LemmyError::from_message("Not an image type."))
 262   }
 263 }
 264
 265 pub fn build_user_agent(settings: &Settings) -> String {
 266   format!(
 267     "Lemmy/{}; +{}",
 268     VERSION,
 269     settings.get_protocol_and_hostname()
 270   )
 271 }
 272
 273 #[cfg(test)]
 274 mod tests {
 275   use crate::request::{
 276     build_user_agent,
 277     fetch_site_metadata,
 278     html_to_site_metadata,
 279     SiteMetadata,
 280   };
 281   use lemmy_utils::settings::SETTINGS;
 282   use url::Url;
 283
 284   // These helped with testing
 285   #[tokio::test]
 286   async fn test_site_metadata() {
 287     let settings = &SETTINGS.clone();
 288     let client = reqwest::Client::builder()
 289       .user_agent(build_user_agent(settings))
 290       .build()
 291       .unwrap()
 292       .into();
 293     let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap();
 294     let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
 295     assert_eq!(
 296       SiteMetadata {
 297         title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
 298         description: Some(
 299           "The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()
 300         ),
 301         image: Some(
 302           Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
 303             .unwrap()
 304             .into()
 305         ),
 306         embed_video_url: None,
 307       },
 308       sample_res
 309     );
 310   }
 311
 312   // #[test]
 313   // fn test_pictshare() {
 314   //   let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg");
 315   //   assert!(res.is_ok());
 316   //   let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
 317   //   assert!(res_other.is_err());
 318   // }
 319
 320   #[test]
 321   fn test_resolve_image_url() {
 322     // url that lists the opengraph fields
 323     let url = Url::parse("https://example.com/one/two.html").unwrap();
 324
 325     // root relative url
 326     let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='/image.jpg'></head><body></body></html>";
 327     let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
 328     assert_eq!(
 329       metadata.image,
 330       Some(Url::parse("https://example.com/image.jpg").unwrap().into())
 331     );
 332
 333     // base relative url
 334     let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='image.jpg'></head><body></body></html>";
 335     let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
 336     assert_eq!(
 337       metadata.image,
 338       Some(
 339         Url::parse("https://example.com/one/image.jpg")
 340           .unwrap()
 341           .into()
 342       )
 343     );
 344
 345     // absolute url
 346     let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='https://cdn.host.com/image.jpg'></head><body></body></html>";
 347     let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
 348     assert_eq!(
 349       metadata.image,
 350       Some(Url::parse("https://cdn.host.com/image.jpg").unwrap().into())
 351     );
 352
 353     // protocol relative url
 354     let html_bytes = b"<!DOCTYPE html><html><head><meta property='og:image' content='//example.com/image.jpg'></head><body></body></html>";
 355     let metadata = html_to_site_metadata(html_bytes, &url).expect("Unable to parse metadata");
 356     assert_eq!(
 357       metadata.image,
 358       Some(Url::parse("https://example.com/image.jpg").unwrap().into())
 359     );
 360   }
 361 }