1 use crate::post::SiteMetadata;
2 use encoding::{all::encodings, DecoderTrap};
3 use lemmy_utils::{error::LemmyError, settings::structs::Settings, version::VERSION};
4 use percent_encoding::{utf8_percent_encode, NON_ALPHANUMERIC};
5 use reqwest_middleware::ClientWithMiddleware;
6 use serde::Deserialize;
11 /// Fetches the post link html tags (like title, description, image, etc)
///
/// Sends a GET request for `url` through `client`, reads the complete response body as raw
/// bytes, and passes those bytes to `html_to_site_metadata` for parsing.
///
/// # Errors
///
/// Propagates (via `?`) a failure to send the request, a failure to read the response body
/// (converted with `LemmyError::from`), or any error returned by `html_to_site_metadata`.
12 #[tracing::instrument(skip_all)]
13 pub async fn fetch_site_metadata(
14 client: &ClientWithMiddleware,
16 ) -> Result<SiteMetadata, LemmyError> {
17 info!("Fetching site metadata for url: {}", url);
18 let response = client.get(url.as_str()).send().await?;
20 // Can't use .text() here, because it only checks the content header, not the actual bytes
21 // https://github.com/LemmyNet/lemmy/issues/1964
22 let html_bytes = response.bytes().await.map_err(LemmyError::from)?.to_vec();
// Parsing (including any charset handling) is delegated to `html_to_site_metadata`, which
// receives the undecoded bytes.
24 let tags = html_to_site_metadata(&html_bytes)?;
/// Parses raw HTML bytes into a `SiteMetadata`.
///
/// The bytes are first decoded lossily as UTF-8 and parsed with `HTML::from_string`. If the
/// parsed page declares a `charset` other than UTF-8 and a matching encoding is found, the
/// original bytes are decoded again with that encoding and the page is re-parsed.
///
/// # Errors
///
/// Returns a `LemmyError` when the document has no lines ("No lines in html"), when the first
/// line does not start with `<!doctype html>`, or when `HTML::from_string` fails.
29 fn html_to_site_metadata(html_bytes: &[u8]) -> Result<SiteMetadata, LemmyError> {
// Lossy decoding never fails; the raw `html_bytes` are kept for a possible re-decode below.
30 let html = String::from_utf8_lossy(html_bytes);
32 // Make sure the first line is doctype html
38 .ok_or_else(|| LemmyError::from_message("No lines in html"))?
// NOTE(review): the comparison is against a lowercase literal, so `first_line` presumably has
// to be lower-cased before this point — confirm.
41 if !first_line.starts_with("<!doctype html>") {
42 return Err(LemmyError::from_message(
43 "Site metadata page fetch is not DOCTYPE html",
// First parse, based on the lossy UTF-8 text.
47 let mut page = HTML::from_string(html.to_string(), None)?;
49 // If the web page specifies that it isn't actually UTF-8, re-decode the received bytes with the
50 // proper encoding. If the specified encoding cannot be found, fall back to the original UTF-8
52 if let Some(charset) = page.meta.get("charset") {
53 if charset.to_lowercase() != "utf-8" {
// NOTE(review): the lookup compares `e.name()` with `charset` exactly as written in the page,
// while the check above lower-cases it; a differently-cased charset may not be found here.
54 if let Some(encoding_ref) = encodings().iter().find(|e| e.name() == charset) {
// A failed decode is ignored, leaving the UTF-8 based `page` in place.
55 if let Ok(html_with_encoding) = encoding_ref.decode(html_bytes, DecoderTrap::Replace) {
56 page = HTML::from_string(html_with_encoding, None)?;
// Values taken from the page itself; they serve as fallbacks for the `og_*` values below.
62 let page_title = page.title;
63 let page_description = page.description;
65 let og_description = page
69 .map(|t| t.to_string());
74 .map(|t| t.to_string());
// An image URL that fails `Url::parse` is dropped rather than reported as an error.
79 .and_then(|ogo| Url::parse(&ogo.url).ok());
// The `og_*` values win; the page's own title/description are used only when they are `None`.
81 let title = og_title.or(page_title);
82 let description = og_description.or(page_description);
/// JSON payload that `fetch_pictrs` deserializes from the pictrs response body.
93 #[derive(Deserialize, Debug, Clone)]
94 pub(crate) struct PictrsResponse {
/// Files listed in the response; `fetch_site_data` reads the first entry.
95 files: Vec<PictrsFile>,
/// A single element of `PictrsResponse::files`.
99 #[derive(Deserialize, Debug, Clone)]
100 pub(crate) struct PictrsFile {
/// Token string delivered with the file entry.
/// NOTE(review): presumably needed to delete the stored image later — confirm against pictrs.
103 delete_token: String,
/// Asks the configured pictrs instance to download `image_url` and returns its parsed response.
///
/// Only runs when `settings.pictrs_url` is set. The target is first checked with
/// `is_image_content_type`; then `<pictrs_url>/image/download?url=<percent-encoded image_url>`
/// is requested and the JSON body is deserialized into a `PictrsResponse`.
///
/// # Errors
///
/// Returns a `LemmyError` when `pictrs_url` is not configured, when the content-type check
/// fails, when the request or JSON decoding fails, or when the response `msg` is not `"ok"`
/// (in which case `msg` becomes the error message).
106 #[tracing::instrument(skip_all)]
107 pub(crate) async fn fetch_pictrs(
108 client: &ClientWithMiddleware,
111 ) -> Result<PictrsResponse, LemmyError> {
112 if let Some(pictrs_url) = settings.pictrs_url.to_owned() {
// Bail out early unless the remote resource reports an `image/*` content type.
113 is_image_content_type(client, image_url).await?;
115 let fetch_url = format!(
116 "{}/image/download?url={}",
118 utf8_percent_encode(image_url.as_str(), NON_ALPHANUMERIC) // TODO this might not be needed
121 let response = client.get(&fetch_url).send().await?;
123 let response: PictrsResponse = response.json().await.map_err(LemmyError::from)?;
// pictrs signals success through the `msg` field of the JSON body.
125 if response.msg == "ok" {
128 Err(LemmyError::from_message(&response.msg))
// NOTE(review): presumably the `else` branch of the `pictrs_url` check above — confirm.
131 Err(LemmyError::from_message("pictrs_url not set up in config"))
135 /// Both are options, since the URL might be either an html page, or an image
136 /// Returns the SiteMetadata, and a Pictrs URL, if there is a picture associated
///
/// Metadata is fetched with `fetch_site_metadata`, and a failure there is turned into `None`
/// instead of being reported. A thumbnail is then requested through `fetch_pictrs`, using the
/// metadata image when one exists and `url` itself otherwise.
137 #[tracing::instrument(skip_all)]
138 pub async fn fetch_site_data(
139 client: &ClientWithMiddleware,
142 ) -> (Option<SiteMetadata>, Option<Url>) {
146 // Ignore errors, since it may be an image, or not have the data.
147 // Warning, this may ignore SSL errors
148 let metadata_option = fetch_site_metadata(client, url).await.ok();
150 // Fetch pictrs thumbnail
// NOTE(review): every arm below indexes `r.files[0]`, which panics if pictrs returns an empty
// `files` list — confirm that cannot happen, or switch to a checked access.
151 let pictrs_hash = match &metadata_option {
152 Some(metadata_res) => match &metadata_res.image {
153 // Metadata, with image
154 // Try to generate a small thumbnail if there's a full sized one from post-links
155 Some(metadata_image) => fetch_pictrs(client, settings, metadata_image)
157 .map(|r| r.files[0].file.to_owned()),
158 // Metadata, but no image
159 None => fetch_pictrs(client, settings, url)
161 .map(|r| r.files[0].file.to_owned()),
163 // No metadata, try to fetch the URL as an image
164 None => fetch_pictrs(client, settings, url)
166 .map(|r| r.files[0].file.to_owned()),
169 // The full urls are necessary for federation
// The thumbnail URL has the form `<protocol and hostname>/pictrs/image/<file>`.
170 let pictrs_thumbnail = pictrs_hash
173 "{}/pictrs/image/{}",
174 settings.get_protocol_and_hostname(),
182 (metadata_option, pictrs_thumbnail)
// NOTE(review): presumably the arm taken when no URL was supplied — confirm.
184 None => (None, None),
/// Checks that the resource at `url` is served with an `image/*` content type.
///
/// Note that this issues a GET request through `client`, not a HEAD request.
///
/// # Errors
///
/// Returns a `LemmyError` when the request fails, when no Content-Type header is available
/// ("No Content-Type header"), or when the value does not start with `image/`
/// ("Not an image type.").
188 #[tracing::instrument(skip_all)]
189 async fn is_image_content_type(client: &ClientWithMiddleware, url: &Url) -> Result<(), LemmyError> {
190 let response = client.get(url.as_str()).send().await?;
194 .ok_or_else(|| LemmyError::from_message("No Content-Type header"))?
// Prefix match, so any `image/...` subtype is accepted.
196 .starts_with("image/")
200 Err(LemmyError::from_message("Not an image type."))
/// Builds the string used as the HTTP `User-Agent` (the test in this file passes it to
/// `reqwest::Client::builder().user_agent(..)`).
///
/// The result incorporates `settings.get_protocol_and_hostname()`.
/// NOTE(review): presumably also includes the Lemmy `VERSION` — confirm against the format string.
204 pub fn build_user_agent(settings: &Settings) -> String {
208 settings.get_protocol_and_hostname()
214 use crate::request::{build_user_agent, fetch_site_metadata, SiteMetadata};
215 use lemmy_utils::settings::structs::Settings;
218 // These helped with testing
// Live-network test: fetches metadata for a public GitLab wiki page using a client configured
// with `build_user_agent`. Both `Settings::init()` and the fetch are unwrapped, so a missing
// config or an unreachable/changed page makes the test panic.
// NOTE(review): the literals below are presumably the expected `SiteMetadata` fields that
// `sample_res` is compared against — confirm.
220 async fn test_site_metadata() {
221 let settings = Settings::init().unwrap();
222 let client = reqwest::Client::builder()
223 .user_agent(build_user_agent(&settings))
227 let sample_url = Url::parse("https://gitlab.com/IzzyOnDroid/repo/-/wikis/FAQ").unwrap();
228 let sample_res = fetch_site_metadata(&client, &sample_url).await.unwrap();
231 title: Some("FAQ · Wiki · IzzyOnDroid / repo · GitLab".to_string()),
233 "The F-Droid compatible repo at https://apt.izzysoft.de/fdroid/".to_string()
236 Url::parse("https://gitlab.com/uploads/-/system/project/avatar/4877469/iod_logo.png")
246 // fn test_pictshare() {
247 // let res = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpg");
248 // assert!(res.is_ok());
249 // let res_other = fetch_pictshare("https://upload.wikimedia.org/wikipedia/en/2/27/The_Mandalorian_logo.jpgaoeu");
250 // assert!(res_other.is_err());