// coyote/htmlhelper.cpp (490 lines, 21 KiB, C++)

#include "models.h"
#include "settings.h"
#include "timeutils.h"
#include "curlu_wrapper.h"
#include "font_awesome.h"
#include "blankie/escape.h"
#include "htmlhelper.h"
// Forward declarations of the file-local DOM helpers defined further down.
// Recursively rewrites links/blockquotes and replaces emoji shortcodes under `element`.
static inline void preprocess_html(const httplib::Request& req, const std::string& domain_name, const std::vector<Emoji>& emojis, lxb_dom_element_t* element);
// Strips `target`, optionally proxies the href, and optionally replaces the link text with the href.
static inline void preprocess_link(const httplib::Request& req, const std::string& domain_name, lxb_dom_element_t* element);
// Prefixes every text node below `element` with "> ".
static inline void preprocess_blockquote(lxb_dom_element_t* element);
// Decides whether the link's text should be replaced by its href.
static inline bool should_fix_link(lxb_dom_element_t* element, const std::string& element_cls);
// Appends a plain-text rendering of `node` and its descendants to `out`.
static inline void get_text_content(lxb_dom_node_t* node, std::string& out, size_t blockquote_depth);
// Replaces a text node with text/<img> nodes; returns the last node inserted.
static inline lxb_dom_node_t* emojify(lxb_dom_node_t* child, const std::vector<Emoji>& emojis);
// Splits `str` into text nodes and custom-emoji <img> elements owned by `document`.
static inline std::vector<lxb_dom_node*> emojify(lxb_dom_document_t* document, std::string str, const std::vector<Emoji>& emojis);
// Status line shown above a post (boosted / pinned / replied), rendered as
// "<icon> <info>" inside a <p> by the static serialize_post overload.
struct PostStatus {
// Icon markup; wrapped in blankie::html::HTMLString when rendered
// (callers in this file pass the fa_* constants).
const char* icon_html;
// Node placed after the icon, e.g. "<name> boosted" or "Pinned post".
Node info_node;
};
// Forward declarations of the file-local serializers defined further down.
// Renders one post; `post_status` adds a status line, `reblogged` is the boosting post (if any).
static Element serialize_post(const httplib::Request& req, const std::string& server, const Post& post, bool main_post, const std::optional<PostStatus>& post_status, const Post* reblogged = nullptr);
// Renders a single media attachment according to its type.
static inline Element serialize_media(const Media& media);
// Renders a poll: one row per option followed by a vote/expiry summary.
static inline Element serialize_poll(const httplib::Request& req, const Poll& poll);
// Renders `post` as an Element, picking the status line shown above it.
//
// Precedence: boost > pinned > reply to the author's own post > no status.
// For a boost, the boosted post is rendered and the boosting post is passed
// along as `reblogged`.
Element serialize_post(const httplib::Request& req, const std::string& server, const Post& post, bool pinned, bool main_post) {
    using namespace std::string_literals;

    if (post.reblog) {
        const PostStatus boost_status{
            fa_retweet,
            preprocess_html(req, post.account.emojis, post.account.display_name + " boosted"),
        };
        return serialize_post(req, server, *post.reblog, main_post, boost_status, &post);
    }

    if (pinned) {
        const PostStatus pinned_status{
            fa_thumbtack,
            blankie::html::HTMLString("Pinned post"),
        };
        return serialize_post(req, server, post, main_post, pinned_status);
    }

    // Only replies whose target account is the post's own author get a status line.
    const bool is_self_reply = post.in_reply_to_id
        && post.in_reply_to_account_id
        && post.account.id == *post.in_reply_to_account_id;
    if (is_self_reply) {
        const PostStatus reply_status{
            fa_reply,
            preprocess_html(req, post.account.emojis, "Replied to "s + post.account.display_name),
        };
        return serialize_post(req, server, post, main_post, reply_status);
    }

    return serialize_post(req, server, post, main_post, std::nullopt);
}
// Returns the plain-text rendering of `child` and its descendants with all
// leading and trailing '\n' characters removed.
std::string get_text_content(lxb_dom_node_t* child) {
    std::string text;
    get_text_content(child, text, 0);

    // Nothing but newlines (or nothing at all): the result is empty.
    const size_t last_kept = text.find_last_not_of('\n');
    if (last_kept == std::string::npos) {
        text.clear();
        return text;
    }

    // Drop the trailing newlines first, then the leading ones.
    text.erase(last_kept + 1);
    text.erase(0, text.find_first_not_of('\n'));
    return text;
}
// Parses `str` as an HTML document and returns the text content of its body.
std::string get_text_content(blankie::html::HTMLString str) {
    LXB::HTML::Document parsed(str.str);
    lxb_dom_node_t* body = parsed.body();
    return get_text_content(body);
}
// Parses `str`, preprocesses the resulting body element in place (links,
// blockquotes, custom emojis) and returns the re-serialized document.
blankie::html::HTMLString preprocess_html(const httplib::Request& req, const std::string& domain_name, const std::vector<Emoji>& emojis, const blankie::html::HTMLString& str) {
    LXB::HTML::Document parsed(str.str);
    lxb_dom_element_t* body = parsed.body_element();
    preprocess_html(req, domain_name, emojis, body);
    return blankie::html::HTMLString(parsed.serialize());
}
// Plain-text variant: escapes `str`, then preprocesses it as HTML with an
// empty domain name.
blankie::html::HTMLString preprocess_html(const httplib::Request& req, const std::vector<Emoji>& emojis, const std::string& str) {
    const blankie::html::HTMLString escaped = blankie::html::HTMLString(blankie::html::escape(str));
    return preprocess_html(req, "", emojis, escaped);
}
// Recursively preprocesses `element`: <a> elements go through
// preprocess_link, <blockquote> elements through preprocess_blockquote, and
// every direct text child is run through emojify. Child elements are visited
// recursively.
static inline void preprocess_html(const httplib::Request& req, const std::string& domain_name, const std::vector<Emoji>& emojis, lxb_dom_element_t* element) {
    const char* tag = reinterpret_cast<const char*>(lxb_dom_element_tag_name(element, nullptr));
    const bool is_anchor = strncmp(tag, "A", 2) == 0;
    if (is_anchor) {
        // Preprocess links
        preprocess_link(req, domain_name, element);
    } else if (strncmp(tag, "BLOCKQUOTE", 11) == 0) {
        // Prepend "> " to <blockquote>
        preprocess_blockquote(element);
    }

    // Walk through the element's children
    for (lxb_dom_node_t* child = lxb_dom_node_first_child(lxb_dom_interface_node(element)); child; child = lxb_dom_node_next(child)) {
        switch (child->type) {
            case LXB_DOM_NODE_TYPE_ELEMENT:
                preprocess_html(req, domain_name, emojis, lxb_dom_interface_element(child));
                break;
            case LXB_DOM_NODE_TYPE_TEXT:
                // emojify() replaces the text node; continue after the last node it inserted.
                child = emojify(child, emojis);
                break;
            default:
                break;
        }
    }
}
// examples of mention and hashtag: https://anarres.family/@ashten@social.platypus-sandbox.com/111532064387673301
static std::regex proxy_class_re("\\b(mention|hashtag)\\b");
// Rewrites a single <a> element in place:
//  - removes its `target` attribute,
//  - proxies the href when the class contains mention/hashtag or the URL
//    starts with https://<domain_name>,
//  - replaces the link text with the href when should_fix_link() says so.
// Throws LXB::Exception on lexbor failures and rethrows any CurlUrlException
// other than CURLUE_MALFORMED_INPUT.
static inline void preprocess_link(const httplib::Request& req, const std::string& domain_name, lxb_dom_element_t* element) {
    const auto as_lxb = [](const char* s) {
        return reinterpret_cast<const lxb_char_t*>(s);
    };

    // Remove target=...
    lxb_status_t status = lxb_dom_element_remove_attribute(element, as_lxb("target"), 6);
    if (status != LXB_STATUS_OK) {
        throw LXB::Exception(status);
    }

    // Links without an href are left alone.
    size_t href_len;
    const lxb_char_t* href_raw = lxb_dom_element_get_attribute(element, as_lxb("href"), 4, &href_len);
    if (!href_raw) {
        return;
    }
    std::string href(reinterpret_cast<const char*>(href_raw), href_len);

    size_t cls_len;
    const lxb_char_t* cls_raw = lxb_dom_element_class(element, &cls_len);
    std::string cls;
    if (cls_raw) {
        cls.assign(reinterpret_cast<const char*>(cls_raw), cls_len);
    }

    try {
        // Seed the URL with origin + request path before applying the href
        // (presumably so that relative hrefs resolve against the current page).
        CurlUrl href_url;
        href_url.set(CURLUPART_URL, get_origin(req));
        href_url.set(CURLUPART_PATH, std::string(href_url.get(CURLUPART_PATH).get()) + req.path);
        href_url.set(CURLUPART_URL, href);

        // Instance base is used for link fields
        CurlUrl instance_url_base;
        instance_url_base.set(CURLUPART_SCHEME, "https");
        instance_url_base.set(CURLUPART_HOST, domain_name);

        const bool has_proxy_class = std::regex_search(cls, proxy_class_re);
        if (has_proxy_class || starts_with(href_url, instance_url_base)) {
            // Proxy this instance's URLs to Coyote
            href = proxy_mastodon_url(req, std::move(href));
            lxb_dom_element_set_attribute(element, as_lxb("href"), 4, as_lxb(href.data()), href.size());
        }
    } catch (const CurlUrlException& e) {
        // example: <a href=""></a> on eldritch.cafe/about
        const bool malformed = e.code == CURLUE_MALFORMED_INPUT;
        if (!malformed) {
            throw;
        }
    }

    if (!should_fix_link(element, cls)) {
        return;
    }
    // Set the content of each <a> to its href
    status = lxb_dom_node_text_content_set(lxb_dom_interface_node(element), as_lxb(href.data()), href.size());
    if (status != LXB_STATUS_OK) {
        throw LXB::Exception(status);
    }
}
// https://tech.lgbt/@mia@void.rehab/111500676785694526
// Prefixes every text node below `element` with "> ", recursing into child
// elements. Each text node is replaced by a newly created one.
static inline void preprocess_blockquote(lxb_dom_element_t* element) {
    for (lxb_dom_node_t* node = lxb_dom_node_first_child(lxb_dom_interface_node(element)); node; node = lxb_dom_node_next(node)) {
        if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
            preprocess_blockquote(lxb_dom_interface_element(node));
            continue;
        }
        if (node->type != LXB_DOM_NODE_TYPE_TEXT) {
            continue;
        }

        size_t text_len;
        const char* text = reinterpret_cast<const char*>(lxb_dom_node_text_content(node, &text_len));
        std::string quoted = "> ";
        quoted.append(text, text_len);

        // Swap the original text node for the quoted copy and continue from the copy.
        lxb_dom_node_t* replacement = lxb_dom_interface_node(lxb_dom_document_create_text_node(
            node->owner_document, reinterpret_cast<const lxb_char_t*>(quoted.data()), quoted.size()));
        lxb_dom_node_insert_after(node, replacement);
        lxb_dom_node_remove(node);
        node = replacement;
    }
}
static std::regex unhandled_link_re("\\bunhandled-link\\b");
// Returns true when the link text should be replaced by the href: either the
// link's class contains "unhandled-link", or its children are exactly three
// <span>s with classes "invisible", then "ellipsis" or "", then "invisible".
static inline bool should_fix_link(lxb_dom_element_t* element, const std::string& element_cls) {
    // https://vt.social/@LucydiaLuminous/111448085044245037
    if (std::regex_search(element_cls, unhandled_link_re)) {
        return true;
    }

    // True when `node` is a <span> element whose class attribute equals `wanted_cls`.
    const auto is_span_with_class = [](lxb_dom_node_t* node, const char* wanted_cls) -> bool {
        if (node == nullptr) {
            return false;
        }
        if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) {
            return false;
        }
        lxb_dom_element_t* candidate = lxb_dom_interface_element(node);
        const char* tag = reinterpret_cast<const char*>(lxb_dom_element_tag_name(candidate, nullptr));
        if (strncmp(tag, "SPAN", 5) != 0) {
            return false;
        }
        const lxb_char_t* actual_cls = lxb_dom_element_get_attribute(candidate, reinterpret_cast<const lxb_char_t*>("class"), 5, nullptr);
        if (!actual_cls) {
            return false;
        }
        return strcmp(reinterpret_cast<const char*>(actual_cls), wanted_cls) == 0;
    };

    lxb_dom_node_t* leading = lxb_dom_node_first_child(lxb_dom_interface_node(element));
    if (!is_span_with_class(leading, "invisible")) {
        return false;
    }

    lxb_dom_node_t* middle = lxb_dom_node_next(leading);
    const bool middle_ok = is_span_with_class(middle, "ellipsis") || is_span_with_class(middle, "");
    if (!middle_ok) {
        return false;
    }

    lxb_dom_node_t* trailing = lxb_dom_node_next(middle);
    if (!is_span_with_class(trailing, "invisible")) {
        return false;
    }

    // Nothing may follow the third span.
    return lxb_dom_node_next(trailing) == nullptr;
}
// Appends a plain-text rendering of `node` and its descendants to `out`.
// Text nodes are prefixed with "> " once per enclosing <blockquote>
// (`blockquote_depth`). <p>, <br> and <blockquote> emit '\n' before their
// children; <p> and <blockquote> also emit '\n' after them.
static inline void get_text_content(lxb_dom_node_t* node, std::string& out, size_t blockquote_depth) {
    bool newline_before = false;
    bool newline_after = false;
    size_t child_depth = blockquote_depth;

    if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
        size_t text_len;
        const char* text = reinterpret_cast<const char*>(lxb_dom_node_text_content(node, &text_len));
        for (size_t level = 0; level < blockquote_depth; level++) {
            out.append("> ");
        }
        out.append(text, text_len);
    } else if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
        const char* tag = reinterpret_cast<const char*>(lxb_dom_element_tag_name(lxb_dom_interface_element(node), nullptr));
        const bool paragraph = strncmp(tag, "P", 2) == 0;
        const bool line_break = strncmp(tag, "BR", 3) == 0;
        const bool quote = strncmp(tag, "BLOCKQUOTE", 11) == 0;

        newline_before = paragraph || line_break || quote;
        newline_after = paragraph || quote;
        if (quote) {
            // Children of a blockquote are quoted one level deeper.
            child_depth++;
        }
    }

    if (newline_before) {
        out.push_back('\n');
    }
    for (lxb_dom_node_t* child = lxb_dom_node_first_child(node); child; child = lxb_dom_node_next(child)) {
        get_text_content(child, out, child_depth);
    }
    if (newline_after) {
        out.push_back('\n');
    }
}
// Replaces the text node `child` with the text/<img> nodes produced by
// emojify(document, text, emojis), inserted in order at the same position.
//
// Returns the last node now occupying the old position so that the caller can
// continue iterating with lxb_dom_node_next(). When the node has no text (or
// its text cannot be read) nothing is replaced and `child` itself is returned;
// previously this case indexed into an empty vector.
static inline lxb_dom_node_t* emojify(lxb_dom_node_t* child, const std::vector<Emoji>& emojis) {
    size_t len = 0;
    const char* text = reinterpret_cast<const char*>(lxb_dom_node_text_content(child, &len));
    if (!text || len == 0) {
        // Nothing to emojify; keep the node as-is.
        return child;
    }

    std::vector<lxb_dom_node_t*> nodes = emojify(child->owner_document, std::string(text, len), emojis);
    if (nodes.empty()) {
        return child;
    }

    // Chain the replacements after the original node, then drop the original.
    lxb_dom_node_t* last = child;
    for (lxb_dom_node_t* node : nodes) {
        lxb_dom_node_insert_after(last, node);
        last = node;
    }
    lxb_dom_node_destroy(child);
    return last;
}
static std::regex shortcode_re(":([a-zA-Z0-9_]+):");
// Splits `str` into DOM nodes owned by `document`: each `:shortcode:` that
// names an entry of `emojis` becomes an <img class="custom_emoji"> (alt/title
// set to the `:shortcode:` text, src to the emoji URL); everything else is
// kept as text nodes. Unknown shortcodes stay in the text unchanged.
//
// A text node is emitted before every emoji, even when it is empty. The
// result is empty only when `str` is empty.
//
// The string is scanned once with a regex iterator instead of repeatedly
// copying the remaining suffix, and the shortcode is extracted once per match
// rather than once per emoji compared.
static inline std::vector<lxb_dom_node_t*> emojify(lxb_dom_document_t* document, std::string str, const std::vector<Emoji>& emojis) {
    const auto as_lxb = [](const char* s) {
        return reinterpret_cast<const lxb_char_t*>(s);
    };
    const auto make_text_node = [&](const std::string& text) {
        return lxb_dom_interface_node(lxb_dom_document_create_text_node(document, as_lxb(text.data()), text.size()));
    };

    std::vector<lxb_dom_node_t*> res;
    std::string buf; // text accumulated since the last emitted node
    std::string::const_iterator cursor = str.cbegin(); // end of the previous match

    const std::sregex_iterator no_more_matches;
    for (std::sregex_iterator it(str.cbegin(), str.cend(), shortcode_re); it != no_more_matches; ++it) {
        const std::smatch& match = *it;
        buf.append(cursor, match[0].first);
        cursor = match[0].second;

        const std::string shortcode = match.str(1);
        const auto emoji = std::find_if(emojis.begin(), emojis.end(), [&](const Emoji& candidate) {
            return candidate.shortcode == shortcode;
        });
        if (emoji == emojis.end()) {
            // Not a known emoji: keep the ":shortcode:" text verbatim.
            buf.append(match[0].first, match[0].second);
            continue;
        }

        const std::string label = match.str(0);
        res.push_back(make_text_node(buf));
        buf.clear();

        lxb_dom_element_t* img = lxb_dom_document_create_element(document, as_lxb("IMG"), 3, nullptr);
        lxb_dom_element_set_attribute(img, as_lxb("class"), 5, as_lxb("custom_emoji"), 12);
        lxb_dom_element_set_attribute(img, as_lxb("alt"), 3, as_lxb(label.data()), label.size());
        lxb_dom_element_set_attribute(img, as_lxb("title"), 5, as_lxb(label.data()), label.size());
        lxb_dom_element_set_attribute(img, as_lxb("src"), 3, as_lxb(emoji->url.data()), emoji->url.size());
        res.push_back(lxb_dom_interface_node(img));
    }

    // Whatever follows the last match is plain text.
    buf.append(cursor, str.cend());
    if (!buf.empty()) {
        res.push_back(make_text_node(buf));
    }
    return res;
}
// Renders a single post as a <div class="post">: optional status line,
// header (author link or "Unknown user", plus a timestamp link), and the
// contents (preprocessed HTML, attachments, optional poll), wrapped in
// <details> when the post is marked sensitive.
//
// `main_post` switches the div to class "post main_post" with id "m".
// `reblogged`, when non-null, is the boosting post; it is used for the
// permalink if the rendered post carries no account id.
static Element serialize_post(const httplib::Request& req, const std::string& server, const Post& post, bool main_post, const std::optional<PostStatus>& post_status, const Post* reblogged) {
    using namespace std::string_literals;

    const bool has_account_id = !post.account.id.empty();
    const bool has_account_ref = !post.account.username.empty() && !post.account.server.empty();
    const bool was_edited = post.edited_at >= 0;

    // `reblogged == nullptr` since a malicious server could take down the frontend
    // by sending a post that is not a reblog with no account information
    const Post& permalink_post = (has_account_id || reblogged == nullptr) ? post : *reblogged;
    std::string post_url = get_origin(req) + '/' + server + "/@" + permalink_post.account.acct(false) + '/' + permalink_post.id + "#m";

    std::string time_title = was_edited
        ? "Created: "s + full_time(post.created_at) + "\nEdited: " + full_time(post.edited_at)
        : full_time(post.created_at);
    const char* time_badge = was_edited ? " (edited)" : "";

    blankie::html::HTMLString body_html = preprocess_html(req, server, post.emojis, post.content);
    // Workaround for https://vt.social/@a1ba@suya.place/110552480243348878#m
    // Content without any <p>: turn newlines into <br> and wrap everything in one paragraph.
    if (body_html.str.find("<p>") == std::string::npos) {
        for (size_t at = body_html.str.find('\n'); at != std::string::npos; at = body_html.str.find('\n', at + 4)) {
            body_html.str.replace(at, 1, "<br>");
        }
        body_html.str.reserve(body_html.str.size() + 3 + 4);
        body_html.str.insert(0, "<p>");
        body_html.str.append("</p>");
    }

    Element contents("div", {{"class", "post-contents"}}, {std::move(body_html)});

    Element attachments("div", {{"class", "post-attachments"}}, {});
    attachments.nodes.reserve(post.media_attachments.size());
    for (const Media& attachment : post.media_attachments) {
        attachments.nodes.push_back(serialize_media(attachment));
    }
    contents.nodes.push_back(std::move(attachments));

    if (post.poll) {
        contents.nodes.push_back(serialize_poll(req, *post.poll));
    }

    if (post.sensitive) {
        // Hide the contents behind the content warning (or a generic label).
        const std::string summary_text = post.spoiler_text.empty() ? "See more" : post.spoiler_text;
        contents = Element("details", {
            Element("summary", {preprocess_html(req, post.emojis, summary_text)}),
            std::move(contents),
        });
        if (UserSettings(req).auto_open_cw) {
            contents.attributes.push_back({"open", ""});
        }
    }

    Element author = has_account_ref
        ? Element("a", {{"href", get_origin(req) + '/' + server + "/@" + post.account.acct(false)}}, {
            !post.account.avatar_static.empty()
                ? Element("img", {{"class", "post-avatar"}, {"alt", "User profile picture"}, {"loading", "lazy"}, {"src", post.account.avatar_static}}, {})
                : Node(""),
            Element("span", {
                Element("b", {preprocess_html(req, post.account.emojis, post.account.display_name)}),
                Element("br"), "@", post.account.acct(),
            }),
        })
        : Element("b", {"Unknown user"});

    Element time_link("a", {{"class", "post-time_header"}, {"href", std::move(post_url)}, {"title", time_title}}, {
        Element("time", {{"datetime", to_rfc3339(post.created_at)}}, {relative_time(post.created_at, current_time()), time_badge}),
    });

    Element header("div", {{"class", "post-header"}}, {std::move(author), std::move(time_link)});

    Element div("div", {{"class", "post"}}, {std::move(header), contents});

    if (post_status) {
        // The status line goes before the header.
        Element status_line("p", {
            blankie::html::HTMLString(post_status->icon_html), " ", post_status->info_node,
        });
        div.nodes.insert(div.nodes.begin(), std::move(status_line));
    }
    if (main_post) {
        div.attributes = {{"class", "post main_post"}, {"id", "m"}};
    }
    return div;
}
// Renders one media attachment according to `media.type`:
// image -> linked <img>, video/gifv -> <video>, audio -> <audio>,
// unknown -> a notice (linking to the remote URL when there is one),
// anything else -> an "Unsupported media type" paragraph.
// When a description exists it is added to the outermost element as both
// `alt` and `title`.
static inline Element serialize_media(const Media& media) {
    // Adds the preview image, when present, as the player's poster.
    const auto with_poster = [&media](Element player) -> Element {
        if (media.preview_url) {
            player.attributes.push_back({"poster", *media.preview_url});
        }
        return player;
    };

    const auto build = [&]() -> Element {
        if (media.type == "image") {
            return Element("a", {{"href", media.url}}, {
                Element("img", {{"loading", "lazy"}, {"src", media.preview_url.value_or(media.url)}}, {}),
            });
        }
        if (media.type == "video") {
            return with_poster(Element("video", {{"controls", ""}, {"src", media.url}}, {}));
        }
        if (media.type == "audio") {
            return Element("audio", {{"controls", ""}, {"src", media.url}}, {});
        }
        if (media.type == "gifv") {
            // https://hachyderm.io/@Impossible_PhD/111444541628207638
            return with_poster(Element("video", {{"controls", ""}, {"loop", ""}, {"muted", ""}, {"autoplay", ""}, {"src", media.url}}, {}));
        }
        if (media.type == "unknown") {
            if (!media.remote_url) {
                return Element("p", {{"class", "unknown_media"}}, {"Media is not available from this instance"});
            }
            // https://botsin.space/@lina@vt.social/111053598696451525
            return Element("a", {{"class", "unknown_media"}, {"href", *media.remote_url}}, {"Media is not available from this instance, view externally"});
        }
        return Element("p", {"Unsupported media type: ", media.type});
    };

    Element rendered = build();
    if (media.description) {
        rendered.attributes.push_back({"alt", *media.description});
        rendered.attributes.push_back({"title", *media.description});
    }
    return rendered;
}
// Renders a poll as a <div>: one "poll-option" row per option (percentage,
// title, bar), followed by a <p> with the voter/vote total and, when
// applicable, the expiry information.
// Percentages are relative to the voter count, falling back to the total
// vote count when the voter count is negative.
static inline Element serialize_poll(const httplib::Request& req, const Poll& poll) {
    using namespace std::string_literals;

    const bool voters_known = poll.voters_count >= 0;
    uint64_t voters_count = voters_known ? static_cast<uint64_t>(poll.voters_count) : poll.votes_count;

    // Picks the singular or plural word for `count`.
    const auto pick_form = [](uint64_t count, const char* singular, const char* plural) {
        return count == 1 ? singular : plural;
    };

    Element div("div");
    div.nodes.reserve(poll.options.size() + 1);
    for (const PollOption& option : poll.options) {
        // Avoid dividing by zero when nobody has voted.
        std::string percentage = "0%";
        if (voters_count) {
            percentage = std::to_string(option.votes_count * 100 / voters_count) + '%';
        }
        std::string row_title = std::to_string(option.votes_count) + pick_form(option.votes_count, " vote", " votes");

        div.nodes.push_back(Element("div", {{"class", "poll-option"}, {"title", row_title}}, {
            Element("b", {{"class", "poll-percentage"}}, {percentage}), " ", preprocess_html(req, poll.emojis, option.title),
            Element("object", {{"class", "poll-bar"}, {"width", percentage}}, {}),
        }));
    }

    std::vector<Node> totals = voters_known
        ? std::vector<Node>({std::to_string(voters_count), " ", pick_form(voters_count, "voter", "voters")})
        : std::vector<Node>({std::to_string(poll.votes_count), " ", pick_form(poll.votes_count, "vote", "votes")});
    Element p("p", std::move(totals));

    const bool has_expiry_info = poll.expired || poll.expires_at >= 0;
    if (has_expiry_info) {
        p.nodes.push_back(" / ");
    }
    if (poll.expired) {
        p.nodes.push_back(Element("time", {{"datetime", to_rfc3339(poll.expires_at)}, {"title", "Expired on "s + full_time(poll.expires_at)}}, {"Expired"}));
    } else if (poll.expires_at >= 0) {
        p.nodes.push_back(Element("time", {{"datetime", to_rfc3339(poll.expires_at)}, {"title", full_time(poll.expires_at)}}, {
            "Expires in ", relative_time(current_time(), poll.expires_at),
        }));
    }

    div.nodes.push_back(std::move(p));
    return div;
}