diff --git a/.gitmodules b/.gitmodules index 6a59a3d..e2e295c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "thirdparty/httplib"] path = thirdparty/httplib url = https://github.com/yhirose/cpp-httplib.git +[submodule "thirdparty/lexbor"] + path = thirdparty/lexbor + url = https://github.com/lexbor/lexbor.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 42b78c5..52ab7e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,8 @@ find_package(nlohmann_json REQUIRED) find_package(CURL REQUIRED) set(HTTPLIB_REQUIRE_OPENSSL ON) add_subdirectory(thirdparty/httplib) +set(LEXBOR_BUILD_SHARED OFF) +add_subdirectory(thirdparty/lexbor) #find_package(PkgConfig REQUIRED) #pkg_check_modules(HIREDIS REQUIRED hiredis) @@ -36,5 +38,5 @@ set_target_properties(${PROJECT_NAME} CXX_EXTENSIONS NO ) target_include_directories(${PROJECT_NAME} PRIVATE thirdparty ${HIREDIS_INCLUDE_DIRS}) -target_link_libraries(${PROJECT_NAME} PRIVATE nlohmann_json::nlohmann_json httplib::httplib CURL::libcurl ${HIREDIS_LINK_LIBRARIES}) +target_link_libraries(${PROJECT_NAME} PRIVATE nlohmann_json::nlohmann_json CURL::libcurl httplib::httplib lexbor_static ${HIREDIS_LINK_LIBRARIES}) target_compile_options(${PROJECT_NAME} PRIVATE ${FLAGS}) diff --git a/lxb_wrapper.h b/lxb_wrapper.h new file mode 100644 index 0000000..cd32123 --- /dev/null +++ b/lxb_wrapper.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#include +#pragma GCC diagnostic pop + +namespace LXB { + +class Exception : public std::exception { +public: + Exception(lxb_status_t status) { + using namespace std::string_literals; + this->_msg = "LXB Exception: "s + std::to_string(status); + } + + const char* what() const noexcept { + return this->_msg.c_str(); + } + +private: + std::string _msg; +}; + +namespace HTML { + +class Document { +public: + Document(const Document&&) = delete; + Document&& operator=(const 
Document&&) = delete; + + Document(const std::string& str) { + this->_document = lxb_html_document_create(); + if (!this->_document) { + throw std::bad_alloc(); + } + lxb_status_t status = lxb_html_document_parse(this->_document, reinterpret_cast(str.data()), str.size()); + if (status != LXB_STATUS_OK) { + lxb_html_document_destroy(this->_document); + throw Exception(status); + } + } + ~Document() { + lxb_html_document_destroy(this->_document); + } + + constexpr lxb_dom_node_t* body() const noexcept { + lxb_dom_node_t* node = lxb_dom_interface_node(this->_document); + lxb_dom_node_t* html = lxb_dom_node_first_child(node); + lxb_dom_node_t* body = lxb_dom_node_last_child(html); + return body; + } + constexpr lxb_dom_element_t* body_element() const noexcept { + lxb_dom_node_t* body = this->body(); + assert(body->type == LXB_DOM_NODE_TYPE_ELEMENT); + return lxb_dom_interface_element(body); + } + + std::string serialize() const { + std::string res; + lxb_dom_node_t* body = this->body(); + + lxb_dom_node_t* child = lxb_dom_node_first_child(body); + while (child) { + lxb_status_t status = lxb_html_serialize_tree_cb(child, Document::_serialize_cb, &res); + if (status != LXB_STATUS_OK) { + throw Exception(status); + } + + child = lxb_dom_node_next(child); + } + + return res; + } + +private: + static lxb_status_t _serialize_cb(const lxb_char_t* data, size_t len, void* ctx) { + std::string* str = reinterpret_cast(ctx); + str->append(reinterpret_cast(data), len); + return LXB_STATUS_OK; + } + + lxb_html_document_t* _document; +}; + +}; // namespace HTML + +}; // namespace LXB diff --git a/routes/user.cpp b/routes/user.cpp index a58a18e..4079f62 100644 --- a/routes/user.cpp +++ b/routes/user.cpp @@ -8,7 +8,7 @@ static const char* sorting_method_suffixes[3] = {"", "/with_replies", "/media"}; static inline PostSortingMethod get_sorting_method(const std::string& method); static inline Element user_header(const httplib::Request& req, const Account& account, PostSortingMethod 
sorting_method); -static inline Element user_link_field(const AccountField& field); +static inline Element user_link_field(const httplib::Request& req, const std::string& domain_name, const AccountField& field); static inline Element sorting_method_link(const httplib::Request& req, const Account& account, PostSortingMethod current_method, PostSortingMethod new_method); @@ -59,7 +59,7 @@ static inline Element user_header(const httplib::Request& req, const Account& ac Element user_links("table", {{"class", "user_page-user_links"}}, {}); user_links.nodes.reserve(account.fields.size()); for (const AccountField& i : account.fields) { - user_links.nodes.push_back(user_link_field(i)); + user_links.nodes.push_back(user_link_field(req, account.domain_name, i)); } Element header("header", { @@ -82,7 +82,7 @@ static inline Element user_header(const httplib::Request& req, const Account& ac }), Element("div", {{"class", "user_page-user_description"}}, { - Element("div", {{"class", "user_page-user_bio"}}, {account.note_html}), + Element("div", {{"class", "user_page-user_bio"}}, {preprocess_html(req, account.domain_name, account.note_html)}), std::move(user_links), }), @@ -96,12 +96,12 @@ static inline Element user_header(const httplib::Request& req, const Account& ac return header; } -static inline Element user_link_field(const AccountField& field) { +static inline Element user_link_field(const httplib::Request& req, const std::string& domain_name, const AccountField& field) { using namespace std::string_literals; Element tr("tr", { Element("th", {field.name}), - Element("td", {field.value_html}), + Element("td", {preprocess_html(req, domain_name, field.value_html)}), }); if (field.verified_at >= 0) { struct tm verified_at; diff --git a/servehelper.cpp b/servehelper.cpp index 0d9d02a..e8e1203 100644 --- a/servehelper.cpp +++ b/servehelper.cpp @@ -6,8 +6,13 @@ #include "config.h" #include "servehelper.h" +#include "lxb_wrapper.h" #include "routes/routes.h" +static inline void 
preprocess_html(const httplib::Request& req, const std::string& domain_name, lxb_dom_element_t* element); +static inline void preprocess_link(const httplib::Request& req, const std::string& domain_name, lxb_dom_element_t* element); +static inline bool should_fix_link(lxb_dom_element_t* element); + class CurlUrlException : public std::exception { public: CurlUrlException(CURLUcode code_) : code(code_) {} @@ -119,18 +124,18 @@ std::string proxy_mastodon_url(const httplib::Request& req, const std::string& u throw CurlUrlException(code); } - auto get_part = [&](CURLUPart part) { - char* content; + auto get_part = [&](CURLUPart part, CURLUcode ignore = CURLUE_OK) { + char* content = nullptr; CURLUcode code = curl_url_get(url.get(), part, &content, 0); - if (code) { + if (code && code != ignore) { throw CurlUrlException(code); } return CurlStr(content, curl_free); }; CurlStr host = get_part(CURLUPART_HOST); CurlStr path = get_part(CURLUPART_PATH); - CurlStr query = get_part(CURLUPART_QUERY); - CurlStr fragment = get_part(CURLUPART_FRAGMENT); + CurlStr query = get_part(CURLUPART_QUERY, CURLUE_NO_QUERY); + CurlStr fragment = get_part(CURLUPART_FRAGMENT, CURLUE_NO_FRAGMENT); std::string new_url = get_origin(req) + '/' + host.get() + path.get(); if (query) { @@ -153,3 +158,92 @@ bool should_send_304(const httplib::Request& req, uint64_t hash) { size_t pos = header.find(std::string(1, '"') + std::to_string(hash) + '"'); return pos != std::string::npos && (pos == 0 || header[pos - 1] != '/'); } + +blankie::html::HTMLString preprocess_html(const httplib::Request& req, const std::string& domain_name, const blankie::html::HTMLString& str) { + LXB::HTML::Document document(str.str); + preprocess_html(req, domain_name, document.body_element()); + return blankie::html::HTMLString(document.serialize()); +} + + + +static inline void preprocess_html(const httplib::Request& req, const std::string& domain_name, lxb_dom_element_t* element) { + const char* tag_name = 
reinterpret_cast(lxb_dom_element_tag_name(element, nullptr)); + + if (strncmp(tag_name, "A", 2) == 0) { + // Preprocess links + preprocess_link(req, domain_name, element); + } + + // Walk through the element's children + lxb_dom_node_t* child = lxb_dom_node_first_child(lxb_dom_interface_node(element)); + while (child) { + if (child->type == LXB_DOM_NODE_TYPE_ELEMENT) { + preprocess_html(req, domain_name, lxb_dom_interface_element(child)); + } + + child = lxb_dom_node_next(child); + } +} + +static inline void preprocess_link(const httplib::Request& req, const std::string& domain_name, lxb_dom_element_t* element) { + using namespace std::string_literals; + + size_t href_c_len; + const lxb_char_t* href_c = lxb_dom_element_get_attribute(element, reinterpret_cast("href"), 4, &href_c_len); + if (!href_c) { + return; + } + std::string href(reinterpret_cast(href_c), href_c_len); + + std::string instance_url_base = "https://"s + domain_name; + if (href.starts_with(instance_url_base + '/') || href == instance_url_base) { + // Proxy this instance's URLs to Coyote + href = proxy_mastodon_url(req, std::move(href)); + + lxb_dom_element_set_attribute(element, reinterpret_cast("href"), 4, reinterpret_cast(href.data()), href.size()); + } + + if (should_fix_link(element)) { + // Set the content of the link to its href + lxb_status_t status = lxb_dom_node_text_content_set(lxb_dom_interface_node(element), reinterpret_cast(href.data()), href.size()); + if (status != LXB_STATUS_OK) { + throw LXB::Exception(status); + } + } +} + +static inline bool should_fix_link(lxb_dom_element_t* element) { + auto expected_element = [](lxb_dom_node_t* node, const char* expected_cls) { + if (node->type != LXB_DOM_NODE_TYPE_ELEMENT) { + return false; + } + lxb_dom_element_t* span = lxb_dom_interface_element(node); + + const char* tag_name = reinterpret_cast(lxb_dom_element_tag_name(span, nullptr)); + if (strncmp(tag_name, "SPAN", 5) != 0) { + return false; + } + + const lxb_char_t* cls = 
lxb_dom_element_get_attribute(span, reinterpret_cast("class"), 5, nullptr); + return cls && strcmp(reinterpret_cast(cls), expected_cls) == 0; + }; + + lxb_dom_node_t* child = lxb_dom_node_first_child(lxb_dom_interface_node(element)); + if (!expected_element(child, "invisible")) { + return false; + } + + child = lxb_dom_node_next(child); + if (!expected_element(child, "ellipsis") && !expected_element(child, "")) { + return false; + } + + child = lxb_dom_node_next(child); + if (!expected_element(child, "invisible")) { + return false; + } + + child = lxb_dom_node_next(child); + return child == nullptr; +} diff --git a/servehelper.h b/servehelper.h index 9b57d6c..9bef23b 100644 --- a/servehelper.h +++ b/servehelper.h @@ -17,3 +17,5 @@ void serve_redirect(const httplib::Request& req, httplib::Response& res, std::st std::string get_origin(const httplib::Request& req); std::string proxy_mastodon_url(const httplib::Request& req, const std::string& url_str); bool should_send_304(const httplib::Request& req, uint64_t hash); + +blankie::html::HTMLString preprocess_html(const httplib::Request& req, const std::string& domain_name, const blankie::html::HTMLString& str); diff --git a/thirdparty/lexbor b/thirdparty/lexbor new file mode 160000 index 0000000..1416684 --- /dev/null +++ b/thirdparty/lexbor @@ -0,0 +1 @@ +Subproject commit 14166847cfa85d80c9041f5387014961f70f3831