Fix HTML in the description not being parsed
https://pixwhile.zangetsu.kaizoku.cyou/artworks/87524102
This commit is contained in:
		
							parent
							
								
									c8a079d85e
								
							
						
					
					
						commit
						50a52e38d4
					
				|  | @ -27,7 +27,7 @@ add_link_options(${FLAGS}) | |||
| 
 | ||||
| add_executable(${PROJECT_NAME} main.cpp misc.cpp config.cpp servehelper.cpp numberhelper.cpp pixivclient.cpp pixivmodels.cpp hiredis_wrapper.cpp | ||||
|     routes/home.cpp routes/css.cpp routes/artworks.cpp routes/tags.cpp routes/guess_extension.cpp routes/users/common.cpp routes/users/illustrations.cpp | ||||
|     blankie/serializer.cpp blankie/escape.cpp blankie/murl.cpp) | ||||
|     blankie/serializer.cpp blankie/escape.cpp blankie/unescape.cpp blankie/murl.cpp) | ||||
| set_target_properties(${PROJECT_NAME} | ||||
|     PROPERTIES | ||||
|         CXX_STANDARD 20 | ||||
|  |  | |||
|  | @ -26,11 +26,6 @@ | |||
| // Looser than RFC 3986, but fragments might as well own everything
 | ||||
| #define FRAGMENT "([^ ]*)" | ||||
| 
 | ||||
| #define FULL_HTTP_HTTPS_REGEX \ | ||||
|     "(?:(https?)?:)?//" AUTHORITY \ | ||||
|     PATH_ABEMPTY \ | ||||
|     "(?:\\?" QUERY ")?" \ | ||||
|     "(?:#" FRAGMENT ")?" | ||||
| #define HTTP_HTTPS_REGEX \ | ||||
|     "(?:(https?)?:)?(?://" AUTHORITY ")?" \ | ||||
|     PATH_ABEMPTY \ | ||||
|  | @ -45,8 +40,7 @@ static std::string tolower(std::string str); | |||
| namespace blankie { | ||||
| namespace murl { | ||||
| 
 | ||||
| std::regex full_url_regex(FULL_HTTP_HTTPS_REGEX, std::regex::icase); | ||||
| std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase); | ||||
| const std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase); | ||||
| 
 | ||||
| Url::Url(const std::string& str) { | ||||
|     std::smatch sm; | ||||
|  |  | |||
|  | @ -6,8 +6,6 @@ | |||
| namespace blankie { | ||||
| namespace murl { | ||||
| 
 | ||||
| extern std::regex full_url_regex; | ||||
| 
 | ||||
| struct Url { | ||||
|     std::string scheme; | ||||
|     std::string userinfo; | ||||
|  |  | |||
|  | @ -36,6 +36,8 @@ std::string Element::serialize() const { | |||
|             out += escape(*text); | ||||
|         } else if (const std::string* str = std::get_if<std::string>(&node)) { | ||||
|             out += escape(*str); | ||||
|         } else if (const HTMLString* html_str = std::get_if<HTMLString>(&node)) { | ||||
|             out += html_str->str; | ||||
|         } else { | ||||
|             throw std::runtime_error("Encountered unknown node"); | ||||
|         } | ||||
|  |  | |||
|  | @ -9,9 +9,15 @@ namespace blankie { | |||
| namespace html { | ||||
| 
 | ||||
| struct Element; | ||||
// A string that already holds serialized HTML markup. Element::serialize()
// appends `str` to its output verbatim instead of passing it through escape(),
// so only store markup here that is safe to emit as-is.
struct HTMLString {
    std::string str;

    HTMLString() = default;
    // Explicit so that a plain std::string is never silently treated as
    // trusted markup.
    explicit HTMLString(std::string str_) : str{std::move(str_)} {}
};
| 
 | ||||
| typedef std::pair<const char*, std::string> Attribute; | ||||
| typedef std::variant<Element, const char*, std::string> Node; | ||||
| typedef std::variant<Element, const char*, std::string, HTMLString> Node; | ||||
| 
 | ||||
| struct Element { | ||||
|     const char* tag; | ||||
|  |  | |||
|  | @ -0,0 +1,125 @@ | |||
| #include <string> | ||||
| #include <climits> | ||||
| #include <stdexcept> | ||||
| 
 | ||||
| #include "unescape.h" | ||||
| #include "unescape_data.h" | ||||
| 
 | ||||
| // https://stackoverflow.com/a/42013433
 | ||||
// Encodes `code` as UTF-8 into `buf` and returns the number of bytes written
// (1 to 4). `buf` must have room for at least 4 bytes; no terminator is written.
// Surrogate code points are not rejected here; callers filter them beforehand.
// Throws std::invalid_argument if `code` is above 0x10FFFF.
static inline size_t codepoint_to_utf8(char* buf, const unsigned long code) {
    // Every narrowing conversion below is spelled out with static_cast, so no
    // -Wconversion suppression pragma is needed around the branches.

    // Builds a continuation byte (10xxxxxx) from the low six bits of `bits`.
    const auto continuation = [](unsigned long bits) {
        return static_cast<char>(0x80 | (bits & 0x3F));
    };

    if (code <= 0x7F) {
        buf[0] = static_cast<char>(code);                 /* 0xxxxxxx */
        return 1;
    }
    if (code <= 0x7FF) {
        buf[0] = static_cast<char>(0xC0 | (code >> 6));   /* 110xxxxx */
        buf[1] = continuation(code);
        return 2;
    }
    if (code <= 0xFFFF) {
        buf[0] = static_cast<char>(0xE0 | (code >> 12));  /* 1110xxxx */
        buf[1] = continuation(code >> 6);
        buf[2] = continuation(code);
        return 3;
    }
    if (code <= 0x10FFFF) {
        buf[0] = static_cast<char>(0xF0 | (code >> 18));  /* 11110xxx */
        buf[1] = continuation(code >> 12);
        buf[2] = continuation(code >> 6);
        buf[3] = continuation(code);
        return 4;
    }

    throw std::invalid_argument("codepoint passed is bigger than 0x10FFFF");
}
| 
 | ||||
// True when `ch` is one of the characters '0' through '9'.
static inline bool isdigit(char ch) {
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}
| 
 | ||||
// True when `ch` is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
static inline bool ishex(char ch) {
    const bool decimal = ch >= '0' && ch <= '9';
    const bool lower = ch >= 'a' && ch <= 'f';
    const bool upper = ch >= 'A' && ch <= 'F';
    return decimal || lower || upper;
}
| 
 | ||||
| static inline unsigned long decode_numeric_entity(const std::string& entity) { | ||||
|     unsigned long codepoint; | ||||
|     char* last_converted_char; | ||||
| 
 | ||||
|     errno = 0; | ||||
|     if (entity[1] == 'x' || entity[1] == 'X') { | ||||
|         if (entity.size() <= 2 || !ishex(entity[2])) { | ||||
|             return 0; | ||||
|         } | ||||
|         codepoint = strtoul(&entity.c_str()[2], &last_converted_char, 16); | ||||
|     } else { | ||||
|         if (entity.size() <= 1 || !isdigit(entity[1])) { | ||||
|             return 0; | ||||
|         } | ||||
|         codepoint = strtoul(&entity.c_str()[1], &last_converted_char, 10); | ||||
|     } | ||||
| 
 | ||||
|     if ((codepoint == ULONG_MAX && errno == ERANGE) || last_converted_char[0] != '\0') { | ||||
|         return 0; | ||||
|     } | ||||
| 
 | ||||
|     if (codepoint >= 0x80 && codepoint <= 0x9F) { | ||||
|         codepoint = windows1252_repl[codepoint - 0x80]; | ||||
|     } | ||||
|     if (!codepoint || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) { | ||||
|         codepoint = 0xFFFD; | ||||
|     } | ||||
| 
 | ||||
|     return codepoint; | ||||
| } | ||||
| 
 | ||||
| static inline unsigned long decode_string_entity(const std::string& entity) { | ||||
|     for (const Entity& i : string_entities) { | ||||
|         if (entity == i.string) { | ||||
|             return i.codepoint; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     return 0; | ||||
| } | ||||
| 
 | ||||
| static inline unsigned long decode_entity(std::string entity) { | ||||
|     return !entity.empty() && entity[0] == '#' | ||||
|         ? decode_numeric_entity(entity) | ||||
|         : decode_string_entity(entity); | ||||
| } | ||||
| 
 | ||||
| namespace blankie { | ||||
| namespace html { | ||||
| 
 | ||||
| [[nodiscard]] std::string unescape(const std::string& str) { | ||||
|     std::string output; | ||||
|     unsigned long codepoint; | ||||
|     char codepoint_buf[4]; | ||||
|     size_t offset = 0, old_offset = 0, offset_end, codepoint_buf_size; | ||||
| 
 | ||||
|     output.reserve(str.size()); | ||||
|     while ((offset = str.find('&', offset)) != std::string::npos) { | ||||
|         offset_end = str.find(';', ++offset); | ||||
|         if (offset_end == std::string::npos) { | ||||
|             break; | ||||
|         } | ||||
| 
 | ||||
|         codepoint = decode_entity(str.substr(offset, offset_end - offset)); | ||||
|         if (codepoint) { | ||||
|             codepoint_buf_size = codepoint_to_utf8(codepoint_buf, codepoint); | ||||
|             output.append(str, old_offset, offset - old_offset - 1); | ||||
|             output.append(codepoint_buf, codepoint_buf_size); | ||||
|             old_offset = offset = offset_end + 1; | ||||
|         } | ||||
|     } | ||||
| 
 | ||||
|     if (str.size() > old_offset) { | ||||
|         output.append(str, old_offset, std::string::npos); | ||||
|     } | ||||
|     return output; | ||||
| } | ||||
| 
 | ||||
| } // namespace html
 | ||||
| } // namespace blankie
 | ||||
|  | @ -0,0 +1,9 @@ | |||
| #pragma once | ||||
| 
 | ||||
// std::string appears in the declaration below, so include it here rather
// than relying on every includer having pulled in <string> first.
#include <string>

namespace blankie {
namespace html {

// Returns a copy of `str` with HTML character references ("&name;", "&#NNN;",
// "&#xHHH;") replaced by their UTF-8 encoding. References that are not
// recognized, or that lack a terminating ';', are left as they are.
[[nodiscard]] std::string unescape(const std::string& str);

} // namespace html
} // namespace blankie
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							|  | @ -117,8 +117,8 @@ void from_json(const nlohmann::json& j, Illust& illust) { | |||
|     illust_details.at("upload_timestamp").get_to(illust.upload_time); | ||||
| 
 | ||||
|     if (full_data) { | ||||
|         if (illust_details.contains("comment") && illust_details["comment"].is_string()) { | ||||
|             illust.comment = illust_details["comment"].get<std::string>(); | ||||
|         if (illust_details.contains("comment_html") && illust_details["comment_html"].is_string()) { | ||||
|             illust.comment_html = illust_details["comment_html"].get<std::string>(); | ||||
|         } | ||||
|         illust_details.at("display_tags").get_to(illust.tags); | ||||
|     } | ||||
|  | @ -179,7 +179,7 @@ void from_json(const nlohmann::json& j, SearchResults& search_results) { | |||
|             // and i cba to use regex for it, especially when it's not even used in this context
 | ||||
|             .upload_time = -1, | ||||
| 
 | ||||
|             .comment = std::nullopt, | ||||
|             .comment_html = std::nullopt, | ||||
|             .tags = std::move(tags), | ||||
|             .images = {get_illust_images(i, std::nullopt)}, | ||||
|             .page_count = i.at("pageCount").get<size_t>() | ||||
|  |  | |||
|  | @ -50,7 +50,7 @@ struct Illust { | |||
|     bool ai_generated; | ||||
|     time_t upload_time; | ||||
| 
 | ||||
|     std::optional<std::string> comment; | ||||
|     std::optional<std::string> comment_html; | ||||
|     std::vector<Tag> tags; | ||||
|     std::vector<Images> images; | ||||
|     size_t page_count; | ||||
|  |  | |||
|  | @ -2,6 +2,8 @@ | |||
| 
 | ||||
| #include "routes.h" | ||||
| #include "../blankie/murl.h" | ||||
| #include "../blankie/escape.h" | ||||
| #include "../blankie/unescape.h" | ||||
| #include "../servehelper.h" | ||||
| #include "../numberhelper.h" | ||||
| #include "../pixivclient.h" | ||||
|  | @ -9,10 +11,9 @@ | |||
| static inline Element generate_user_link(const httplib::Request& req, const Config& config, const Illust& illust); | ||||
| static inline Element generate_images(const httplib::Request& req, const Config& config, const Illust& illust); | ||||
| static inline Element generate_preview_images(const httplib::Request& req, const Config& config, const Illust& illust); | ||||
| static inline Nodes parse_description_line(const httplib::Request& req, const Config& config, std::string str); | ||||
| static inline Element generate_description(const httplib::Request& req, const Config& config, const std::string& description); | ||||
| static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust); | ||||
| static inline std::string generate_description_text(const httplib::Request& req, const Config& config, std::string description); | ||||
| static inline blankie::html::HTMLString fix_description_links(const httplib::Request& req, const Config& config, blankie::html::HTMLString str); | ||||
| static inline std::string generate_description_text(const httplib::Request& req, const Config& config, blankie::html::HTMLString description); | ||||
| static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview); | ||||
| 
 | ||||
| static inline bool is_true(const std::string& str); | ||||
|  | @ -48,8 +49,8 @@ void artworks_route(const httplib::Request& req, httplib::Response& res, const C | |||
|         !preview ? generate_images(req, config, illust) : generate_preview_images(req, config, illust), | ||||
|         Element("br") | ||||
|     }); | ||||
|     if (illust.comment) { | ||||
|         body.nodes.push_back(generate_description(req, config, *illust.comment)); | ||||
|     if (illust.comment_html) { | ||||
|         body.nodes.push_back(Element("p", {fix_description_links(req, config, blankie::html::HTMLString(*illust.comment_html))})); | ||||
|     } | ||||
|     body.nodes.push_back(generate_illust_tags(req, config, illust)); | ||||
|     body.nodes.push_back(Element("p", {time_to_string(illust.upload_time)})); | ||||
|  | @ -122,61 +123,6 @@ static inline Element generate_preview_images(const httplib::Request& req, const | |||
|     return div; | ||||
| } | ||||
| 
 | ||||
| static inline Nodes parse_description_line(const httplib::Request& req, const Config& config, std::string str) { | ||||
|     Nodes nodes; | ||||
|     std::smatch sm; | ||||
| 
 | ||||
|     while (std::regex_search(str, sm, blankie::murl::full_url_regex)) { | ||||
|         std::string prefix = sm.prefix(); | ||||
|         std::string url_str = sm.str(0); | ||||
|         std::string suffix = sm.suffix(); | ||||
| 
 | ||||
|         if (prefix.ends_with('(') && url_str.ends_with(')')) { | ||||
|             url_str.pop_back(); | ||||
|             suffix.insert(0, 1, ')'); | ||||
|         } | ||||
|         if (!prefix.empty()) { | ||||
|             nodes.push_back(std::move(prefix)); | ||||
|         } | ||||
| 
 | ||||
|         blankie::murl::Url url(std::move(url_str)); | ||||
|         url_str = url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net") | ||||
|             ? proxy_pixiv_url(req, config, std::move(url)) | ||||
|             : url.to_string(); | ||||
|         nodes.push_back(Element("a", {{"href", url_str}}, {url_str})); | ||||
| 
 | ||||
|         str = std::move(suffix); | ||||
|     } | ||||
|     if (!str.empty()) { | ||||
|         nodes.push_back(std::move(str)); | ||||
|     } | ||||
| 
 | ||||
|     return nodes; | ||||
| } | ||||
| 
 | ||||
| static inline Element generate_description(const httplib::Request& req, const Config& config, const std::string& description) { | ||||
|     Element p("p"); | ||||
|     size_t pos = 0; | ||||
|     size_t last_pos = 0; | ||||
|     auto add = [&](std::string str) { | ||||
|         if (!p.nodes.empty()) { | ||||
|             p.nodes.push_back(Element("br")); | ||||
|         } | ||||
|         Nodes nodes = parse_description_line(req, config, std::move(str)); | ||||
|         p.nodes.insert(p.nodes.end(), nodes.begin(), nodes.end()); | ||||
|     }; | ||||
| 
 | ||||
|     while ((pos = description.find('\n', pos)) != std::string::npos) { | ||||
|         add(description.substr(last_pos, pos - last_pos)); | ||||
|         last_pos = ++pos; | ||||
|     } | ||||
|     if (description.size() > last_pos) { | ||||
|         add(description.substr(last_pos)); | ||||
|     } | ||||
| 
 | ||||
|     return p; | ||||
| } | ||||
| 
 | ||||
| static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust) { | ||||
|     Element div("div", {{"class", "illust-tags"}}, {}); | ||||
| 
 | ||||
|  | @ -198,33 +144,63 @@ static inline Element generate_illust_tags(const httplib::Request& req, const Co | |||
|     return div; | ||||
| } | ||||
| 
 | ||||
| static inline std::string generate_description_text(const httplib::Request& req, const Config& config, std::string description) { | ||||
| const std::regex start_link_regex("<a href=\"([^\"]+?)\"(?: target=\"_blank\")?>"); | ||||
| static inline blankie::html::HTMLString fix_description_links(const httplib::Request& req, const Config& config, blankie::html::HTMLString str) { | ||||
|     using namespace std::string_literals; | ||||
| 
 | ||||
|     blankie::html::HTMLString out; | ||||
|     std::smatch sm; | ||||
| 
 | ||||
|     out.str.reserve(str.str.size()); | ||||
|     while (std::regex_search(str.str, sm, start_link_regex)) { | ||||
|         out.str += sm.prefix(); | ||||
| 
 | ||||
|         std::string url_str; | ||||
|         blankie::murl::Url url(sm.str(1)); | ||||
|         if (url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net")) { | ||||
|             url_str = proxy_pixiv_url(req, config, std::move(url)); | ||||
|         } else if (url.path == "/jump.php") { | ||||
|             url_str = blankie::murl::unescape(std::move(url.query)); | ||||
|         } else { | ||||
|             url_str = url.to_string(); | ||||
|         } | ||||
|         out.str += "<a href=\""s + blankie::html::escape(std::move(url_str)) + "\">"; | ||||
| 
 | ||||
|         str.str = sm.suffix(); | ||||
|     } | ||||
|     out.str += std::move(str.str); | ||||
| 
 | ||||
|     return out; | ||||
| } | ||||
| 
 | ||||
| const std::regex link_regex("<a href=\"([^\"]+?)\">.+?</a>"); | ||||
| const std::regex tag_regex("<(.+?)>"); | ||||
| static inline std::string generate_description_text(const httplib::Request& req, const Config& config, blankie::html::HTMLString description) { | ||||
|     description = fix_description_links(req, config, std::move(description)); | ||||
| 
 | ||||
|     std::string new_description; | ||||
|     std::smatch sm; | ||||
| 
 | ||||
|     new_description.reserve(description.size()); | ||||
|     while (std::regex_search(description, sm, blankie::murl::full_url_regex)) { | ||||
|         std::string prefix = sm.prefix(); | ||||
|         std::string url_str = sm.str(0); | ||||
|         std::string suffix = sm.suffix(); | ||||
| 
 | ||||
|         if (prefix.ends_with('(') && url_str.ends_with(')')) { | ||||
|             url_str.pop_back(); | ||||
|             suffix.insert(0, 1, ')'); | ||||
|         } | ||||
|         new_description += std::move(prefix); | ||||
| 
 | ||||
|         blankie::murl::Url url(std::move(url_str)); | ||||
|         url_str = url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net") | ||||
|             ? proxy_pixiv_url(req, config, std::move(url)) | ||||
|             : url.to_string(); | ||||
|         new_description += std::move(url_str); | ||||
| 
 | ||||
|         description = std::move(suffix); | ||||
|     new_description.reserve(description.str.size()); | ||||
|     while (std::regex_search(description.str, sm, link_regex)) { | ||||
|         new_description += sm.prefix(); | ||||
|         new_description += sm.str(1); | ||||
|         description.str = sm.suffix(); | ||||
|     } | ||||
|     new_description += std::move(description); | ||||
|     new_description += std::move(description.str); | ||||
| 
 | ||||
|     return new_description; | ||||
|     description.str = std::move(new_description); | ||||
|     new_description.reserve(description.str.size()); | ||||
|     while (std::regex_search(description.str, sm, tag_regex)) { | ||||
|         new_description += sm.prefix(); | ||||
|         if (sm.str(1) == "br /") { | ||||
|             new_description += '\n'; | ||||
|         } | ||||
|         description.str = sm.suffix(); | ||||
|     } | ||||
|     new_description += std::move(description.str); | ||||
| 
 | ||||
|     return blankie::html::unescape(std::move(new_description)); | ||||
| } | ||||
| 
 | ||||
| static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview) { | ||||
|  | @ -238,8 +214,8 @@ static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config | |||
|         Element("meta", {{"property", "og:site_name"}, {"content", "Pixwhile"}}, {}), | ||||
|         Element("meta", {{"property", "og:url"}, {"content", std::move(url)}}, {}) | ||||
|     }); | ||||
|     if (illust.comment) { | ||||
|         nodes.push_back(Element("meta", {{"property", "og:description"}, {"content", generate_description_text(req, config, *illust.comment)}}, {})); | ||||
|     if (illust.comment_html) { | ||||
|         nodes.push_back(Element("meta", {{"property", "og:description"}, {"content", generate_description_text(req, config, blankie::html::HTMLString(*illust.comment_html))}}, {})); | ||||
|     } | ||||
| 
 | ||||
|     // i don't even know what multiple og:images do anymore
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue