Fix HTML in the description not being parsed

https://pixwhile.zangetsu.kaizoku.cyou/artworks/87524102
This commit is contained in:
blankie 2023-09-29 15:14:58 +10:00
parent c8a079d85e
commit 50a52e38d4
Signed by: blankie
GPG Key ID: CC15FC822C7F61F5
11 changed files with 2282 additions and 99 deletions

View File

@ -27,7 +27,7 @@ add_link_options(${FLAGS})
add_executable(${PROJECT_NAME} main.cpp misc.cpp config.cpp servehelper.cpp numberhelper.cpp pixivclient.cpp pixivmodels.cpp hiredis_wrapper.cpp add_executable(${PROJECT_NAME} main.cpp misc.cpp config.cpp servehelper.cpp numberhelper.cpp pixivclient.cpp pixivmodels.cpp hiredis_wrapper.cpp
routes/home.cpp routes/css.cpp routes/artworks.cpp routes/tags.cpp routes/guess_extension.cpp routes/users/common.cpp routes/users/illustrations.cpp routes/home.cpp routes/css.cpp routes/artworks.cpp routes/tags.cpp routes/guess_extension.cpp routes/users/common.cpp routes/users/illustrations.cpp
blankie/serializer.cpp blankie/escape.cpp blankie/murl.cpp) blankie/serializer.cpp blankie/escape.cpp blankie/unescape.cpp blankie/murl.cpp)
set_target_properties(${PROJECT_NAME} set_target_properties(${PROJECT_NAME}
PROPERTIES PROPERTIES
CXX_STANDARD 20 CXX_STANDARD 20

View File

@ -26,11 +26,6 @@
// Looser than RFC 3986, but fragments might as well own everything // Looser than RFC 3986, but fragments might as well own everything
#define FRAGMENT "([^ ]*)" #define FRAGMENT "([^ ]*)"
#define FULL_HTTP_HTTPS_REGEX \
"(?:(https?)?:)?//" AUTHORITY \
PATH_ABEMPTY \
"(?:\\?" QUERY ")?" \
"(?:#" FRAGMENT ")?"
#define HTTP_HTTPS_REGEX \ #define HTTP_HTTPS_REGEX \
"(?:(https?)?:)?(?://" AUTHORITY ")?" \ "(?:(https?)?:)?(?://" AUTHORITY ")?" \
PATH_ABEMPTY \ PATH_ABEMPTY \
@ -45,8 +40,7 @@ static std::string tolower(std::string str);
namespace blankie { namespace blankie {
namespace murl { namespace murl {
std::regex full_url_regex(FULL_HTTP_HTTPS_REGEX, std::regex::icase); const std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase);
std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase);
Url::Url(const std::string& str) { Url::Url(const std::string& str) {
std::smatch sm; std::smatch sm;

View File

@ -6,8 +6,6 @@
namespace blankie { namespace blankie {
namespace murl { namespace murl {
extern std::regex full_url_regex;
struct Url { struct Url {
std::string scheme; std::string scheme;
std::string userinfo; std::string userinfo;

View File

@ -36,6 +36,8 @@ std::string Element::serialize() const {
out += escape(*text); out += escape(*text);
} else if (const std::string* str = std::get_if<std::string>(&node)) { } else if (const std::string* str = std::get_if<std::string>(&node)) {
out += escape(*str); out += escape(*str);
} else if (const HTMLString* html_str = std::get_if<HTMLString>(&node)) {
out += html_str->str;
} else { } else {
throw std::runtime_error("Encountered unknown node"); throw std::runtime_error("Encountered unknown node");
} }

View File

@ -9,9 +9,15 @@ namespace blankie {
namespace html { namespace html {
struct Element; struct Element;
// A string that already holds HTML markup. Element::serialize() appends the
// contents verbatim instead of escaping them, so only wrap text that is
// meant to be emitted as markup.
struct HTMLString {
    std::string str;

    HTMLString() = default;

    // Explicit, so a plain std::string never silently becomes unescaped HTML.
    explicit HTMLString(std::string str_) : str{std::move(str_)} {}
};
typedef std::pair<const char*, std::string> Attribute; typedef std::pair<const char*, std::string> Attribute;
typedef std::variant<Element, const char*, std::string> Node; typedef std::variant<Element, const char*, std::string, HTMLString> Node;
struct Element { struct Element {
const char* tag; const char* tag;

125
blankie/unescape.cpp Normal file
View File

@ -0,0 +1,125 @@
#include <string>
#include <climits>
#include <stdexcept>
#include "unescape.h"
#include "unescape_data.h"
// Adapted from https://stackoverflow.com/a/42013433
//
// Writes the UTF-8 encoding of `code` into `buf` and returns how many bytes
// (1-4) were written. `buf` must have room for 4 bytes; no terminator is added.
// Throws std::invalid_argument when `code` is above 0x10FFFF.
static inline size_t codepoint_to_utf8(char* buf, const unsigned long code) {
    if (code > 0x10FFFF) {
        throw std::invalid_argument("codepoint passed is bigger than 0x10FFFF");
    }

    // Every trailing byte has the form 10xxxxxx and carries six payload bits.
    // The explicit casts keep the narrowing intentional and visible.
    const auto continuation = [code](unsigned int shift) {
        return static_cast<char>(0x80 | ((code >> shift) & 0x3F));
    };

    if (code <= 0x7F) {
        buf[0] = static_cast<char>(code);           /* 0xxxxxxx */
        return 1;
    }
    if (code <= 0x7FF) {
        buf[0] = static_cast<char>(0xC0 | (code >> 6));  /* 110xxxxx */
        buf[1] = continuation(0);
        return 2;
    }
    if (code <= 0xFFFF) {
        buf[0] = static_cast<char>(0xE0 | (code >> 12)); /* 1110xxxx */
        buf[1] = continuation(6);
        buf[2] = continuation(0);
        return 3;
    }

    buf[0] = static_cast<char>(0xF0 | (code >> 18));     /* 11110xxx */
    buf[1] = continuation(12);
    buf[2] = continuation(6);
    buf[3] = continuation(0);
    return 4;
}
// True when `ch` is one of the ASCII decimal digits '0'..'9'.
static inline bool isdigit(char ch) {
    // Anything below '0' wraps around to a huge unsigned value, so one
    // comparison covers both ends of the range.
    return static_cast<unsigned int>(ch - '0') <= 9u;
}
// True when `ch` is an ASCII hexadecimal digit: 0-9, a-f or A-F.
static inline bool ishex(char ch) {
    switch (ch) {
        case '0': case '1': case '2': case '3': case '4':
        case '5': case '6': case '7': case '8': case '9':
        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
            return true;
        default:
            return false;
    }
}
static inline unsigned long decode_numeric_entity(const std::string& entity) {
unsigned long codepoint;
char* last_converted_char;
errno = 0;
if (entity[1] == 'x' || entity[1] == 'X') {
if (entity.size() <= 2 || !ishex(entity[2])) {
return 0;
}
codepoint = strtoul(&entity.c_str()[2], &last_converted_char, 16);
} else {
if (entity.size() <= 1 || !isdigit(entity[1])) {
return 0;
}
codepoint = strtoul(&entity.c_str()[1], &last_converted_char, 10);
}
if ((codepoint == ULONG_MAX && errno == ERANGE) || last_converted_char[0] != '\0') {
return 0;
}
if (codepoint >= 0x80 && codepoint <= 0x9F) {
codepoint = windows1252_repl[codepoint - 0x80];
}
if (!codepoint || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
codepoint = 0xFFFD;
}
return codepoint;
}
static inline unsigned long decode_string_entity(const std::string& entity) {
for (const Entity& i : string_entities) {
if (entity == i.string) {
return i.codepoint;
}
}
return 0;
}
// Decodes one character reference body (the text between '&' and ';').
// Bodies starting with '#' go to the numeric decoder, everything else
// (including the empty string) to the named-entity lookup. The result is
// whatever the chosen decoder returns; unescape() treats 0 as "not decoded".
static inline unsigned long decode_entity(std::string entity) {
    const bool is_numeric = !entity.empty() && entity.front() == '#';
    if (is_numeric) {
        return decode_numeric_entity(entity);
    }
    return decode_string_entity(entity);
}
namespace blankie {
namespace html {
[[nodiscard]] std::string unescape(const std::string& str) {
std::string output;
unsigned long codepoint;
char codepoint_buf[4];
size_t offset = 0, old_offset = 0, offset_end, codepoint_buf_size;
output.reserve(str.size());
while ((offset = str.find('&', offset)) != std::string::npos) {
offset_end = str.find(';', ++offset);
if (offset_end == std::string::npos) {
break;
}
codepoint = decode_entity(str.substr(offset, offset_end - offset));
if (codepoint) {
codepoint_buf_size = codepoint_to_utf8(codepoint_buf, codepoint);
output.append(str, old_offset, offset - old_offset - 1);
output.append(codepoint_buf, codepoint_buf_size);
old_offset = offset = offset_end + 1;
}
}
if (str.size() > old_offset) {
output.append(str, old_offset, std::string::npos);
}
return output;
}
} // namespace html
} // namespace blankie

9
blankie/unescape.h Normal file
View File

@ -0,0 +1,9 @@
#pragma once

// Included here so the header is self-contained: the declaration below uses
// std::string and must not depend on the includer's include order.
#include <string>

namespace blankie {
namespace html {

// Returns a copy of `str` with HTML character references ("&name;", "&#NN;",
// "&#xHH;") decoded to UTF-8. References that cannot be decoded are left
// unchanged.
[[nodiscard]] std::string unescape(const std::string& str);

} // namespace html
} // namespace blankie

2073
blankie/unescape_data.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -117,8 +117,8 @@ void from_json(const nlohmann::json& j, Illust& illust) {
illust_details.at("upload_timestamp").get_to(illust.upload_time); illust_details.at("upload_timestamp").get_to(illust.upload_time);
if (full_data) { if (full_data) {
if (illust_details.contains("comment") && illust_details["comment"].is_string()) { if (illust_details.contains("comment_html") && illust_details["comment_html"].is_string()) {
illust.comment = illust_details["comment"].get<std::string>(); illust.comment_html = illust_details["comment_html"].get<std::string>();
} }
illust_details.at("display_tags").get_to(illust.tags); illust_details.at("display_tags").get_to(illust.tags);
} }
@ -179,7 +179,7 @@ void from_json(const nlohmann::json& j, SearchResults& search_results) {
// and i cba to use regex for it, especially when it's not even used in this context // and i cba to use regex for it, especially when it's not even used in this context
.upload_time = -1, .upload_time = -1,
.comment = std::nullopt, .comment_html = std::nullopt,
.tags = std::move(tags), .tags = std::move(tags),
.images = {get_illust_images(i, std::nullopt)}, .images = {get_illust_images(i, std::nullopt)},
.page_count = i.at("pageCount").get<size_t>() .page_count = i.at("pageCount").get<size_t>()

View File

@ -50,7 +50,7 @@ struct Illust {
bool ai_generated; bool ai_generated;
time_t upload_time; time_t upload_time;
std::optional<std::string> comment; std::optional<std::string> comment_html;
std::vector<Tag> tags; std::vector<Tag> tags;
std::vector<Images> images; std::vector<Images> images;
size_t page_count; size_t page_count;

View File

@ -2,6 +2,8 @@
#include "routes.h" #include "routes.h"
#include "../blankie/murl.h" #include "../blankie/murl.h"
#include "../blankie/escape.h"
#include "../blankie/unescape.h"
#include "../servehelper.h" #include "../servehelper.h"
#include "../numberhelper.h" #include "../numberhelper.h"
#include "../pixivclient.h" #include "../pixivclient.h"
@ -9,10 +11,9 @@
static inline Element generate_user_link(const httplib::Request& req, const Config& config, const Illust& illust); static inline Element generate_user_link(const httplib::Request& req, const Config& config, const Illust& illust);
static inline Element generate_images(const httplib::Request& req, const Config& config, const Illust& illust); static inline Element generate_images(const httplib::Request& req, const Config& config, const Illust& illust);
static inline Element generate_preview_images(const httplib::Request& req, const Config& config, const Illust& illust); static inline Element generate_preview_images(const httplib::Request& req, const Config& config, const Illust& illust);
static inline Nodes parse_description_line(const httplib::Request& req, const Config& config, std::string str);
static inline Element generate_description(const httplib::Request& req, const Config& config, const std::string& description);
static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust); static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust);
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, std::string description); static inline blankie::html::HTMLString fix_description_links(const httplib::Request& req, const Config& config, blankie::html::HTMLString str);
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, blankie::html::HTMLString description);
static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview); static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview);
static inline bool is_true(const std::string& str); static inline bool is_true(const std::string& str);
@ -48,8 +49,8 @@ void artworks_route(const httplib::Request& req, httplib::Response& res, const C
!preview ? generate_images(req, config, illust) : generate_preview_images(req, config, illust), !preview ? generate_images(req, config, illust) : generate_preview_images(req, config, illust),
Element("br") Element("br")
}); });
if (illust.comment) { if (illust.comment_html) {
body.nodes.push_back(generate_description(req, config, *illust.comment)); body.nodes.push_back(Element("p", {fix_description_links(req, config, blankie::html::HTMLString(*illust.comment_html))}));
} }
body.nodes.push_back(generate_illust_tags(req, config, illust)); body.nodes.push_back(generate_illust_tags(req, config, illust));
body.nodes.push_back(Element("p", {time_to_string(illust.upload_time)})); body.nodes.push_back(Element("p", {time_to_string(illust.upload_time)}));
@ -122,61 +123,6 @@ static inline Element generate_preview_images(const httplib::Request& req, const
return div; return div;
} }
static inline Nodes parse_description_line(const httplib::Request& req, const Config& config, std::string str) {
Nodes nodes;
std::smatch sm;
while (std::regex_search(str, sm, blankie::murl::full_url_regex)) {
std::string prefix = sm.prefix();
std::string url_str = sm.str(0);
std::string suffix = sm.suffix();
if (prefix.ends_with('(') && url_str.ends_with(')')) {
url_str.pop_back();
suffix.insert(0, 1, ')');
}
if (!prefix.empty()) {
nodes.push_back(std::move(prefix));
}
blankie::murl::Url url(std::move(url_str));
url_str = url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net")
? proxy_pixiv_url(req, config, std::move(url))
: url.to_string();
nodes.push_back(Element("a", {{"href", url_str}}, {url_str}));
str = std::move(suffix);
}
if (!str.empty()) {
nodes.push_back(std::move(str));
}
return nodes;
}
static inline Element generate_description(const httplib::Request& req, const Config& config, const std::string& description) {
Element p("p");
size_t pos = 0;
size_t last_pos = 0;
auto add = [&](std::string str) {
if (!p.nodes.empty()) {
p.nodes.push_back(Element("br"));
}
Nodes nodes = parse_description_line(req, config, std::move(str));
p.nodes.insert(p.nodes.end(), nodes.begin(), nodes.end());
};
while ((pos = description.find('\n', pos)) != std::string::npos) {
add(description.substr(last_pos, pos - last_pos));
last_pos = ++pos;
}
if (description.size() > last_pos) {
add(description.substr(last_pos));
}
return p;
}
static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust) { static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust) {
Element div("div", {{"class", "illust-tags"}}, {}); Element div("div", {{"class", "illust-tags"}}, {});
@ -198,33 +144,63 @@ static inline Element generate_illust_tags(const httplib::Request& req, const Co
return div; return div;
} }
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, std::string description) { const std::regex start_link_regex("<a href=\"([^\"]+?)\"(?: target=\"_blank\")?>");
// Rewrites every opening <a href="..."> tag matched by start_link_regex in a
// description and returns the resulting HTML. For each match:
//   - links whose host is pixiv.net / www.pixiv.net go through proxy_pixiv_url(),
//   - links whose path is "/jump.php" are replaced by the result of
//     blankie::murl::unescape() on their query (presumably the percent-decoded
//     redirect target; confirm against murl),
//   - any other link is re-serialised with Url::to_string().
// The tag is re-emitted with only the (HTML-escaped) href attribute, so an
// optional target="_blank" is dropped. All text outside the matched tags is
// copied through untouched.
static inline blankie::html::HTMLString fix_description_links(const httplib::Request& req, const Config& config, blankie::html::HTMLString str) {
    const auto rewrite_href = [&req, &config](blankie::murl::Url url) -> std::string {
        if (url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net")) {
            return proxy_pixiv_url(req, config, std::move(url));
        }
        if (url.path == "/jump.php") {
            return blankie::murl::unescape(std::move(url.query));
        }
        return url.to_string();
    };

    blankie::html::HTMLString result;
    result.str.reserve(str.str.size());

    std::smatch match;
    while (std::regex_search(str.str, match, start_link_regex)) {
        result.str += match.prefix();

        blankie::murl::Url url(match.str(1));
        std::string href = rewrite_href(std::move(url));

        result.str += "<a href=\"";
        result.str += blankie::html::escape(std::move(href));
        result.str += "\">";

        // Continue with whatever follows the tag just handled. The suffix is
        // materialised as a new string before it replaces the searched one.
        str.str = match.suffix();
    }

    result.str += std::move(str.str);
    return result;
}
const std::regex link_regex("<a href=\"([^\"]+?)\">.+?</a>");
const std::regex tag_regex("<(.+?)>");
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, blankie::html::HTMLString description) {
description = fix_description_links(req, config, std::move(description));
std::string new_description; std::string new_description;
std::smatch sm; std::smatch sm;
new_description.reserve(description.size()); new_description.reserve(description.str.size());
while (std::regex_search(description, sm, blankie::murl::full_url_regex)) { while (std::regex_search(description.str, sm, link_regex)) {
std::string prefix = sm.prefix(); new_description += sm.prefix();
std::string url_str = sm.str(0); new_description += sm.str(1);
std::string suffix = sm.suffix(); description.str = sm.suffix();
if (prefix.ends_with('(') && url_str.ends_with(')')) {
url_str.pop_back();
suffix.insert(0, 1, ')');
} }
new_description += std::move(prefix); new_description += std::move(description.str);
blankie::murl::Url url(std::move(url_str)); description.str = std::move(new_description);
url_str = url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net") new_description.reserve(description.str.size());
? proxy_pixiv_url(req, config, std::move(url)) while (std::regex_search(description.str, sm, tag_regex)) {
: url.to_string(); new_description += sm.prefix();
new_description += std::move(url_str); if (sm.str(1) == "br /") {
new_description += '\n';
description = std::move(suffix);
} }
new_description += std::move(description); description.str = sm.suffix();
}
new_description += std::move(description.str);
return new_description; return blankie::html::unescape(std::move(new_description));
} }
static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview) { static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview) {
@ -238,8 +214,8 @@ static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config
Element("meta", {{"property", "og:site_name"}, {"content", "Pixwhile"}}, {}), Element("meta", {{"property", "og:site_name"}, {"content", "Pixwhile"}}, {}),
Element("meta", {{"property", "og:url"}, {"content", std::move(url)}}, {}) Element("meta", {{"property", "og:url"}, {"content", std::move(url)}}, {})
}); });
if (illust.comment) { if (illust.comment_html) {
nodes.push_back(Element("meta", {{"property", "og:description"}, {"content", generate_description_text(req, config, *illust.comment)}}, {})); nodes.push_back(Element("meta", {{"property", "og:description"}, {"content", generate_description_text(req, config, blankie::html::HTMLString(*illust.comment_html))}}, {}));
} }
// i don't even know what multiple og:images do anymore // i don't even know what multiple og:images do anymore