Fix HTML in the description not being parsed
https://pixwhile.zangetsu.kaizoku.cyou/artworks/87524102
This commit is contained in:
parent
c8a079d85e
commit
50a52e38d4
|
@ -27,7 +27,7 @@ add_link_options(${FLAGS})
|
|||
|
||||
add_executable(${PROJECT_NAME} main.cpp misc.cpp config.cpp servehelper.cpp numberhelper.cpp pixivclient.cpp pixivmodels.cpp hiredis_wrapper.cpp
|
||||
routes/home.cpp routes/css.cpp routes/artworks.cpp routes/tags.cpp routes/guess_extension.cpp routes/users/common.cpp routes/users/illustrations.cpp
|
||||
blankie/serializer.cpp blankie/escape.cpp blankie/murl.cpp)
|
||||
blankie/serializer.cpp blankie/escape.cpp blankie/unescape.cpp blankie/murl.cpp)
|
||||
set_target_properties(${PROJECT_NAME}
|
||||
PROPERTIES
|
||||
CXX_STANDARD 20
|
||||
|
|
|
@ -26,11 +26,6 @@
|
|||
// Looser than RFC 3986, but fragments might as well own everything
|
||||
#define FRAGMENT "([^ ]*)"
|
||||
|
||||
#define FULL_HTTP_HTTPS_REGEX \
|
||||
"(?:(https?)?:)?//" AUTHORITY \
|
||||
PATH_ABEMPTY \
|
||||
"(?:\\?" QUERY ")?" \
|
||||
"(?:#" FRAGMENT ")?"
|
||||
#define HTTP_HTTPS_REGEX \
|
||||
"(?:(https?)?:)?(?://" AUTHORITY ")?" \
|
||||
PATH_ABEMPTY \
|
||||
|
@ -45,8 +40,7 @@ static std::string tolower(std::string str);
|
|||
namespace blankie {
|
||||
namespace murl {
|
||||
|
||||
std::regex full_url_regex(FULL_HTTP_HTTPS_REGEX, std::regex::icase);
|
||||
std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase);
|
||||
const std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase);
|
||||
|
||||
Url::Url(const std::string& str) {
|
||||
std::smatch sm;
|
||||
|
|
|
@ -6,8 +6,6 @@
|
|||
namespace blankie {
|
||||
namespace murl {
|
||||
|
||||
extern std::regex full_url_regex;
|
||||
|
||||
struct Url {
|
||||
std::string scheme;
|
||||
std::string userinfo;
|
||||
|
|
|
@ -36,6 +36,8 @@ std::string Element::serialize() const {
|
|||
out += escape(*text);
|
||||
} else if (const std::string* str = std::get_if<std::string>(&node)) {
|
||||
out += escape(*str);
|
||||
} else if (const HTMLString* html_str = std::get_if<HTMLString>(&node)) {
|
||||
out += html_str->str;
|
||||
} else {
|
||||
throw std::runtime_error("Encountered unknown node");
|
||||
}
|
||||
|
|
|
@ -9,9 +9,15 @@ namespace blankie {
|
|||
namespace html {
|
||||
|
||||
struct Element;
|
||||
// Marks a string as already being HTML markup. Element::serialize() appends
// `str` verbatim for this node type, whereas plain std::string nodes are
// passed through escape() first, so only wrap content that is safe to emit.
struct HTMLString {
    // The raw markup, emitted without escaping
    std::string str;

    HTMLString() = default;

    // Explicit so that ordinary strings are never silently treated as markup
    explicit HTMLString(std::string str_) : str{std::move(str_)} {}
};
|
||||
|
||||
typedef std::pair<const char*, std::string> Attribute;
|
||||
typedef std::variant<Element, const char*, std::string> Node;
|
||||
typedef std::variant<Element, const char*, std::string, HTMLString> Node;
|
||||
|
||||
struct Element {
|
||||
const char* tag;
|
||||
|
|
|
@ -0,0 +1,125 @@
|
|||
#include <cerrno>
#include <charconv>
#include <climits>
#include <cstdlib>
#include <stdexcept>
#include <string>
#include <system_error>

#include "unescape.h"
#include "unescape_data.h"
|
||||
|
||||
// Encodes the code point `code` as UTF-8 into `buf` and returns the number of
// bytes written (1 to 4). `buf` must have room for 4 bytes; no terminator is
// written. Surrogate code points are not rejected here.
// Throws std::invalid_argument if `code` is above 0x10FFFF.
// Based on https://stackoverflow.com/a/42013433
static inline size_t codepoint_to_utf8(char* buf, const unsigned long code) {
    if (code > 0x10FFFF) {
        throw std::invalid_argument("codepoint passed is bigger than 0x10FFFF");
    }

    // The explicit casts spell out the intended narrowing to a single byte
    const auto lead = [](unsigned long marker, unsigned long bits) {
        return static_cast<char>(marker | bits);
    };
    const auto continuation = [](unsigned long bits) {
        return static_cast<char>(0x80 | (bits & 0x3F)); /* 10xxxxxx */
    };

    if (code <= 0x7F) {
        buf[0] = static_cast<char>(code);
        return 1;
    }
    if (code <= 0x7FF) {
        buf[0] = lead(0xC0, code >> 6); /* 110xxxxx */
        buf[1] = continuation(code);
        return 2;
    }
    if (code <= 0xFFFF) {
        buf[0] = lead(0xE0, code >> 12); /* 1110xxxx */
        buf[1] = continuation(code >> 6);
        buf[2] = continuation(code);
        return 3;
    }

    buf[0] = lead(0xF0, code >> 18); /* 11110xxx */
    buf[1] = continuation(code >> 12);
    buf[2] = continuation(code >> 6);
    buf[3] = continuation(code);
    return 4;
}
|
||||
|
||||
// True only for the ASCII decimal digits '0' through '9'.
static inline bool isdigit(char ch) {
    return !(ch < '0' || ch > '9');
}
|
||||
|
||||
// True for characters usable as a hexadecimal digit: whatever isdigit()
// accepts, plus 'a'-'f' and 'A'-'F'.
static inline bool ishex(char ch) {
    if (isdigit(ch)) {
        return true;
    }

    const bool lowercase_letter = ch >= 'a' && ch <= 'f';
    const bool uppercase_letter = ch >= 'A' && ch <= 'F';
    return lowercase_letter || uppercase_letter;
}
|
||||
|
||||
static inline unsigned long decode_numeric_entity(const std::string& entity) {
|
||||
unsigned long codepoint;
|
||||
char* last_converted_char;
|
||||
|
||||
errno = 0;
|
||||
if (entity[1] == 'x' || entity[1] == 'X') {
|
||||
if (entity.size() <= 2 || !ishex(entity[2])) {
|
||||
return 0;
|
||||
}
|
||||
codepoint = strtoul(&entity.c_str()[2], &last_converted_char, 16);
|
||||
} else {
|
||||
if (entity.size() <= 1 || !isdigit(entity[1])) {
|
||||
return 0;
|
||||
}
|
||||
codepoint = strtoul(&entity.c_str()[1], &last_converted_char, 10);
|
||||
}
|
||||
|
||||
if ((codepoint == ULONG_MAX && errno == ERANGE) || last_converted_char[0] != '\0') {
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (codepoint >= 0x80 && codepoint <= 0x9F) {
|
||||
codepoint = windows1252_repl[codepoint - 0x80];
|
||||
}
|
||||
if (!codepoint || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
|
||||
codepoint = 0xFFFD;
|
||||
}
|
||||
|
||||
return codepoint;
|
||||
}
|
||||
|
||||
static inline unsigned long decode_string_entity(const std::string& entity) {
|
||||
for (const Entity& i : string_entities) {
|
||||
if (entity == i.string) {
|
||||
return i.codepoint;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline unsigned long decode_entity(std::string entity) {
|
||||
return !entity.empty() && entity[0] == '#'
|
||||
? decode_numeric_entity(entity)
|
||||
: decode_string_entity(entity);
|
||||
}
|
||||
|
||||
namespace blankie {
|
||||
namespace html {
|
||||
|
||||
[[nodiscard]] std::string unescape(const std::string& str) {
|
||||
std::string output;
|
||||
unsigned long codepoint;
|
||||
char codepoint_buf[4];
|
||||
size_t offset = 0, old_offset = 0, offset_end, codepoint_buf_size;
|
||||
|
||||
output.reserve(str.size());
|
||||
while ((offset = str.find('&', offset)) != std::string::npos) {
|
||||
offset_end = str.find(';', ++offset);
|
||||
if (offset_end == std::string::npos) {
|
||||
break;
|
||||
}
|
||||
|
||||
codepoint = decode_entity(str.substr(offset, offset_end - offset));
|
||||
if (codepoint) {
|
||||
codepoint_buf_size = codepoint_to_utf8(codepoint_buf, codepoint);
|
||||
output.append(str, old_offset, offset - old_offset - 1);
|
||||
output.append(codepoint_buf, codepoint_buf_size);
|
||||
old_offset = offset = offset_end + 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (str.size() > old_offset) {
|
||||
output.append(str, old_offset, std::string::npos);
|
||||
}
|
||||
return output;
|
||||
}
|
||||
|
||||
} // namespace html
|
||||
} // namespace blankie
|
|
@ -0,0 +1,9 @@
|
|||
#pragma once

// Needed for std::string below; without it this header only compiles when the
// including file happens to pull in <string> first
#include <string>

namespace blankie {
namespace html {

// Returns a copy of `str` with HTML character references ("&name;", "&#N;",
// "&#xN;") replaced by the UTF-8 encoding of the character they denote.
// References that are not recognized are left in the output unchanged.
[[nodiscard]] std::string unescape(const std::string& str);

} // namespace html
} // namespace blankie
|
File diff suppressed because it is too large
Load Diff
|
@ -117,8 +117,8 @@ void from_json(const nlohmann::json& j, Illust& illust) {
|
|||
illust_details.at("upload_timestamp").get_to(illust.upload_time);
|
||||
|
||||
if (full_data) {
|
||||
if (illust_details.contains("comment") && illust_details["comment"].is_string()) {
|
||||
illust.comment = illust_details["comment"].get<std::string>();
|
||||
if (illust_details.contains("comment_html") && illust_details["comment_html"].is_string()) {
|
||||
illust.comment_html = illust_details["comment_html"].get<std::string>();
|
||||
}
|
||||
illust_details.at("display_tags").get_to(illust.tags);
|
||||
}
|
||||
|
@ -179,7 +179,7 @@ void from_json(const nlohmann::json& j, SearchResults& search_results) {
|
|||
// and i cba to use regex for it, especially when it's not even used in this context
|
||||
.upload_time = -1,
|
||||
|
||||
.comment = std::nullopt,
|
||||
.comment_html = std::nullopt,
|
||||
.tags = std::move(tags),
|
||||
.images = {get_illust_images(i, std::nullopt)},
|
||||
.page_count = i.at("pageCount").get<size_t>()
|
||||
|
|
|
@ -50,7 +50,7 @@ struct Illust {
|
|||
bool ai_generated;
|
||||
time_t upload_time;
|
||||
|
||||
std::optional<std::string> comment;
|
||||
std::optional<std::string> comment_html;
|
||||
std::vector<Tag> tags;
|
||||
std::vector<Images> images;
|
||||
size_t page_count;
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
#include "routes.h"
|
||||
#include "../blankie/murl.h"
|
||||
#include "../blankie/escape.h"
|
||||
#include "../blankie/unescape.h"
|
||||
#include "../servehelper.h"
|
||||
#include "../numberhelper.h"
|
||||
#include "../pixivclient.h"
|
||||
|
@ -9,10 +11,9 @@
|
|||
static inline Element generate_user_link(const httplib::Request& req, const Config& config, const Illust& illust);
|
||||
static inline Element generate_images(const httplib::Request& req, const Config& config, const Illust& illust);
|
||||
static inline Element generate_preview_images(const httplib::Request& req, const Config& config, const Illust& illust);
|
||||
static inline Nodes parse_description_line(const httplib::Request& req, const Config& config, std::string str);
|
||||
static inline Element generate_description(const httplib::Request& req, const Config& config, const std::string& description);
|
||||
static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust);
|
||||
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, std::string description);
|
||||
static inline blankie::html::HTMLString fix_description_links(const httplib::Request& req, const Config& config, blankie::html::HTMLString str);
|
||||
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, blankie::html::HTMLString description);
|
||||
static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview);
|
||||
|
||||
static inline bool is_true(const std::string& str);
|
||||
|
@ -48,8 +49,8 @@ void artworks_route(const httplib::Request& req, httplib::Response& res, const C
|
|||
!preview ? generate_images(req, config, illust) : generate_preview_images(req, config, illust),
|
||||
Element("br")
|
||||
});
|
||||
if (illust.comment) {
|
||||
body.nodes.push_back(generate_description(req, config, *illust.comment));
|
||||
if (illust.comment_html) {
|
||||
body.nodes.push_back(Element("p", {fix_description_links(req, config, blankie::html::HTMLString(*illust.comment_html))}));
|
||||
}
|
||||
body.nodes.push_back(generate_illust_tags(req, config, illust));
|
||||
body.nodes.push_back(Element("p", {time_to_string(illust.upload_time)}));
|
||||
|
@ -122,61 +123,6 @@ static inline Element generate_preview_images(const httplib::Request& req, const
|
|||
return div;
|
||||
}
|
||||
|
||||
static inline Nodes parse_description_line(const httplib::Request& req, const Config& config, std::string str) {
|
||||
Nodes nodes;
|
||||
std::smatch sm;
|
||||
|
||||
while (std::regex_search(str, sm, blankie::murl::full_url_regex)) {
|
||||
std::string prefix = sm.prefix();
|
||||
std::string url_str = sm.str(0);
|
||||
std::string suffix = sm.suffix();
|
||||
|
||||
if (prefix.ends_with('(') && url_str.ends_with(')')) {
|
||||
url_str.pop_back();
|
||||
suffix.insert(0, 1, ')');
|
||||
}
|
||||
if (!prefix.empty()) {
|
||||
nodes.push_back(std::move(prefix));
|
||||
}
|
||||
|
||||
blankie::murl::Url url(std::move(url_str));
|
||||
url_str = url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net")
|
||||
? proxy_pixiv_url(req, config, std::move(url))
|
||||
: url.to_string();
|
||||
nodes.push_back(Element("a", {{"href", url_str}}, {url_str}));
|
||||
|
||||
str = std::move(suffix);
|
||||
}
|
||||
if (!str.empty()) {
|
||||
nodes.push_back(std::move(str));
|
||||
}
|
||||
|
||||
return nodes;
|
||||
}
|
||||
|
||||
static inline Element generate_description(const httplib::Request& req, const Config& config, const std::string& description) {
|
||||
Element p("p");
|
||||
size_t pos = 0;
|
||||
size_t last_pos = 0;
|
||||
auto add = [&](std::string str) {
|
||||
if (!p.nodes.empty()) {
|
||||
p.nodes.push_back(Element("br"));
|
||||
}
|
||||
Nodes nodes = parse_description_line(req, config, std::move(str));
|
||||
p.nodes.insert(p.nodes.end(), nodes.begin(), nodes.end());
|
||||
};
|
||||
|
||||
while ((pos = description.find('\n', pos)) != std::string::npos) {
|
||||
add(description.substr(last_pos, pos - last_pos));
|
||||
last_pos = ++pos;
|
||||
}
|
||||
if (description.size() > last_pos) {
|
||||
add(description.substr(last_pos));
|
||||
}
|
||||
|
||||
return p;
|
||||
}
|
||||
|
||||
static inline Element generate_illust_tags(const httplib::Request& req, const Config& config, const Illust& illust) {
|
||||
Element div("div", {{"class", "illust-tags"}}, {});
|
||||
|
||||
|
@ -198,33 +144,63 @@ static inline Element generate_illust_tags(const httplib::Request& req, const Co
|
|||
return div;
|
||||
}
|
||||
|
||||
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, std::string description) {
|
||||
const std::regex start_link_regex("<a href=\"([^\"]+?)\"(?: target=\"_blank\")?>");
|
||||
// Rewrites the opening <a href="..."> tags (as matched by start_link_regex)
// inside an HTML description and returns the resulting markup:
//  - links to pixiv.net / www.pixiv.net are replaced by proxy_pixiv_url()
//  - links whose path is /jump.php are replaced by their percent-decoded query
//    (presumably pixiv's redirect page wrapping the real target -- confirm)
//  - every other link is re-serialized through Url::to_string()
// Any attribute other than href is dropped from the rewritten tag. All text
// outside the matched opening tags is copied through unchanged.
static inline blankie::html::HTMLString fix_description_links(const httplib::Request& req, const Config& config, blankie::html::HTMLString str) {
    using namespace std::string_literals;

    blankie::html::HTMLString out;
    out.str.reserve(str.str.size());

    // Walk the input with iterators instead of re-assigning the remaining
    // suffix to the input string after every match, which copied the whole
    // tail once per link
    std::smatch sm;
    std::string::const_iterator cursor = str.str.cbegin();
    const std::string::const_iterator end = str.str.cend();

    while (std::regex_search(cursor, end, sm, start_link_regex)) {
        out.str.append(cursor, sm[0].first);

        // The captured attribute value is still HTML-escaped. Decode it before
        // parsing, otherwise the escape() below escapes it a second time
        // (e.g. "&amp;" in a query string would come out as "&amp;amp;").
        blankie::murl::Url url(blankie::html::unescape(sm.str(1)));

        std::string url_str;
        if (url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net")) {
            url_str = proxy_pixiv_url(req, config, std::move(url));
        } else if (url.path == "/jump.php") {
            // NOTE(review): the host is not checked here; presumably these
            // links are relative in pixiv's markup -- confirm
            url_str = blankie::murl::unescape(std::move(url.query));
        } else {
            url_str = url.to_string();
        }
        out.str += "<a href=\""s + blankie::html::escape(std::move(url_str)) + "\">";

        cursor = sm[0].second;
    }
    out.str.append(cursor, end);

    return out;
}
|
||||
|
||||
const std::regex link_regex("<a href=\"([^\"]+?)\">.+?</a>");
|
||||
const std::regex tag_regex("<(.+?)>");
|
||||
static inline std::string generate_description_text(const httplib::Request& req, const Config& config, blankie::html::HTMLString description) {
|
||||
description = fix_description_links(req, config, std::move(description));
|
||||
|
||||
std::string new_description;
|
||||
std::smatch sm;
|
||||
|
||||
new_description.reserve(description.size());
|
||||
while (std::regex_search(description, sm, blankie::murl::full_url_regex)) {
|
||||
std::string prefix = sm.prefix();
|
||||
std::string url_str = sm.str(0);
|
||||
std::string suffix = sm.suffix();
|
||||
|
||||
if (prefix.ends_with('(') && url_str.ends_with(')')) {
|
||||
url_str.pop_back();
|
||||
suffix.insert(0, 1, ')');
|
||||
new_description.reserve(description.str.size());
|
||||
while (std::regex_search(description.str, sm, link_regex)) {
|
||||
new_description += sm.prefix();
|
||||
new_description += sm.str(1);
|
||||
description.str = sm.suffix();
|
||||
}
|
||||
new_description += std::move(prefix);
|
||||
new_description += std::move(description.str);
|
||||
|
||||
blankie::murl::Url url(std::move(url_str));
|
||||
url_str = url.is_host_equal("pixiv.net") || url.is_host_equal("www.pixiv.net")
|
||||
? proxy_pixiv_url(req, config, std::move(url))
|
||||
: url.to_string();
|
||||
new_description += std::move(url_str);
|
||||
|
||||
description = std::move(suffix);
|
||||
description.str = std::move(new_description);
|
||||
new_description.reserve(description.str.size());
|
||||
while (std::regex_search(description.str, sm, tag_regex)) {
|
||||
new_description += sm.prefix();
|
||||
if (sm.str(1) == "br /") {
|
||||
new_description += '\n';
|
||||
}
|
||||
new_description += std::move(description);
|
||||
description.str = sm.suffix();
|
||||
}
|
||||
new_description += std::move(description.str);
|
||||
|
||||
return new_description;
|
||||
return blankie::html::unescape(std::move(new_description));
|
||||
}
|
||||
|
||||
static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config& config, const Illust& illust, bool preview) {
|
||||
|
@ -238,8 +214,8 @@ static inline Nodes generate_ogp_nodes(const httplib::Request& req, const Config
|
|||
Element("meta", {{"property", "og:site_name"}, {"content", "Pixwhile"}}, {}),
|
||||
Element("meta", {{"property", "og:url"}, {"content", std::move(url)}}, {})
|
||||
});
|
||||
if (illust.comment) {
|
||||
nodes.push_back(Element("meta", {{"property", "og:description"}, {"content", generate_description_text(req, config, *illust.comment)}}, {}));
|
||||
if (illust.comment_html) {
|
||||
nodes.push_back(Element("meta", {{"property", "og:description"}, {"content", generate_description_text(req, config, blankie::html::HTMLString(*illust.comment_html))}}, {}));
|
||||
}
|
||||
|
||||
// i don't even know what multiple og:images do anymore
|
||||
|
|
Loading…
Reference in New Issue