126 lines
3.9 KiB
C++
126 lines
3.9 KiB
C++
#include <string>
|
|
#include <climits>
|
|
#include <stdexcept>
|
|
|
|
#include "unescape.h"
|
|
#include "unescape_data.h"
|
|
|
|
// https://stackoverflow.com/a/42013433
|
|
// Encodes the code point `code` as UTF-8 into `buf`.
//
// `buf` must have room for at least 4 bytes; no NUL terminator is written.
// Returns the number of bytes written (1 to 4).
// Throws std::invalid_argument if `code` is greater than 0x10FFFF.
//
// Surrogate values (0xD800-0xDFFF) are not rejected here; they are encoded
// like any other three-byte value.
static inline size_t codepoint_to_utf8(char* buf, const unsigned long code) {
    // Combines a UTF-8 marker prefix with payload bits into one output byte.
    // The explicit casts make the narrowing intentional, so no compiler
    // specific warning-suppression pragmas are needed.
    const auto make_byte = [](unsigned long marker, unsigned long payload) {
        return static_cast<char>(static_cast<unsigned char>(marker | payload));
    };
    // Continuation byte (10xxxxxx) carrying the 6 bits of `code` at `shift`.
    const auto continuation = [&](unsigned int shift) {
        return make_byte(0x80, (code >> shift) & 0x3F);
    };

    if (code <= 0x7F) {
        buf[0] = static_cast<char>(code);
        return 1;
    }
    if (code <= 0x7FF) {
        buf[0] = make_byte(0xC0, code >> 6); /* 110xxxxx */
        buf[1] = continuation(0);
        return 2;
    }
    if (code <= 0xFFFF) {
        buf[0] = make_byte(0xE0, code >> 12); /* 1110xxxx */
        buf[1] = continuation(6);
        buf[2] = continuation(0);
        return 3;
    }
    if (code <= 0x10FFFF) {
        buf[0] = make_byte(0xF0, code >> 18); /* 11110xxx */
        buf[1] = continuation(12);
        buf[2] = continuation(6);
        buf[3] = continuation(0);
        return 4;
    }

    throw std::invalid_argument("codepoint passed is bigger than 0x10FFFF");
}
|
|
|
|
// Returns true when `ch` is one of the characters '0' through '9'.
// Only compares against character literals, so the result does not depend
// on the current locale.
static inline bool isdigit(char ch) {
    const bool below_zero = ch < '0';
    const bool above_nine = ch > '9';
    return !(below_zero || above_nine);
}
|
|
|
|
// Returns true when `ch` is a hexadecimal digit: '0'-'9', 'a'-'f' or 'A'-'F'.
static inline bool ishex(char ch) {
    if (ch >= '0' && ch <= '9') {
        return true;
    }
    if (ch >= 'a' && ch <= 'f') {
        return true;
    }
    return ch >= 'A' && ch <= 'F';
}
|
|
|
|
static inline unsigned long decode_numeric_entity(const std::string& entity) {
|
|
unsigned long codepoint;
|
|
char* last_converted_char;
|
|
|
|
errno = 0;
|
|
if (entity[1] == 'x' || entity[1] == 'X') {
|
|
if (entity.size() <= 2 || !ishex(entity[2])) {
|
|
return 0;
|
|
}
|
|
codepoint = strtoul(&entity.c_str()[2], &last_converted_char, 16);
|
|
} else {
|
|
if (entity.size() <= 1 || !isdigit(entity[1])) {
|
|
return 0;
|
|
}
|
|
codepoint = strtoul(&entity.c_str()[1], &last_converted_char, 10);
|
|
}
|
|
|
|
if ((codepoint == ULONG_MAX && errno == ERANGE) || last_converted_char[0] != '\0') {
|
|
return 0;
|
|
}
|
|
|
|
if (codepoint >= 0x80 && codepoint <= 0x9F) {
|
|
codepoint = windows1252_repl[codepoint - 0x80];
|
|
}
|
|
if (!codepoint || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
|
|
codepoint = 0xFFFD;
|
|
}
|
|
|
|
return codepoint;
|
|
}
|
|
|
|
static inline unsigned long decode_string_entity(const std::string& entity) {
|
|
for (const Entity& i : string_entities) {
|
|
if (entity == i.string) {
|
|
return i.codepoint;
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static inline unsigned long decode_entity(std::string entity) {
|
|
return !entity.empty() && entity[0] == '#'
|
|
? decode_numeric_entity(entity)
|
|
: decode_string_entity(entity);
|
|
}
|
|
|
|
namespace blankie {
|
|
namespace html {
|
|
|
|
[[nodiscard]] std::string unescape(const std::string& str) {
|
|
std::string output;
|
|
unsigned long codepoint;
|
|
char codepoint_buf[4];
|
|
size_t offset = 0, old_offset = 0, offset_end, codepoint_buf_size;
|
|
|
|
output.reserve(str.size());
|
|
while ((offset = str.find('&', offset)) != std::string::npos) {
|
|
offset_end = str.find(';', ++offset);
|
|
if (offset_end == std::string::npos) {
|
|
break;
|
|
}
|
|
|
|
codepoint = decode_entity(str.substr(offset, offset_end - offset));
|
|
if (codepoint) {
|
|
codepoint_buf_size = codepoint_to_utf8(codepoint_buf, codepoint);
|
|
output.append(str, old_offset, offset - old_offset - 1);
|
|
output.append(codepoint_buf, codepoint_buf_size);
|
|
old_offset = offset = offset_end + 1;
|
|
}
|
|
}
|
|
|
|
if (str.size() > old_offset) {
|
|
output.append(str, old_offset, std::string::npos);
|
|
}
|
|
return output;
|
|
}
|
|
|
|
} // namespace html
|
|
} // namespace blankie
|