pixwhile/blankie/unescape.cpp

126 lines
3.9 KiB
C++

#include <string>
#include <climits>
#include <stdexcept>
#include "unescape.h"
#include "unescape_data.h"
// https://stackoverflow.com/a/42013433
/// Encodes a single code point as UTF-8.
///
/// @param buf  Destination buffer; receives between 1 and 4 bytes, so it must
///             have room for 4. No terminator is written.
/// @param code Code point to encode. Values in the surrogate range are not
///             rejected here; only the upper bound is checked.
/// @return Number of bytes stored in `buf`.
/// @throws std::invalid_argument if `code` is above 0x10FFFF.
static inline size_t codepoint_to_utf8(char* buf, const unsigned long code) {
    if (code > 0x10FFFF) {
        throw std::invalid_argument("codepoint passed is bigger than 0x10FFFF");
    }

    // Every trailing byte has the shape 10xxxxxx and carries six payload bits.
    const auto trail = [](const unsigned long bits) {
        return static_cast<char>(0x80 | (bits & 0x3F));
    };

    if (code <= 0x7F) {
        // Plain ASCII: 0xxxxxxx
        buf[0] = static_cast<char>(code);
        return 1;
    }
    if (code <= 0x7FF) {
        buf[0] = static_cast<char>(0xC0 | (code >> 6)); // 110xxxxx
        buf[1] = trail(code);
        return 2;
    }
    if (code <= 0xFFFF) {
        buf[0] = static_cast<char>(0xE0 | (code >> 12)); // 1110xxxx
        buf[1] = trail(code >> 6);
        buf[2] = trail(code);
        return 3;
    }

    buf[0] = static_cast<char>(0xF0 | (code >> 18)); // 11110xxx
    buf[1] = trail(code >> 12);
    buf[2] = trail(code >> 6);
    buf[3] = trail(code);
    return 4;
}
/// Returns true when `ch` is one of the ASCII decimal digits '0'..'9'.
static inline bool isdigit(char ch) {
    return !(ch < '0' || ch > '9');
}
/// Returns true when `ch` is an ASCII hexadecimal digit: 0-9, a-f or A-F.
static inline bool ishex(char ch) {
    if (isdigit(ch)) {
        return true;
    }
    const bool lowercase_letter = 'a' <= ch && ch <= 'f';
    const bool uppercase_letter = 'A' <= ch && ch <= 'F';
    return lowercase_letter || uppercase_letter;
}
static inline unsigned long decode_numeric_entity(const std::string& entity) {
unsigned long codepoint;
char* last_converted_char;
errno = 0;
if (entity[1] == 'x' || entity[1] == 'X') {
if (entity.size() <= 2 || !ishex(entity[2])) {
return 0;
}
codepoint = strtoul(&entity.c_str()[2], &last_converted_char, 16);
} else {
if (entity.size() <= 1 || !isdigit(entity[1])) {
return 0;
}
codepoint = strtoul(&entity.c_str()[1], &last_converted_char, 10);
}
if ((codepoint == ULONG_MAX && errno == ERANGE) || last_converted_char[0] != '\0') {
return 0;
}
if (codepoint >= 0x80 && codepoint <= 0x9F) {
codepoint = windows1252_repl[codepoint - 0x80];
}
if (!codepoint || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
codepoint = 0xFFFD;
}
return codepoint;
}
/// Looks up a named entity in `string_entities`.
///
/// @param entity Entity name to compare against each table entry's `string`.
/// @return The `codepoint` of the first matching entry, or 0 when no entry
///         matches.
static inline unsigned long decode_string_entity(const std::string& entity) {
    unsigned long codepoint = 0; // stays 0 when the name is unknown
    for (const Entity& known : string_entities) {
        if (entity == known.string) {
            codepoint = known.codepoint;
            break;
        }
    }
    return codepoint;
}
/// Decodes one entity body (the text between '&' and ';').
///
/// Bodies that start with '#' are handed to decode_numeric_entity(); anything
/// else, including the empty string, goes to decode_string_entity().
///
/// @return Whatever the selected decoder returns.
static inline unsigned long decode_entity(std::string entity) {
    const bool is_numeric = !entity.empty() && entity[0] == '#';
    if (is_numeric) {
        return decode_numeric_entity(entity);
    }
    return decode_string_entity(entity);
}
namespace blankie {
namespace html {
[[nodiscard]] std::string unescape(const std::string& str) {
std::string output;
unsigned long codepoint;
char codepoint_buf[4];
size_t offset = 0, old_offset = 0, offset_end, codepoint_buf_size;
output.reserve(str.size());
while ((offset = str.find('&', offset)) != std::string::npos) {
offset_end = str.find(';', ++offset);
if (offset_end == std::string::npos) {
break;
}
codepoint = decode_entity(str.substr(offset, offset_end - offset));
if (codepoint) {
codepoint_buf_size = codepoint_to_utf8(codepoint_buf, codepoint);
output.append(str, old_offset, offset - old_offset - 1);
output.append(codepoint_buf, codepoint_buf_size);
old_offset = offset = offset_end + 1;
}
}
if (str.size() > old_offset) {
output.append(str, old_offset, std::string::npos);
}
return output;
}
} // namespace html
} // namespace blankie