#include #include #include #include "unescape.h" #include "unescape_data.h" // https://stackoverflow.com/a/42013433 static inline size_t codepoint_to_utf8(char* buf, const unsigned long code) { // you see, i don't care // https://t.me/NightShadowsHangout/670534 #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wconversion" if (code <= 0x7F) { buf[0] = static_cast(code); return 1; } else if (code <= 0x7FF) { buf[0] = 0xC0 | static_cast(code >> 6); /* 110xxxxx */ buf[1] = 0x80 | (code & 0x3F); /* 10xxxxxx */ return 2; } else if (code <= 0xFFFF) { buf[0] = 0xE0 | static_cast(code >> 12); /* 1110xxxx */ buf[1] = 0x80 | ((code >> 6) & 0x3F); /* 10xxxxxx */ buf[2] = 0x80 | (code & 0x3F); /* 10xxxxxx */ return 3; } else if (code <= 0x10FFFF) { buf[0] = 0xF0 | static_cast(code >> 18); /* 11110xxx */ buf[1] = 0x80 | ((code >> 12) & 0x3F); /* 10xxxxxx */ buf[2] = 0x80 | ((code >> 6) & 0x3F); /* 10xxxxxx */ buf[3] = 0x80 | (code & 0x3F); /* 10xxxxxx */ return 4; #pragma GCC diagnostic pop } else { throw std::invalid_argument("codepoint passed is bigger than 0x10FFFF"); } } static inline bool isdigit(char ch) { return ch >= '0' && ch <= '9'; } static inline bool ishex(char ch) { return isdigit(ch) || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); } static inline unsigned long decode_numeric_entity(const std::string& entity) { unsigned long codepoint; char* last_converted_char; errno = 0; if (entity[1] == 'x' || entity[1] == 'X') { if (entity.size() <= 2 || !ishex(entity[2])) { return 0; } codepoint = strtoul(&entity.c_str()[2], &last_converted_char, 16); } else { if (entity.size() <= 1 || !isdigit(entity[1])) { return 0; } codepoint = strtoul(&entity.c_str()[1], &last_converted_char, 10); } if ((codepoint == ULONG_MAX && errno == ERANGE) || last_converted_char[0] != '\0') { return 0; } if (codepoint >= 0x80 && codepoint <= 0x9F) { codepoint = windows1252_repl[codepoint - 0x80]; } if (!codepoint || codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) { codepoint = 0xFFFD; } return codepoint; } static inline unsigned long decode_string_entity(const std::string& entity) { for (const Entity& i : string_entities) { if (entity == i.string) { return i.codepoint; } } return 0; } static inline unsigned long decode_entity(std::string entity) { return !entity.empty() && entity[0] == '#' ? decode_numeric_entity(entity) : decode_string_entity(entity); } namespace blankie { namespace html { [[nodiscard]] std::string unescape(const std::string& str) { std::string output; unsigned long codepoint; char codepoint_buf[4]; size_t offset = 0, old_offset = 0, offset_end, codepoint_buf_size; output.reserve(str.size()); while ((offset = str.find('&', offset)) != std::string::npos) { offset_end = str.find(';', ++offset); if (offset_end == std::string::npos) { break; } codepoint = decode_entity(str.substr(offset, offset_end - offset)); if (codepoint) { codepoint_buf_size = codepoint_to_utf8(codepoint_buf, codepoint); output.append(str, old_offset, offset - old_offset - 1); output.append(codepoint_buf, codepoint_buf_size); old_offset = offset = offset_end + 1; } } if (str.size() > old_offset) { output.append(str, old_offset, std::string::npos); } return output; } } // namespace html } // namespace blankie