pixwhile/blankie/unescape.cpp

#include <string>
#include <climits>
#include <stdexcept>

#include "unescape.h"
#include "unescape_data.h"

// Encodes the codepoint `code` as UTF-8 into `buf` and returns the number of
// bytes written (1 to 4). `buf` must have room for at least 4 bytes.
// Throws std::invalid_argument if `code` is above 0x10FFFF.
// Adapted from https://stackoverflow.com/a/42013433
static inline size_t codepoint_to_utf8(char* buf, const unsigned long code) {
    if (code > 0x10FFFF) {
        throw std::invalid_argument("codepoint passed is bigger than 0x10FFFF");
    }

    // Builds a 10xxxxxx continuation byte from the six bits starting at `shift`.
    const auto continuation = [code](unsigned shift) {
        return static_cast<char>(0x80 | ((code >> shift) & 0x3F));
    };

    if (code <= 0x7F) {
        // Plain ASCII is stored as-is.
        buf[0] = static_cast<char>(code);
        return 1;
    }
    if (code <= 0x7FF) {
        buf[0] = static_cast<char>(0xC0 | (code >> 6));   /* 110xxxxx */
        buf[1] = continuation(0);
        return 2;
    }
    if (code <= 0xFFFF) {
        buf[0] = static_cast<char>(0xE0 | (code >> 12));  /* 1110xxxx */
        buf[1] = continuation(6);
        buf[2] = continuation(0);
        return 3;
    }

    buf[0] = static_cast<char>(0xF0 | (code >> 18));      /* 11110xxx */
    buf[1] = continuation(12);
    buf[2] = continuation(6);
    buf[3] = continuation(0);
    return 4;
}

// Returns true iff `ch` is one of the decimal digits '0' through '9'.
static inline bool isdigit(char ch) {
    // A negative distance from '0' wraps to a huge unsigned value, so a
    // single comparison covers both ends of the range.
    return static_cast<unsigned>(ch - '0') < 10u;
}

// Returns true iff `ch` is a hexadecimal digit: 0-9, a-f or A-F.
static inline bool ishex(char ch) {
    if ('0' <= ch && ch <= '9') {
        return true;
    }
    if ('a' <= ch && ch <= 'f') {
        return true;
    }
    return 'A' <= ch && ch <= 'F';
}

// Decodes the body of a numeric character reference: `entity` is the text
// between '&' and ';', starting with '#' (e.g. "#38" or "#x26").
//
// Returns 0 when the text is not a well-formed reference (no digits, or any
// character that is not a digit of the selected base), which tells the caller
// to leave the input untouched. Otherwise returns the codepoint to emit:
//   - 0x80..0x9F are remapped through the windows1252_repl table
//   - 0, surrogates (0xD800..0xDFFF) and anything above 0x10FFFF, including
//     values too large for unsigned long, become U+FFFD
//
// The digits are accumulated by hand rather than with strtoul(), because
// strtoul() in base 16 also swallows a "0x" prefix (so "#x0x10" would parse),
// stops silently at an embedded NUL, and reports overflow at a width that
// differs between platforms.
static inline unsigned long decode_numeric_entity(const std::string& entity) {
    constexpr unsigned long max_codepoint = 0x10FFFF;

    const bool is_hex = entity.size() > 1 && (entity[1] == 'x' || entity[1] == 'X');
    const unsigned long base = is_hex ? 16 : 10;
    const size_t digits_start = is_hex ? 2 : 1;

    // "#" or "#x" with nothing after it is not a reference.
    if (entity.size() <= digits_start) {
        return 0;
    }

    unsigned long codepoint = 0;
    for (size_t i = digits_start; i < entity.size(); i++) {
        const char ch = entity[i];
        unsigned long digit;

        if (ch >= '0' && ch <= '9') {
            digit = static_cast<unsigned long>(ch - '0');
        } else if (is_hex && ch >= 'a' && ch <= 'f') {
            digit = static_cast<unsigned long>(ch - 'a') + 10;
        } else if (is_hex && ch >= 'A' && ch <= 'F') {
            digit = static_cast<unsigned long>(ch - 'A') + 10;
        } else {
            return 0;
        }

        // Saturate: once the value is out of Unicode range it stays there, so
        // the accumulator can never overflow however many digits follow. The
        // remaining characters are still validated above.
        if (codepoint <= max_codepoint) {
            codepoint = codepoint * base + digit;
        }
    }

    if (codepoint >= 0x80 && codepoint <= 0x9F) {
        codepoint = windows1252_repl[codepoint - 0x80];
    }
    if (!codepoint || codepoint > max_codepoint || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) {
        codepoint = 0xFFFD;
    }

    return codepoint;
}

// Looks up a named entity (the text between '&' and ';') in string_entities
// by exact comparison. Returns the codepoint of the first matching entry, or
// 0 if no entry matches.
static inline unsigned long decode_string_entity(const std::string& entity) {
    unsigned long found = 0;

    for (const Entity& candidate : string_entities) {
        if (entity == candidate.string) {
            found = candidate.codepoint;
            break;
        }
    }

    return found;
}

// Decodes the text found between '&' and ';'. Text starting with '#' is
// handled as a numeric reference, anything else (including an empty string)
// as a named entity. Returns the codepoint, or 0 if the text is not a
// recognized entity.
static inline unsigned long decode_entity(std::string entity) {
    const bool is_numeric = !entity.empty() && entity[0] == '#';
    if (is_numeric) {
        return decode_numeric_entity(entity);
    }
    return decode_string_entity(entity);
}

namespace blankie {
namespace html {

[[nodiscard]] std::string unescape(const std::string& str) {
    std::string output;
    unsigned long codepoint;
    char codepoint_buf[4];
    size_t offset = 0, old_offset = 0, offset_end, codepoint_buf_size;

    output.reserve(str.size());
    while ((offset = str.find('&', offset)) != std::string::npos) {
        offset_end = str.find(';', ++offset);
        if (offset_end == std::string::npos) {
            break;
        }

        codepoint = decode_entity(str.substr(offset, offset_end - offset));
        if (codepoint) {
            codepoint_buf_size = codepoint_to_utf8(codepoint_buf, codepoint);
            output.append(str, old_offset, offset - old_offset - 1);
            output.append(codepoint_buf, codepoint_buf_size);
            old_offset = offset = offset_end + 1;
        }
    }

    if (str.size() > old_offset) {
        output.append(str, old_offset, std::string::npos);
    }
    return output;
}

} // namespace html
} // namespace blankie