#include #include #include #include #include "murl.h" #include "../numberhelper.h" #define UNRESERVED "[\\w\\d\\-._~]" #define PCT_ENCODED "%[\\da-f]{2}" #define SUB_DELIMS "[!$&'()*+,;=]" #define USERINFO "((?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|:)*)" // A lot looser than RFC 3986, but things will go very wrong very quickly if I comply // Hostname does not allow for stuff like "//The", but it is not important here #define HOSTCHAR "[\\w\\d\\-.]" #define HOST "(\\[[\\da-f:.]+\\]|localhost|" HOSTCHAR "+(?:\\." HOSTCHAR "+)+)" #define PORT "(\\d*)" #define AUTHORITY "(?:" USERINFO "@)?" HOST "(?::" PORT ")?" #define PCHAR "(?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|[:@])" #define SEGMENT PCHAR "*" #define PATH_ABEMPTY "((?:/" SEGMENT ")*)" #define QUERY "((?:" PCHAR "|[/?])*)" // Looser than RFC 3986, but fragments might as well own everything #define FRAGMENT "([^ ]*)" #define HTTP_HTTPS_REGEX \ "(?:(https?)?:)?(?://" AUTHORITY ")?" \ PATH_ABEMPTY \ "(?:\\?" QUERY ")?" \ "(?:#" FRAGMENT ")?" static inline void hexencode(char c, char out[2]); static inline char hexdecode(char nibble1, char nibble2); static void handle_segment(std::vector& segments, const std::string& str, size_t offset, size_t length); static std::string tolower(std::string str); namespace blankie { namespace murl { const std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase); Url::Url(const std::string& str) { std::smatch sm; if (!std::regex_match(str, sm, url_regex)) { throw std::invalid_argument(str + " is not a URL"); } this->scheme = sm.str(1); this->userinfo = sm.str(2); this->hostname = sm.str(3); this->port = sm[4].length() > 0 ? to_int(sm.str(4)) : -1; this->path = sm.str(5); this->query = sm.str(6); this->fragment = sm.str(7); } std::string Url::to_string() const { std::string str; if (!this->hostname.empty()) { if (!this->scheme.empty()) { str += this->scheme + ':'; } str += "//"; if (!this->userinfo.empty()) { str += this->userinfo + '@'; } str += this->hostname; if (this->port != -1) { str += ':'; str += std::to_string(this->port); } } str += this->path; if (!this->query.empty()) { str += '?'; str += this->query; } if (!this->fragment.empty()) { str += '#'; str += this->fragment; } return str; } bool Url::is_host_equal(std::string other) const { return tolower(this->hostname) == tolower(std::move(other)); } std::string escape(const std::string& in) { std::string out; char encoded[2]; size_t pos = 0; size_t last_pos = 0; out.reserve(in.size()); while ((pos = in.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", pos)) != std::string::npos) { out.append(in, last_pos, pos - last_pos); hexencode(in[pos], encoded); out += '%'; out.append(encoded, 2); pos++; last_pos = pos; } if (in.size() > last_pos) { out.append(in, last_pos); } return out; } std::string unescape(const std::string& in) { std::string out; size_t pos = 0; size_t last_pos = 0; out.reserve(in.size()); while ((pos = in.find('%', pos)) != std::string::npos) { if (pos + 2 >= in.size()) { throw std::invalid_argument("String abruptly terminated while finding percent-encoded nibbles"); } out.append(in, last_pos, pos - last_pos); out += hexdecode(in[pos + 1], in[pos + 2]); pos += 3; last_pos = pos; } if (in.size() > last_pos) { out.append(in, last_pos); } return out; } std::string normalize_path(const std::string& str) { std::vector segments; std::string res; size_t pos = 0; size_t last_pos = 0; bool starts_with_slash = false; bool ends_with_slash = false; if (str.size() > 0 && str[0] == '/') { starts_with_slash = true; last_pos = pos = 1; } while ((pos = str.find('/', pos)) != std::string::npos) { handle_segment(segments, str, last_pos, pos - last_pos); last_pos = ++pos; } if (str.size() > last_pos) { handle_segment(segments, str, last_pos, str.size() - last_pos); } if (str.size() > 1 && str.back() == '/' && !segments.empty()) { ends_with_slash = true; } if (starts_with_slash) { res += '/'; } for (size_t i = 0; i < segments.size(); i++) { if (i != 0) { res += '/'; } res += std::move(segments[i]); } if (ends_with_slash) { res += '/'; } return res; } } // namespace murl } // namespace blankie static inline void hexencode(char c, char out[2]) { char nibble1 = (c >> 4) & 0xF; char nibble2 = c & 0xF; auto hexencode = [](char nibble) { return static_cast(nibble < 10 ? '0' + nibble : 'A' + nibble - 10); }; out[0] = hexencode(nibble1); out[1] = hexencode(nibble2); } static inline char hexdecode(char nibble1, char nibble2) { auto hexdecode = [](char nibble) { if (nibble >= '0' && nibble <= '9') return static_cast(nibble - '0'); if (nibble >= 'A' && nibble <= 'F') return static_cast(nibble - 'A' + 10); if (nibble >= 'a' && nibble <= 'f') return static_cast(nibble - 'a' + 10); throw std::invalid_argument("Invalid percent-encoded nibble received"); }; return static_cast((hexdecode(nibble1) << 4) | hexdecode(nibble2)); } static void handle_segment(std::vector& segments, const std::string& str, size_t offset, size_t length) { if (length == 2 && str[offset] == '.' && str[offset + 1] == '.') { if (segments.empty()) { return; } segments.pop_back(); } else if (length == 1 && str[offset] == '.') { // do nothing } else if (length != 0) { segments.push_back(str.substr(offset, length)); } } static std::string tolower(std::string str) { for (size_t i = 0; i < str.size(); i++) { str[i] = static_cast(tolower(str[i])); } return str; }