// URL parsing, percent-escaping and path normalization helpers (blankie::murl).
#include <regex>
#include <stdexcept>
#include <string>
#include <vector>
#include <ctype.h>
#include <climits>

#include "murl.h"
#include "../numberhelper.h"
|
|
|
|
// Building blocks for the URL regexes below. The names follow the grammar
// rules of RFC 3986, but several rules are deliberately looser (see the notes
// on each macro). Hex digits are only spelled in lower case, so the resulting
// patterns are expected to be compiled case-insensitively.
#define UNRESERVED "[\\w\\d\\-._~]"
#define PCT_ENCODED "%[\\da-f]{2}"
// A space is added to sub-delims to make it work with URLs that have a space
#define SUB_DELIMS "[!$&'()*+,;= ]"

// Capture group: userinfo (everything before the '@'), possibly empty
#define USERINFO "((?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|:)*)"
// A lot looser than RFC 3986, but things will go very wrong very quickly if I comply
// Hostname does not allow for stuff like "//The", but it is not important here
#define HOSTCHAR "[\\w\\d\\-.]"
// Capture group: host, i.e. a bracketed IP literal, "localhost", or a dotted name
#define HOST "(\\[[\\da-f:.]+\\]|localhost|" HOSTCHAR "+(?:\\." HOSTCHAR "+)+)"
// Capture group: port digits (may be empty, e.g. "host:")
#define PORT "(\\d*)"
#define AUTHORITY "(?:" USERINFO "@)?" HOST "(?::" PORT ")?"

#define PCHAR "(?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|[:@])"
#define SEGMENT PCHAR "*"
// Capture group: path, either empty or starting with '/'
#define PATH_ABEMPTY "((?:/" SEGMENT ")*)"

// Capture group: query (without the leading '?')
#define QUERY "((?:" PCHAR "|[/?])*)"
// Looser than RFC 3986, but fragments might as well own everything
// Capture group: fragment (without the leading '#')
#define FRAGMENT "([^ ]*)"

// Both regexes expose the same capture groups, in this order:
//   1 scheme, 2 userinfo, 3 host, 4 port, 5 path, 6 query, 7 fragment
//
// FULL_HTTP_HTTPS_REGEX requires the "//authority" part to be present.
#define FULL_HTTP_HTTPS_REGEX \
    "(?:(https?)?:)?//" AUTHORITY \
    PATH_ABEMPTY \
    "(?:\\?" QUERY ")?" \
    "(?:#" FRAGMENT ")?"
// HTTP_HTTPS_REGEX makes the "//authority" part optional, so path-only
// references such as "/a/b?c" match as well.
#define HTTP_HTTPS_REGEX \
    "(?:(https?)?:)?(?://" AUTHORITY ")?" \
    PATH_ABEMPTY \
    "(?:\\?" QUERY ")?" \
    "(?:#" FRAGMENT ")?"
|
|
|
|
// File-local helpers, defined at the bottom of this file.

// Writes the two upper-case hex digits of the byte `c` into `out`.
static inline void hexencode(char c, char out[2]);
// Combines two hex digits into one byte; throws std::invalid_argument on a non-hex digit.
static inline char hexdecode(char nibble1, char nibble2);
// Applies the path segment str[offset, offset + length) to `segments` ("." and ".." are resolved).
static void handle_segment(std::vector<std::string>& segments, const std::string& str, size_t offset, size_t length);
// Returns a lower-cased copy of `str`.
static std::string tolower(std::string str);
|
|
|
|
namespace blankie {
|
|
namespace murl {
|
|
|
|
std::regex full_url_regex(FULL_HTTP_HTTPS_REGEX, std::regex::icase);
|
|
std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase);
|
|
|
|
// Parses `str` and stores its components in this Url.
//
// The whole string has to match url_regex; otherwise std::invalid_argument is
// thrown with a message containing `str`. Components that are absent from the
// input are stored as empty strings, and a missing or empty port is stored
// as -1.
Url::Url(const std::string& str) {
    std::smatch sm;
    if (!std::regex_match(str, sm, url_regex)) {
        throw std::invalid_argument(str + " is not a URL");
    }

    // Capture groups, in order: scheme, userinfo, host, port, path, query, fragment
    this->scheme = sm.str(1);
    this->userinfo = sm.str(2);
    this->hostname = sm.str(3);
    // The port group may match an empty string (e.g. "host:"), so check the
    // length rather than whether the group participated in the match
    this->port = sm[4].length() > 0 ? to_int(sm.str(4)) : -1;
    this->path = sm.str(5);
    this->query = sm.str(6);
    this->fragment = sm.str(7);
}
|
|
|
|
// Serializes the stored components back into a URL string.
//
// The scheme, userinfo and port are only written when the hostname is
// non-empty. The port is skipped when it is -1, and an empty query or
// fragment is omitted together with its '?' or '#' delimiter.
std::string Url::to_string() const {
    std::string str;

    if (!this->hostname.empty()) {
        if (!this->scheme.empty()) {
            str += this->scheme;
            str += ':';
        }
        str += "//";
        if (!this->userinfo.empty()) {
            str += this->userinfo;
            str += '@';
        }
        str += this->hostname;
        if (this->port != -1) {
            str += ':';
            str += std::to_string(this->port);
        }
    }

    str += this->path;
    if (!this->query.empty()) {
        str += '?';
        str += this->query;
    }
    if (!this->fragment.empty()) {
        str += '#';
        str += this->fragment;
    }

    return str;
}
|
|
|
|
bool Url::is_host_equal(std::string other) const {
|
|
return tolower(this->hostname) == tolower(std::move(other));
|
|
}
|
|
|
|
std::string escape(const std::string& in) {
|
|
std::string out;
|
|
char encoded[2];
|
|
size_t pos = 0;
|
|
size_t last_pos = 0;
|
|
|
|
out.reserve(in.size());
|
|
while ((pos = in.find_first_not_of("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", pos)) != std::string::npos) {
|
|
out.append(in, last_pos, pos - last_pos);
|
|
hexencode(in[pos], encoded);
|
|
out += '%';
|
|
out.append(encoded, 2);
|
|
pos++;
|
|
last_pos = pos;
|
|
}
|
|
|
|
if (in.size() > last_pos) {
|
|
out.append(in, last_pos);
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
std::string unescape(const std::string& in) {
|
|
std::string out;
|
|
size_t pos = 0;
|
|
size_t last_pos = 0;
|
|
|
|
out.reserve(in.size());
|
|
while ((pos = in.find('%', pos)) != std::string::npos) {
|
|
if (pos + 2 >= in.size()) {
|
|
throw std::invalid_argument("String abruptly terminated while finding percent-encoded nibbles");
|
|
}
|
|
out.append(in, last_pos, pos - last_pos);
|
|
out += hexdecode(in[pos + 1], in[pos + 2]);
|
|
pos += 3;
|
|
last_pos = pos;
|
|
}
|
|
|
|
if (in.size() > last_pos) {
|
|
out.append(in, last_pos);
|
|
}
|
|
|
|
return out;
|
|
}
|
|
|
|
std::string normalize_path(const std::string& str) {
|
|
std::vector<std::string> segments;
|
|
std::string res;
|
|
size_t pos = 0;
|
|
size_t last_pos = 0;
|
|
bool starts_with_slash = false;
|
|
bool ends_with_slash = false;
|
|
|
|
if (str.size() > 0 && str[0] == '/') {
|
|
starts_with_slash = true;
|
|
last_pos = pos = 1;
|
|
}
|
|
|
|
while ((pos = str.find('/', pos)) != std::string::npos) {
|
|
handle_segment(segments, str, last_pos, pos - last_pos);
|
|
last_pos = ++pos;
|
|
}
|
|
if (str.size() > last_pos) {
|
|
handle_segment(segments, str, last_pos, str.size() - last_pos);
|
|
}
|
|
|
|
if (str.size() > 1 && str.back() == '/' && !segments.empty()) {
|
|
ends_with_slash = true;
|
|
}
|
|
|
|
if (starts_with_slash) {
|
|
res += '/';
|
|
}
|
|
for (size_t i = 0; i < segments.size(); i++) {
|
|
if (i != 0) {
|
|
res += '/';
|
|
}
|
|
res += std::move(segments[i]);
|
|
}
|
|
if (ends_with_slash) {
|
|
res += '/';
|
|
}
|
|
|
|
return res;
|
|
}
|
|
|
|
} // namespace murl
|
|
} // namespace blankie
|
|
|
|
// Writes the two upper-case hexadecimal digits of the byte `c` into `out`:
// out[0] receives the high nibble and out[1] the low nibble.
// `out` is not NUL-terminated.
static inline void hexencode(char c, char out[2]) {
    static const char digits[] = "0123456789ABCDEF";
    // Go through unsigned char so that bytes >= 0x80 index the table correctly
    // even where plain char is signed
    const unsigned char byte = static_cast<unsigned char>(c);

    out[0] = digits[byte >> 4];
    out[1] = digits[byte & 0xF];
}
|
|
|
|
// Combines two hexadecimal digits into one byte: `nibble1` is the high
// nibble and `nibble2` the low nibble. Both upper- and lower-case digits are
// accepted.
// Throws std::invalid_argument if either character is not a hex digit.
static inline char hexdecode(char nibble1, char nibble2) {
    // Maps a single hex digit to its value (0-15)
    auto decode_nibble = [](char nibble) -> int {
        if (nibble >= '0' && nibble <= '9') return nibble - '0';
        if (nibble >= 'A' && nibble <= 'F') return nibble - 'A' + 10;
        if (nibble >= 'a' && nibble <= 'f') return nibble - 'a' + 10;
        throw std::invalid_argument("Invalid percent-encoded nibble received");
    };

    const int high = decode_nibble(nibble1);
    const int low = decode_nibble(nibble2);
    return static_cast<char>((high << 4) | low);
}
|
|
|
|
// Applies one path segment, str[offset, offset + length), to `segments`:
//   - ".."  removes the last collected segment (no-op if there is none)
//   - "."   is ignored
//   - ""    (length 0) is ignored
//   - anything else is appended as a new segment
static void handle_segment(std::vector<std::string>& segments, const std::string& str, size_t offset, size_t length) {
    if (length == 2 && str[offset] == '.' && str[offset + 1] == '.') {
        // Going above the root is silently ignored
        if (!segments.empty()) {
            segments.pop_back();
        }
    } else if (length == 1 && str[offset] == '.') {
        // "." refers to the current directory; nothing to do
    } else if (length != 0) {
        segments.push_back(str.substr(offset, length));
    }
}
|
|
|
|
// Returns a copy of `str` with every byte lower-cased by ::tolower() from
// <ctype.h>.
static std::string tolower(std::string str) {
    for (char& c : str) {
        // ::tolower(int) has undefined behaviour for negative values other
        // than EOF, so bytes >= 0x80 must go through unsigned char first
        c = static_cast<char>(::tolower(static_cast<unsigned char>(c)));
    }
    return str;
}
|