pixwhile/blankie/murl.cpp

159 lines
4.4 KiB
C++
Raw Normal View History

2023-04-06 12:24:09 +00:00
#include <regex>
#include <vector>
2023-04-09 17:34:13 +00:00
#include <ctype.h>
2023-04-06 12:24:09 +00:00
#include <climits>
#include "murl.h"
#include "../numberhelper.h"
2023-04-06 12:24:09 +00:00
// RFC 3986 "unreserved" characters as a regex character class (\w already covers digits and '_').
#define UNRESERVED "[\\w\\d\\-._~]"
// A percent-encoded octet: '%' plus two hex digits. Only lowercase a-f is listed; the regexes
// built from these macros in this file are compiled with std::regex::icase.
#define PCT_ENCODED "%[\\da-f]{2}"
// A space is added to sub-delims to make it work with URLs that have a space
#define SUB_DELIMS "[!$&'()*+,;= ]"
// Everything allowed before the '@' of an authority (may be empty), captured as one group.
#define USERINFO "((?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|:)*)"
// A lot looser than RFC 3986, but things will go very wrong very quickly if I comply
2023-04-09 17:08:01 +00:00
// Hostname does not allow for stuff like "//The", but it is not important here
#define HOSTCHAR "[\\w\\d\\-.]"
2023-04-09 17:34:13 +00:00
#define HOST "(\\[[\\da-f:.]+\\]|localhost|" HOSTCHAR "+(?:\\." HOSTCHAR "+)+)"
2023-04-06 12:24:09 +00:00
// Port digits, captured. May be empty, so "host:" with nothing after the colon still matches.
#define PORT "(\\d*)"
// Optional "userinfo@", then the host, then an optional ":port".
#define AUTHORITY "(?:" USERINFO "@)?" HOST "(?::" PORT ")?"
// One path character: unreserved, percent-encoded, sub-delim, ':' or '@'.
#define PCHAR "(?:" UNRESERVED "|" PCT_ENCODED "|" SUB_DELIMS "|[:@])"
// A single path segment: zero or more path characters.
#define SEGMENT PCHAR "*"
// Zero or more "/segment" pieces captured as a whole; also matches the empty path.
#define PATH_ABEMPTY "((?:/" SEGMENT ")*)"
// Query text (without the leading '?'): path characters plus '/' and '?', captured as a whole.
#define QUERY "((?:" PCHAR "|[/?])*)"
// Looser than RFC 3986, but fragments might as well own everything
2023-04-09 17:08:01 +00:00
#define FRAGMENT "([^ ]*)"
2023-04-06 12:24:09 +00:00
2023-04-09 17:08:01 +00:00
// Pattern for URLs that must carry an authority ("//host..."). The "http:"/"https:" prefix is
// optional (a bare ':' before the "//" is accepted as well).
// Capture groups: 1 scheme, 2 userinfo, 3 host, 4 port, 5 path, 6 query, 7 fragment.
#define FULL_HTTP_HTTPS_REGEX \
"(?:(https?)?:)?//" AUTHORITY \
PATH_ABEMPTY \
"(?:\\?" QUERY ")?" \
"(?:#" FRAGMENT ")?"
#define HTTP_HTTPS_REGEX \
2023-04-06 12:24:09 +00:00
"(?:(https?)?:)?(?://" AUTHORITY ")?" \
PATH_ABEMPTY \
"(?:\\?" QUERY ")?" \
"(?:#" FRAGMENT ")?"
static void handle_segment(std::vector<std::string>& segments, const std::string& str, size_t offset, size_t length);
2023-04-09 17:34:13 +00:00
static std::string tolower(std::string str);
2023-04-06 12:24:09 +00:00
namespace blankie {
namespace murl {
2023-04-09 17:08:01 +00:00
// Case-insensitive matcher built from FULL_HTTP_HTTPS_REGEX, in which the "//authority" part is mandatory.
std::regex full_url_regex(FULL_HTTP_HTTPS_REGEX, std::regex::icase);
// Case-insensitive matcher built from HTTP_HTTPS_REGEX, in which the authority is optional;
// this is the pattern the Url constructor matches its input against.
std::regex url_regex(HTTP_HTTPS_REGEX, std::regex::icase);
2023-04-06 12:24:09 +00:00
// Parses `str` with url_regex and fills the URL components from its capture groups.
// Throws std::invalid_argument when the whole string does not match.
Url::Url(const std::string& str) {
    std::smatch groups;
    const bool matched = std::regex_match(str, groups, url_regex);
    if (!matched) {
        throw std::invalid_argument(str + " is not a URL");
    }

    // Group order follows the pattern: scheme, userinfo, host, port, path, query, fragment.
    this->scheme = groups[1].str();
    this->userinfo = groups[2].str();
    this->hostname = groups[3].str();

    // A missing or empty port is stored as -1.
    const std::ssub_match& port_group = groups[4];
    this->port = port_group.length() > 0 ? to_int(port_group.str()) : -1;

    this->path = groups[5].str();
    this->query = groups[6].str();
    this->fragment = groups[7].str();
}
// Reassembles the URL text from its components.
// Scheme, userinfo and port are only written when a hostname is present;
// empty query and fragment parts are left out together with their '?' / '#'.
std::string Url::to_string() const {
    std::string out;

    const bool has_host = !this->hostname.empty();
    if (has_host) {
        if (!this->scheme.empty()) {
            out.append(this->scheme);
            out.push_back(':');
        }
        out.append("//");
        if (!this->userinfo.empty()) {
            out.append(this->userinfo);
            out.push_back('@');
        }
        out.append(this->hostname);
        // -1 marks "no port".
        if (this->port != -1) {
            out.push_back(':');
            out.append(std::to_string(this->port));
        }
    }

    out.append(this->path);

    if (!this->query.empty()) {
        out.push_back('?');
        out.append(this->query);
    }
    if (!this->fragment.empty()) {
        out.push_back('#');
        out.append(this->fragment);
    }

    return out;
}
2023-04-09 17:34:13 +00:00
// Compares this URL's hostname with `other`, ignoring letter case
// (both sides are lowercased with the file-local tolower helper first).
bool Url::is_host_equal(std::string other) const {
    const std::string theirs = tolower(std::move(other));
    const std::string ours = tolower(this->hostname);
    return ours == theirs;
}
2023-04-06 12:24:09 +00:00
// Collapses a slash-separated path: empty and "." segments are dropped and ".." removes the
// previously kept segment (or is ignored when there is none), as implemented by handle_segment.
// A leading '/' is preserved; a trailing '/' is preserved only if at least one segment survives.
std::string normalize_path(const std::string& str) {
    const bool leading_slash = !str.empty() && str.front() == '/';

    std::vector<std::string> kept;
    size_t segment_start = leading_slash ? 1 : 0;

    // Feed every '/'-terminated segment to handle_segment.
    for (size_t slash = str.find('/', segment_start);
         slash != std::string::npos;
         slash = str.find('/', segment_start)) {
        handle_segment(kept, str, segment_start, slash - segment_start);
        segment_start = slash + 1;
    }
    // Whatever follows the last '/' (if anything) is the final segment.
    if (segment_start < str.size()) {
        handle_segment(kept, str, segment_start, str.size() - segment_start);
    }

    const bool trailing_slash = str.size() > 1 && str.back() == '/' && !kept.empty();

    std::string result;
    if (leading_slash) {
        result += '/';
    }
    bool first = true;
    for (const std::string& segment : kept) {
        if (!first) {
            result += '/';
        }
        first = false;
        result += segment;
    }
    if (trailing_slash) {
        result += '/';
    }
    return result;
}
2023-04-10 14:24:11 +00:00
} // namespace murl
} // namespace blankie
2023-04-06 12:24:09 +00:00
// Applies one path segment, given as str[offset, offset + length), to the list of kept segments:
//   ".."  removes the last kept segment (no-op when the list is empty),
//   "."   and the empty segment are ignored,
//   anything else is appended.
static void handle_segment(std::vector<std::string>& segments, const std::string& str, size_t offset, size_t length) {
    const bool is_parent = length == 2 && str.compare(offset, 2, "..") == 0;
    if (is_parent) {
        if (!segments.empty()) {
            segments.pop_back();
        }
        return;
    }

    const bool is_current = length == 1 && str[offset] == '.';
    if (is_current || length == 0) {
        return;
    }

    segments.emplace_back(str, offset, length);
}
2023-04-09 17:34:13 +00:00
// Returns a copy of `str` with every byte passed through the C library's tolower().
// Takes the string by value so callers can move a temporary in and get it back modified.
static std::string tolower(std::string str) {
    for (char& c : str) {
        // The C tolower() requires its argument to be representable as unsigned char (or EOF).
        // A plain char holding a byte >= 0x80 is negative where char is signed, so go through
        // unsigned char first to stay within the function's defined domain.
        c = static_cast<char>(::tolower(static_cast<unsigned char>(c)));
    }
    return str;
}