mangafetchi/src/utils.rs

746 lines
32 KiB
Rust

use crate::structs;
use std::io::Write;
use std::path::PathBuf;
use std::fs::{create_dir, File};
use url::Url;
use quick_xml::Reader;
use quick_xml::events::Event;
extern crate reqwest;
extern crate serde_json;
/// Turns free-form text into a lower-case, underscore-separated slug.
///
/// The input is lower-cased, accented Vietnamese vowels and `đ` are folded to
/// their plain ASCII letters, and the listed punctuation characters (including
/// the space) become `_`. Runs of `_` are collapsed to a single `_`, and any
/// leading or trailing `_` is removed. All other characters pass through
/// unchanged.
fn generate_slug(text: &str) -> String {
    let lowered = text.to_lowercase();
    let mut slug = String::with_capacity(lowered.len());
    for ch in lowered.chars() {
        let mapped = match ch {
            'à' | 'á' | 'ạ' | 'ả' | 'ã' | 'â' | 'ầ' | 'ấ' | 'ậ' | 'ẩ' | 'ẫ' | 'ă' | 'ằ' | 'ắ'
            | 'ặ' | 'ẳ' | 'ẵ' => 'a',
            'è' | 'é' | 'ẹ' | 'ẻ' | 'ẽ' | 'ê' | 'ề' | 'ế' | 'ệ' | 'ể' | 'ễ' => 'e',
            'ì' | 'í' | 'ị' | 'ỉ' | 'ĩ' => 'i',
            'ò' | 'ó' | 'ọ' | 'ỏ' | 'õ' | 'ô' | 'ồ' | 'ố' | 'ộ' | 'ổ' | 'ỗ' | 'ơ' | 'ờ' | 'ớ'
            | 'ợ' | 'ở' | 'ỡ' => 'o',
            'ù' | 'ú' | 'ụ' | 'ủ' | 'ũ' | 'ư' | 'ừ' | 'ứ' | 'ự' | 'ử' | 'ữ' => 'u',
            'ỳ' | 'ý' | 'ỵ' | 'ỷ' | 'ỹ' => 'y',
            'đ' => 'd',
            '!' | '@' | '%' | '^' | '*' | '(' | ')' | '+' | '=' | '<' | '>' | '?' | '/' | ','
            | '.' | ':' | ';' | '\'' | ' ' | '"' | '&' | '#' | '[' | ']' | '~' | '-' => '_',
            other => other,
        };
        // Never emit two underscores in a row, whether they came from the
        // input itself or from replaced punctuation.
        if mapped == '_' && slug.ends_with('_') {
            continue;
        }
        slug.push(mapped);
    }
    slug.trim_matches('_').to_string()
}
fn remove_html(text: &str) -> String {
let mut removed = String::new();
let mut reader = Reader::from_str(&text);
let mut buf = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(Event::Text(e)) => removed.push_str(&e.unescape_and_decode(&reader).unwrap()),
Err(err) => panic!("Error at position {}: {}", reader.buffer_position(), err),
Ok(Event::Eof) => break,
_ => ()
};
buf.clear();
}
removed
}
/// Searches mangakakalot.com for `query`.
///
/// The query is slugified with `generate_slug` and posted as the `searchword`
/// form field. An empty response body yields an empty list. Otherwise the body
/// is deserialized as JSON into search results, and the `name`, `last_chapter`
/// and `author` fields of every result are passed through `remove_html`.
///
/// # Errors
/// Returns an error if the request fails, the body cannot be read, or the body
/// is not valid JSON for a list of search results.
pub async fn search(client: reqwest::Client, query: &str) -> Result<Vec<structs::SearchResult>, structs::Error> {
    let slug = generate_slug(query);
    let response = client.post("https://mangakakalot.com/home_json_search")
        .form(&[("searchword", &slug)])
        .send()
        .await?;
    let body = response.text().await?;
    if body.is_empty() {
        return Ok(Vec::new());
    }
    let mut results: Vec<structs::SearchResult> = serde_json::from_str(&body)?;
    for result in results.iter_mut() {
        // Only the markup-bearing fields change; the rest stay as deserialized.
        result.name = remove_html(&result.name);
        result.last_chapter = remove_html(&result.last_chapter);
        result.author = remove_html(&result.author);
    }
    Ok(results)
}
/// Fetches and parses the manga page for `manga_id`.
///
/// The page is first requested from mangakakalot.com. If that page parses to a
/// redirect, the redirect target is fetched and parsed with the parser that
/// matches its host (`mangakakalot.com` or `manganelo.com`).
///
/// # Errors
/// Returns an error if a request fails, a body cannot be read, or the redirect
/// URL cannot be parsed.
///
/// # Panics
/// Panics if the redirect URL has no host, if its host is neither of the two
/// known sites, or if the redirect target itself parses to another redirect.
pub async fn get_manga(client: reqwest::Client, manga_id: &str) -> Result<structs::MangaOption, structs::Error> {
    let landing_url = format!("https://mangakakalot.com/manga/{}", manga_id);
    let landing_page = client.get(&landing_url).send().await?.text().await?;
    let redirect = match parse_mangakakalot_manga(&landing_page, manga_id) {
        structs::MangaOption::Redirect(redirect) => redirect,
        found @ structs::MangaOption::Manga(_) => return Ok(found),
        structs::MangaOption::DoesNotExist => return Ok(structs::MangaOption::DoesNotExist),
    };

    // The target is fetched before its URL is inspected, so a request failure
    // is reported ahead of any URL parse error.
    let target_page = client.get(&redirect.url).send().await?.text().await?;
    let target_url = Url::parse(&redirect.url)?;
    let parsed = match target_url.host_str().unwrap() {
        "mangakakalot.com" => parse_mangakakalot_manga(&target_page, manga_id),
        "manganelo.com" => parse_manganelo_manga(&target_page, manga_id),
        _ => panic!("Unknown URL: {}", &redirect.url),
    };
    if let structs::MangaOption::Redirect(_) = parsed {
        panic!("Nested redirect");
    }
    Ok(parsed)
}
/// Parses the HTML of a manganelo.com manga page into a `structs::MangaOption`.
///
/// The document is read as a stream of XML events. Boolean flags record which
/// elements are currently open, and each text node is routed to the first
/// matching destination (name, authors, status, genres, last-updated, summary,
/// chapter entry) based on those flags. Parsing stops at the first `</ul>` or
/// at end of input; the collected chapters are then reversed.
///
/// Returns `MangaOption::Redirect` as soon as a text node that no earlier
/// branch claimed starts with `"REDIRECT : "`; otherwise returns
/// `MangaOption::Manga` with `id` set to `manga_id`.
///
/// # Panics
/// Panics if the reader reports a parse error, or if no name was captured from
/// an `<h1>` before parsing stopped.
fn parse_manganelo_manga(text: &str, manga_id: &str) -> structs::MangaOption {
// Upper-case only the first line of the document before handing it to the reader.
// NOTE(review): presumably so a lower-case `<!doctype ...>` is accepted by the XML reader — confirm.
let mut split: Vec<&str> = text.splitn(2, "\n").collect();
let screaming_doctype = split[0].to_uppercase();
split[0] = &screaming_doctype;
let text = split.join("\n");
// Extracted fields.
let mut name: Option<String> = None;
let mut status: Option<String> = None;
let mut last_updated: Option<String> = None;
let mut summary: Option<String> = None;
let mut authors = Vec::new();
let mut genres = Vec::new();
let mut chapters = Vec::new();
// Parser state: which elements of interest are currently open.
let mut is_inside_h1 = false;
let mut is_inside_a = false;
let mut is_inside_td = false;
let mut is_inside_authors = false;
let mut is_inside_genres = false;
let mut is_inside_status = false;
// Set by the first `span.stre-value` and never cleared; only the
// `last_updated.is_none()` check below limits its effect.
let mut is_inside_stre_value = false;
let mut is_inside_h3 = false;
let mut is_inside_ul = false;
let mut is_inside_description = false;
// `href` of the most recent `<a>` opened inside a `<ul>`, consumed by the next chapter text node.
let mut tmp_chapter_link: Option<String> = None;
let mut reader = Reader::from_str(&text);
// Mismatched closing tag names are not treated as errors.
reader.check_end_names(false);
let mut buf = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(Event::Start(ref e)) => {
match e.name() {
b"a" => {
is_inside_a = true;
// Inside a list, remember the link target for the chapter entry.
if is_inside_ul {
let href = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => attribute.key == b"href",
Err(_) => false
}
});
if href.is_some() {
match href.unwrap().unwrap().unescape_and_decode_value(&reader) {
Ok(text) => tmp_chapter_link = Some(text),
Err(_) => ()
};
}
}
},
// Any `<ul>` counts; there is no class check here.
b"ul" => is_inside_ul = true,
b"h1" => is_inside_h1 = true,
b"h3" => is_inside_h3 = true,
b"td" => {
// Only cells whose `class` is exactly "table-value" carry field values.
let is_table_value = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => {
attribute.key == b"class" &&
match attribute.unescape_and_decode_value(&reader) {
Ok(text) => text.as_str() == "table-value",
Err(_) => false
}
},
Err(_) => false
}
}).is_some();
if is_table_value {
is_inside_td = true;
}
},
b"i" => {
// The class of an `<i>` element selects which field the following value cell belongs to.
let class = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => attribute.key == b"class",
Err(_) => false
}
});
if class.is_some() {
match class.unwrap().unwrap().unescape_and_decode_value(&reader) {
Ok(class_name) => {
match class_name.as_str() {
"info-author" => is_inside_authors = true,
"info-status" => is_inside_status = true,
"info-genres" => is_inside_genres = true,
_ => ()
};
},
Err(_) => ()
};
}
},
b"span" => {
let is_stre_value = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => {
attribute.key == b"class" &&
match attribute.unescape_and_decode_value(&reader) {
Ok(text) => text.as_str() == "stre-value",
Err(_) => false
}
},
Err(_) => false
}
}).is_some();
if is_stre_value {
is_inside_stre_value = true;
}
},
b"div" => {
let is_description = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => {
attribute.key == b"class" &&
match attribute.unescape_and_decode_value(&reader) {
Ok(text) => text.as_str() == "panel-story-info-description",
Err(_) => false
}
},
Err(_) => false
}
}).is_some();
if is_description {
is_inside_description = true;
}
},
_ => ()
};
},
Ok(Event::Text(e)) => {
// Text that cannot be unescaped/decoded is skipped.
let text = match e.unescape_and_decode(&reader) {
Ok(text) => text,
Err(_) => {
buf.clear();
continue;
}
};
let text = text.trim();
// The order of this chain matters: the first matching branch consumes the text node.
if name.is_none() && is_inside_h1 {
// Only the first text node inside an `<h1>` becomes the name.
name = Some(text.to_string());
} else if is_inside_authors && is_inside_td && is_inside_a {
authors.push(text.to_string());
} else if is_inside_status && is_inside_td {
status = Some(text.to_string());
} else if is_inside_genres && is_inside_td && is_inside_a {
genres.push(text.to_string());
} else if last_updated.is_none() && is_inside_stre_value {
last_updated = Some(text.to_string());
} else if is_inside_description && !is_inside_h3 && !text.is_empty() {
// Non-empty description text outside an `<h3>` is appended to the summary without a separator.
if summary.is_some() {
summary.as_mut().unwrap().push_str(text);
} else {
summary = Some(text.to_string());
}
} else if is_inside_ul && is_inside_a && tmp_chapter_link.is_some() {
// Chapter name: whatever follows the first ':' or '-' in the link text, if any.
let chapter_name = match text.splitn(2, &[':', '-'][..]).nth(1) {
Some(text) => Some(text.trim().to_string()),
None => None
};
// Chapter number: the part of the href after its last '_'.
match tmp_chapter_link.unwrap().rsplitn(2, '_').nth(0) {
Some(chapter_number) => {
chapters.push(structs::Chapter {
chapter_number: chapter_number.to_string(),
chapter_name: chapter_name,
domain: "manganelo.com".to_string()
});
},
None => ()
};
tmp_chapter_link = None;
} else if text.starts_with("REDIRECT : ") {
// The redirect target is everything after the first ':' in the text, trimmed.
return structs::MangaOption::Redirect(structs::Redirect { url: text.splitn(2, ':').nth(1).unwrap().trim().to_string() });
}
},
Ok(Event::End(e)) => {
match e.name() {
b"a" => is_inside_a = false,
b"h1" => is_inside_h1 = false,
b"h3" => is_inside_h3 = false,
b"td" => {
// Closing a value cell also ends whichever field it belonged to.
if is_inside_td {
is_inside_td = false;
is_inside_authors = false;
is_inside_genres = false;
is_inside_status = false;
}
},
// Any closing `</div>` ends the description, including a nested one.
b"div" => is_inside_description = false,
// The first closing `</ul>` ends parsing altogether.
b"ul" => break,
_ => ()
};
},
Err(err) => panic!("Error at position {}: {}", reader.buffer_position(), err),
Ok(Event::Eof) => break,
_ => ()
};
buf.clear();
}
// Chapters were collected in document order; the result stores them reversed.
chapters.reverse();
structs::MangaOption::Manga(structs::Manga {
id: manga_id.to_string(),
// Panics when no `<h1>` text was seen.
name: name.unwrap(),
authors: authors,
status: status,
last_updated: last_updated,
genres: genres,
summary: summary,
chapters: chapters
}
)
}
/// Parses the HTML of a mangakakalot.com manga page into a `structs::MangaOption`.
///
/// The document is read as a stream of XML events. Boolean flags record which
/// elements are currently open, and each text node is routed to the first
/// matching destination based on those flags. Parsing stops when the
/// `chapter-list` div is closed or at end of input; the collected chapters are
/// then reversed.
///
/// Returns:
/// * `MangaOption::DoesNotExist` if a `</title>` is reached and the last title
///   text seen was empty (or no title text was seen at all);
/// * `MangaOption::Redirect` if a text node that no earlier branch claimed
///   starts with `"REDIRECT : "`;
/// * `MangaOption::Manga` otherwise, with `id` set to `manga_id`.
///
/// # Panics
/// Panics if the reader reports a parse error, or if no name was captured from
/// an `<h1>` inside the `manga-info-text` list.
fn parse_mangakakalot_manga(text: &str, manga_id: &str) -> structs::MangaOption {
// Upper-case only the first line of the document before handing it to the reader.
// NOTE(review): presumably so a lower-case `<!doctype ...>` is accepted by the XML reader — confirm.
let mut split: Vec<&str> = text.splitn(2, "\n").collect();
let screaming_doctype = split[0].to_uppercase();
split[0] = &screaming_doctype;
let text = split.join("\n");
// Parser state: which elements of interest are currently open.
// `is_inside_title` is set on `<title>` and never cleared afterwards.
let mut is_inside_title = false;
let mut is_title_real = false;
let mut is_inside_chapter_list = false;
let mut is_inside_manga_info = false;
let mut is_inside_authors = false;
let mut is_inside_genres = false;
let mut is_inside_a = false;
let mut is_inside_row = false;
// Extracted fields.
let mut name: Option<String> = None;
let mut status: Option<String> = None;
let mut last_updated: Option<String> = None;
let mut summary: Option<String> = None;
let mut is_inside_noidungm = false;
let mut is_inside_h1 = false;
let mut is_inside_h2 = false;
let mut authors = Vec::new();
let mut genres = Vec::new();
let mut chapters = Vec::new();
// `href` of the most recent `<a>` opened inside the chapter list, consumed by the next chapter text node.
let mut tmp_chapter_link: Option<String> = None;
let mut reader = Reader::from_str(&text);
// Mismatched closing tag names are not treated as errors.
reader.check_end_names(false);
let mut buf = Vec::new();
loop {
match reader.read_event(&mut buf) {
Ok(Event::Start(ref e)) => {
match e.name() {
b"ul" => {
// The list whose `class` is exactly "manga-info-text" holds name, authors, status, etc.
let is_manga_info_text = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => {
attribute.key == b"class" &&
match attribute.unescape_and_decode_value(&reader) {
Ok(text) => text.as_str() == "manga-info-text",
Err(_) => false
}
},
Err(_) => false
}
}).is_some();
if is_manga_info_text {
is_inside_manga_info = true;
}
},
b"div" => {
// First check the `class` attribute (chapter list / row)...
let class = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => attribute.key == b"class",
Err(_) => false
}
});
if class.is_some() {
match class.unwrap().unwrap().unescape_and_decode_value(&reader) {
Ok(class_name) => {
match class_name.as_str() {
"chapter-list" => is_inside_chapter_list = true,
"row" => is_inside_row = true,
_ => ()
};
},
Err(_) => ()
};
}
// ...then, independently, the `id` attribute (summary container).
let inside_noidungm = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => {
attribute.key == b"id" &&
match attribute.unescape_and_decode_value(&reader) {
Ok(text) => text.as_str() == "noidungm",
Err(_) => false
}
},
Err(_) => false
}
}).is_some();
if inside_noidungm {
is_inside_noidungm = true;
}
},
b"h1" => is_inside_h1 = true,
b"h2" => is_inside_h2 = true,
b"a" => {
is_inside_a = true;
// Inside the chapter list, remember the link target for the chapter entry.
if is_inside_chapter_list {
let href = e.attributes()
.find(|attribute| {
match attribute.as_ref() {
Ok(attribute) => attribute.key == b"href",
Err(_) => false
}
});
if href.is_some() {
match href.unwrap().unwrap().unescape_and_decode_value(&reader) {
Ok(text) => tmp_chapter_link = Some(text),
Err(_) => ()
};
}
}
},
b"title" => is_inside_title = true,
_ => ()
};
},
Ok(Event::Text(e)) => {
// Text that cannot be unescaped/decoded is skipped.
let text = match e.unescape_and_decode(&reader) {
Ok(text) => text,
Err(_) => {
buf.clear();
continue;
}
};
let text = text.trim();
// The order of this chain matters: the first matching branch consumes the text node.
if is_inside_manga_info {
// Every text node inside the info list is handled here and nowhere else.
if is_inside_h1 {
// Each text node inside the `<h1>` overwrites the name.
name = Some(text.to_string());
} else if is_inside_authors && is_inside_a {
authors.push(text.to_string());
} else if is_inside_genres && is_inside_a {
genres.push(text.to_string());
} else {
// Label text: dispatch on the first space-separated word.
match text.splitn(2, ' ').nth(0).unwrap() {
"Author(s)" => is_inside_authors = true,
"Status" => {
// Status value: everything after the second space.
match text.splitn(3, ' ').nth(2) {
Some(text) => status = Some(text.to_string()),
None => ()
};
},
"Last" => {
if text.starts_with("Last updated : ") {
// Timestamp: everything after the third space.
match text.splitn(4, ' ').nth(3) {
Some(text) => last_updated = Some(text.to_string()),
None => ()
};
}
},
"Genres" => is_inside_genres = true,
_ => ()
}
}
} else if is_inside_noidungm && !is_inside_h2 && !text.is_empty(){
// Non-empty summary text outside an `<h2>` is appended without a separator.
if summary.is_some() {
summary.as_mut().unwrap().push_str(text);
} else {
summary = Some(text.to_string());
}
} else if is_inside_chapter_list && is_inside_a && tmp_chapter_link.is_some() {
// Chapter name: whatever follows the first ':' or '-' in the link text, if any.
let chapter_name = match text.splitn(2, &[':', '-'][..]).nth(1) {
Some(text) => Some(text.trim().to_string()),
None => None
};
// Chapter number: the part of the href after its last '_'.
match tmp_chapter_link.unwrap().rsplitn(2, '_').nth(0) {
Some(chapter_number) => {
chapters.push(structs::Chapter {
chapter_number: chapter_number.to_string(),
chapter_name: chapter_name,
domain: "mangakakalot.com".to_string()
});
},
None => ()
};
tmp_chapter_link = None;
} else if is_inside_title {
// Because `is_inside_title` is never cleared, once a `<title>` has been opened
// this branch catches every otherwise-unclaimed text node and the redirect
// branch below can no longer be reached.
is_title_real = !text.is_empty();
} else if text.trim().starts_with("REDIRECT : ") {
// The redirect target is everything after the first ':' in the text, trimmed.
return structs::MangaOption::Redirect(structs::Redirect { url: text.splitn(2, ':').nth(1).unwrap().trim().to_string() });
}
},
Ok(Event::Empty(ref e)) => {
// A `<br/>` inside the summary container becomes a newline, once the summary has started.
if is_inside_noidungm && e.name() == b"br" && summary.is_some() {
summary.as_mut().unwrap().push('\n');
}
},
Ok(Event::End(e)) => {
match e.name() {
b"ul" => is_inside_manga_info = false,
b"li" => {
is_inside_authors = false;
is_inside_genres = false;
},
b"div" => {
// A closing `</div>` is attributed to the innermost tracked container:
// summary first, then a row, and only then the chapter list itself,
// whose end terminates parsing.
if is_inside_noidungm {
is_inside_noidungm = false;
} else if is_inside_row {
is_inside_row = false;
} else if is_inside_chapter_list {
break;
}
},
b"h1" => is_inside_h1 = false,
b"h2" => is_inside_h2 = false,
b"a" => is_inside_a = false,
b"title" => {
// No non-empty title text recorded: report the manga as missing.
if !is_title_real {
return structs::MangaOption::DoesNotExist;
}
},
_ => ()
};
},
Err(err) => panic!("Error at position {}: {}", reader.buffer_position(), err),
Ok(Event::Eof) => break,
_ => ()
};
buf.clear();
}
// Chapters were collected in document order; the result stores them reversed.
chapters.reverse();
structs::MangaOption::Manga(structs::Manga {
id: manga_id.to_string(),
// Panics when no `<h1>` text was seen inside the info list.
name: name.unwrap(),
authors: authors,
status: status,
last_updated: last_updated,
genres: genres,
summary: summary,
chapters: chapters
})
}
pub async fn get_pages(client: reqwest::Client, chapter: &structs::Chapter, manga_id: &str) -> Result<Vec<String>, reqwest::Error> {
let text = client.get(&format!("https://{}/chapter/{}/chapter_{}", &chapter.domain, &manga_id, &chapter.chapter_number))
.send()
.await?
.text()
.await?;
Ok(match chapter.domain.as_str() {
"mangakakalot.com" => parse_mangakakalot_pages(&text),
"manganelo.com" => parse_manganelo_pages(&text),
_ => panic!("Unknown domain: {}", &chapter.domain)
})
}
/// Collects page image URLs from the HTML of a mangakakalot.com chapter page.
///
/// Every self-closing `<img>` element that has both a decodable `src` and a
/// decodable `alt` attribute contributes its `src`, in document order.
///
/// # Panics
/// Panics if the XML reader reports a parse error.
fn parse_mangakakalot_pages(text: &str) -> Vec<String> {
    // Upper-case only the first line of the document before parsing.
    // NOTE(review): presumably so a lower-case `<!doctype ...>` is accepted by the XML reader — confirm.
    let text = match text.find('\n') {
        Some(newline) => format!("{}\n{}", text[..newline].to_uppercase(), &text[newline + 1..]),
        None => text.to_uppercase(),
    };
    let mut reader = Reader::from_str(&text);
    // Mismatched closing tag names are not treated as errors.
    reader.check_end_names(false);
    let mut buf = Vec::new();
    let mut pages = Vec::new();
    // NOTE(review): these two flags track the `#vungdoc` container and a div
    // nested inside it, but they never gate the `<img>` collection below
    // (the manganelo parser does gate on them) — confirm whether that is intended.
    let mut in_container = false;
    let mut in_nested_div = false;
    loop {
        match reader.read_event(&mut buf) {
            Ok(Event::Start(ref tag)) if tag.name() == b"div" => {
                if in_container {
                    in_nested_div = true;
                } else {
                    in_container = tag.attributes().any(|attr| match attr {
                        Ok(attr) => {
                            attr.key == b"id"
                                && attr
                                    .unescape_and_decode_value(&reader)
                                    .map(|value| value == "vungdoc")
                                    .unwrap_or(false)
                        }
                        Err(_) => false,
                    });
                }
            }
            Ok(Event::Empty(ref tag)) if tag.name() == b"img" => {
                let mut src = None;
                let mut has_alt = false;
                for attr in tag.attributes() {
                    let attr = match attr {
                        Ok(attr) => attr,
                        Err(_) => continue,
                    };
                    if attr.key == b"src" {
                        if let Ok(value) = attr.unescape_and_decode_value(&reader) {
                            src = Some(value);
                        }
                    } else if attr.key == b"alt" {
                        has_alt |= attr.unescape_and_decode_value(&reader).is_ok();
                    }
                }
                // Images without an `alt` attribute are not treated as pages.
                if has_alt {
                    if let Some(url) = src {
                        pages.push(url);
                    }
                }
            }
            Ok(Event::End(ref tag)) if tag.name() == b"div" => {
                // A closing div ends the nested div first, then the container.
                if in_nested_div {
                    in_nested_div = false;
                } else {
                    in_container = false;
                }
            }
            Err(err) => panic!("Error at position {}: {}", reader.buffer_position(), err),
            Ok(Event::Eof) => break,
            _ => (),
        }
        buf.clear();
    }
    pages
}
/// Collects page image URLs from the HTML of a manganelo.com chapter page.
///
/// Only self-closing `<img>` elements that sit directly inside the div whose
/// `class` is exactly `container-chapter-reader` (not inside a further div
/// nested in it) are considered. Each one that has both a decodable `src` and
/// a decodable `alt` attribute contributes its `src`, in document order.
///
/// # Panics
/// Panics if the XML reader reports a parse error.
fn parse_manganelo_pages(text: &str) -> Vec<String> {
    // Upper-case only the first line of the document before parsing.
    // NOTE(review): presumably so a lower-case `<!doctype ...>` is accepted by the XML reader — confirm.
    let text = match text.find('\n') {
        Some(newline) => format!("{}\n{}", text[..newline].to_uppercase(), &text[newline + 1..]),
        None => text.to_uppercase(),
    };
    let mut reader = Reader::from_str(&text);
    // Mismatched closing tag names are not treated as errors.
    reader.check_end_names(false);
    let mut buf = Vec::new();
    let mut pages = Vec::new();
    let mut in_container = false;
    // Set while a div nested inside the container is open; images there are skipped.
    let mut in_nested_div = false;
    loop {
        match reader.read_event(&mut buf) {
            Ok(Event::Start(ref tag)) if tag.name() == b"div" => {
                if in_container {
                    in_nested_div = true;
                } else {
                    in_container = tag.attributes().any(|attr| match attr {
                        Ok(attr) => {
                            attr.key == b"class"
                                && attr
                                    .unescape_and_decode_value(&reader)
                                    .map(|value| value == "container-chapter-reader")
                                    .unwrap_or(false)
                        }
                        Err(_) => false,
                    });
                }
            }
            Ok(Event::Empty(ref tag))
                if in_container && !in_nested_div && tag.name() == b"img" =>
            {
                let mut src = None;
                let mut has_alt = false;
                for attr in tag.attributes() {
                    let attr = match attr {
                        Ok(attr) => attr,
                        Err(_) => continue,
                    };
                    if attr.key == b"src" {
                        if let Ok(value) = attr.unescape_and_decode_value(&reader) {
                            src = Some(value);
                        }
                    } else if attr.key == b"alt" {
                        has_alt |= attr.unescape_and_decode_value(&reader).is_ok();
                    }
                }
                // Images without an `alt` attribute are not treated as pages.
                if has_alt {
                    if let Some(url) = src {
                        pages.push(url);
                    }
                }
            }
            Ok(Event::End(ref tag)) if tag.name() == b"div" => {
                // A closing div ends the nested div first, then the container.
                if in_nested_div {
                    in_nested_div = false;
                } else {
                    in_container = false;
                }
            }
            Err(err) => panic!("Error at position {}: {}", reader.buffer_position(), err),
            Ok(Event::Eof) => break,
            _ => (),
        }
        buf.clear();
    }
    pages
}
/// Downloads `url` and writes the body to `file_name` if the response is an image.
///
/// The request is sent with `referer` as its `Referer` header. The body is
/// saved only when the response carries a `Content-Type` header whose value
/// starts with `image/`. The immediate parent directory of `file_name` is
/// created first if it does not exist yet.
///
/// Returns `Ok(true)` when the file was written, and `Ok(false)` when the
/// response has no `Content-Type` header or it is not an image type.
///
/// # Errors
/// Returns an error if the request fails, the body cannot be read, the parent
/// directory cannot be created, or the file cannot be created or written.
pub async fn download_file(client: reqwest::Client, url: &str, file_name: &PathBuf, referer: &str) -> Result<bool, structs::Error> {
    let resp = client.get(url)
        .header("Referer", referer)
        .send()
        .await?;
    let is_image = resp.headers()
        .get("Content-Type")
        .map_or(false, |value| value.to_str().unwrap_or_default().starts_with("image/"));
    if !is_image {
        return Ok(false);
    }
    let bytes = resp.bytes().await?;
    // `parent()` is `None` for a root or empty path and `Some("")` for a bare
    // file name; in neither case is there a directory to create, so leave it
    // to `File::create` to succeed or report the problem.
    if let Some(parent) = file_name.parent() {
        if !parent.as_os_str().is_empty() && !parent.is_dir() {
            if let Err(err) = create_dir(parent) {
                // Another download may have created the directory between the
                // check above and this call; that is not a failure.
                if err.kind() != std::io::ErrorKind::AlreadyExists {
                    return Err(err.into());
                }
            }
        }
    }
    let mut file = File::create(file_name)?;
    file.write_all(&bytes)?;
    Ok(true)
}