Scrape results page instead of using the API

For some reason, some search queries (for example "seven young goats") fail
with a "does not exist" error when sent to the API. There actually was a
doujin with "seven young goats" in the title (254818), but it seems to have
been deleted. Perhaps the web frontend ignores deleted doujins, whereas the
API does not and fails?
This commit is contained in:
blankie 2023-10-28 12:51:18 +11:00
parent 841d740475
commit 13c365c3a7
Signed by: blankie
GPG Key ID: CC15FC822C7F61F5
5 changed files with 97 additions and 17 deletions

10
Cargo.lock generated
View File

@ -420,6 +420,7 @@ dependencies = [
name = "nhentairs"
version = "0.5.6"
dependencies = [
"quick-xml",
"reqwest",
"serde",
"serde_json",
@ -528,6 +529,15 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "quick-xml"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
dependencies = [
"memchr",
]
[[package]]
name = "quote"
version = "1.0.33"

View File

@ -14,3 +14,4 @@ serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
reqwest = "0.11"
tokio = { version = "1.33", features = ["rt-multi-thread", "sync", "time"] }
quick-xml = "0.31"

View File

@ -1,5 +1,7 @@
use crate::structs;
use quick_xml::events::Event;
use quick_xml::Reader;
use std::env;
use std::process::exit;
@ -38,12 +40,63 @@ pub async fn get_sauce_info(
/// Searches nhentai for `search_query` by scraping the HTML results page.
///
/// Sends a GET request to `https://nhentai.net/search/` with the query passed
/// as the `q` parameter, then walks the response body with a `quick_xml`
/// reader. Every `<a>` element whose `class` attribute equals `cover` yields
/// one `structs::MiniGalleryInfo`: the id is parsed from the third
/// `/`-separated segment of its `href`, and the title is the concatenation of
/// all text events seen before the closing `</a>`.
///
/// # Errors
///
/// * Request and body-read failures are propagated with `?`.
/// * `structs::Error::QuickXML` when the reader reports an error.
/// * `structs::Error::ParseInt` when the id segment is not a valid integer.
/// * `structs::Error::Unknown` when a cover anchor has no `href`, or its
///   `href` has fewer than three `/`-separated segments.
/// * Attribute lookup, decoding and unescaping failures are propagated with `?`.
// NOTE(review): this span is rendered diff output without +/- markers. The
// `structs::SearchInfo` return type, `let uri = ...`, `.get(uri)`,
// `.query(&[("query", ...)])` and the `serde_json::from_str` line appear to be
// the removed API-based version shown next to their replacements; confirm
// against the repository before treating this text as compilable source.
pub async fn get_search_info(
client: reqwest::Client,
search_query: &str,
) -> Result<structs::SearchInfo, structs::Error> {
let uri = "https://nhentai.net/api/galleries/search";
) -> Result<Vec<structs::MiniGalleryInfo>, structs::Error> {
let resp = client
.get(uri)
.query(&[("query", search_query)])
.get("https://nhentai.net/search/")
.query(&[("q", search_query)])
.send()
.await?;
Ok(serde_json::from_str(&resp.text().await?)?)
let text = resp.text().await?;
let mut results = Vec::new();
// Accumulator for the gallery currently being read; it is pushed to `results`
// and replaced with a fresh value at the closing `</a>`.
let mut gallery_info = structs::MiniGalleryInfo {
id: 0,
title: "".to_string(),
};
// True only between a matching `<a class="cover">` start tag and its end tag.
let mut reading_gallery = false;
let mut reader = Reader::from_str(&text);
// Trim whitespace around text events and do not require end-tag names to
// match their start tags; presumably needed because the page is HTML rather
// than strict XML.
reader.trim_text(true).check_end_names(false);
loop {
match reader.read_event() {
Ok(Event::Start(e)) if e.local_name().as_ref() == "a".as_bytes() => {
// Anchors without a `class` attribute are not search results; skip them.
let class_attribute = match e.try_get_attribute("class")? {
Some(a) => a,
None => continue,
};
// Only an exact `cover` class value marks a result entry.
if class_attribute.decode_and_unescape_value(&reader)? != "cover" {
continue;
}
let href_attribute = match e.try_get_attribute("href")? {
Some(a) => a,
None => return Err(structs::Error::Unknown("failed to find href in <a>")),
};
let href = href_attribute.decode_and_unescape_value(&reader)?;
// The id is the third `/`-separated segment (index 2); presumably hrefs look
// like `/g/<id>/`, where index 0 is the empty string before the leading slash.
let id_str = match href.split('/').nth(2) {
Some(i) => i,
None => return Err(structs::Error::Unknown("failed to find id in <a href>")),
};
reading_gallery = true;
gallery_info.id = id_str.parse()?;
}
// Every text event inside the cover anchor is appended to the title as-is,
// with no separator between consecutive events.
Ok(Event::Text(e)) if reading_gallery => {
gallery_info.title.push_str(&e.unescape()?);
}
Ok(Event::End(e)) if reading_gallery && e.local_name().as_ref() == "a".as_bytes() => {
results.push(gallery_info);
reading_gallery = false;
gallery_info = structs::MiniGalleryInfo {
id: 0,
title: "".to_string(),
};
}
Ok(Event::Eof) => break,
// The error is wrapped explicitly because a plain `return Err(err)` performs
// no `From` conversion; only the `?` operator does that.
Err(err) => return Err(structs::Error::QuickXML(err)),
_ => {}
};
}
Ok(results)
}

View File

@ -17,15 +17,11 @@ pub async fn run(args: env::Args) {
let search_info = api::get_search_info(api::get_client(), &query)
.await
.unwrap();
if search_info.num_pages < 1 {
if search_info.len() < 1 {
eprintln!("No results found");
exit(1);
}
for result in search_info.result {
let mut title = &result.title.english.unwrap_or_default();
if title == "" {
title = &result.title.japanese.as_ref().unwrap();
}
println!("{}: {}", result.id, &title);
for result in search_info {
println!("{}: {}", result.id, result.title);
}
}

View File

@ -3,6 +3,7 @@ use serde::{Deserialize, Deserializer};
use std::collections::BTreeMap;
use std::fmt;
use std::marker::PhantomData;
use std::num::ParseIntError;
#[derive(Deserialize, Debug)]
pub struct GalleryTitleInfo {
@ -60,11 +61,10 @@ pub enum GalleryInfo {
Error(GalleryInfoError),
}
#[derive(Deserialize, Debug)]
pub struct SearchInfo {
pub result: Vec<GalleryInfoSuccess>,
pub num_pages: i32,
pub per_page: i32,
/// Minimal gallery record holding only a numeric id and a title string.
#[derive(Debug)]
pub struct MiniGalleryInfo {
/// Numeric gallery id.
pub id: i32,
/// Gallery title text.
pub title: String,
}
impl fmt::Display for GalleryInfoSuccess {
@ -112,6 +112,9 @@ impl fmt::Display for GalleryInfoSuccess {
/// Error type unifying the failure sources wrapped by the variants below.
pub enum Error {
/// An error reported by `reqwest`.
Reqwest(reqwest::Error),
/// An error reported by `serde_json`.
SerdeJSON(serde_json::Error),
/// An error reported by `quick_xml`.
QuickXML(quick_xml::Error),
/// A failure to parse a string as an integer.
ParseInt(ParseIntError),
/// A failure described only by a static message, which is displayed verbatim.
Unknown(&'static str),
}
impl From<reqwest::Error> for Error {
@ -128,11 +131,28 @@ impl From<serde_json::Error> for Error {
}
}
impl From<quick_xml::Error> for Error {
#[inline]
fn from(error: quick_xml::Error) -> Error {
Error::QuickXML(error)
}
}
impl From<ParseIntError> for Error {
#[inline]
fn from(error: ParseIntError) -> Error {
Error::ParseInt(error)
}
}
impl fmt::Display for Error {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
let str = match self {
Error::Reqwest(err) => format!("reqwest error: {}", err),
Error::SerdeJSON(err) => format!("serde_json error: {}", err),
Error::QuickXML(err) => format!("quick_xml error: {}", err),
Error::ParseInt(err) => format!("parse int error: {}", err),
Error::Unknown(err) => err.to_string(),
};
formatter.write_str(&str)
}