Markdown generation is complete.

Time for the hard shit.
This commit is contained in:
NA 2024-01-25 16:13:09 +00:00
parent 80f3d5b423
commit d46d8b209b
6 changed files with 382 additions and 9 deletions

View file

@ -8,6 +8,8 @@
"rust-analyzer.showUnlinkedFileNotification": false, "rust-analyzer.showUnlinkedFileNotification": false,
"cSpell.ignoreWords": [ "cSpell.ignoreWords": [
"autotools", "autotools",
"chrono",
"indicatif",
"reqwest", "reqwest",
"royalroad", "royalroad",
"ureq" "ureq"

276
Cargo.lock generated
View file

@ -39,6 +39,21 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]] [[package]]
name = "anstream" name = "anstream"
version = "0.6.11" version = "0.6.11"
@ -153,12 +168,32 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "cesu8"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c"
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.0" version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb"
dependencies = [
"android-tzdata",
"iana-time-zone",
"js-sys",
"num-traits",
"wasm-bindgen",
"windows-targets 0.52.0",
]
[[package]] [[package]]
name = "clap" name = "clap"
version = "4.4.18" version = "4.4.18"
@ -205,6 +240,29 @@ version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"
[[package]]
name = "combine"
version = "4.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
dependencies = [
"bytes",
"memchr",
]
[[package]]
name = "console"
version = "0.15.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb"
dependencies = [
"encode_unicode",
"lazy_static",
"libc",
"unicode-width",
"windows-sys 0.52.0",
]
[[package]] [[package]]
name = "core-foundation" name = "core-foundation"
version = "0.9.4" version = "0.9.4"
@ -276,6 +334,12 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591"
[[package]]
name = "encode_unicode"
version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f"
[[package]] [[package]]
name = "encoding_rs" name = "encoding_rs"
version = "0.8.33" version = "0.8.33"
@ -467,6 +531,20 @@ version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f"
[[package]]
name = "html2md"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "be92446e11d68f5d71367d571c229d09ced1f24ab6d08ea0bff329d5f6c0b2a3"
dependencies = [
"html5ever",
"jni",
"lazy_static",
"markup5ever_rcdom",
"percent-encoding",
"regex",
]
[[package]] [[package]]
name = "html5ever" name = "html5ever"
version = "0.26.0" version = "0.26.0"
@ -552,6 +630,29 @@ dependencies = [
"tokio-native-tls", "tokio-native-tls",
] ]
[[package]]
name = "iana-time-zone"
version = "0.1.59"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]] [[package]]
name = "idna" name = "idna"
version = "0.5.0" version = "0.5.0"
@ -572,6 +673,28 @@ dependencies = [
"hashbrown", "hashbrown",
] ]
[[package]]
name = "indicatif"
version = "0.17.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25"
dependencies = [
"console",
"instant",
"number_prefix",
"portable-atomic",
"unicode-width",
]
[[package]]
name = "instant"
version = "0.1.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c"
dependencies = [
"cfg-if",
]
[[package]] [[package]]
name = "ipnet" name = "ipnet"
version = "2.9.0" version = "2.9.0"
@ -584,6 +707,26 @@ version = "1.0.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c"
[[package]]
name = "jni"
version = "0.19.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c6df18c2e3db7e453d3c6ac5b3e9d5182664d28788126d39b91f2d1e22b017ec"
dependencies = [
"cesu8",
"combine",
"jni-sys",
"log",
"thiserror",
"walkdir",
]
[[package]]
name = "jni-sys"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130"
[[package]] [[package]]
name = "js-sys" name = "js-sys"
version = "0.3.67" version = "0.3.67"
@ -647,6 +790,18 @@ dependencies = [
"tendril", "tendril",
] ]
[[package]]
name = "markup5ever_rcdom"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2"
dependencies = [
"html5ever",
"markup5ever",
"tendril",
"xml5ever",
]
[[package]] [[package]]
name = "memchr" name = "memchr"
version = "2.7.1" version = "2.7.1"
@ -703,6 +858,15 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"
[[package]]
name = "num-traits"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
]
[[package]] [[package]]
name = "num_cpus" name = "num_cpus"
version = "1.16.0" version = "1.16.0"
@ -713,6 +877,12 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "number_prefix"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]] [[package]]
name = "object" name = "object"
version = "0.32.2" version = "0.32.2"
@ -795,6 +965,12 @@ dependencies = [
"windows-targets 0.48.5", "windows-targets 0.48.5",
] ]
[[package]]
name = "path-slash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e91099d4268b0e11973f036e885d652fb0b21fedcf69738c627f94db6a44f42"
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
version = "2.3.1" version = "2.3.1"
@ -899,6 +1075,12 @@ version = "0.3.29"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb"
[[package]]
name = "portable-atomic"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0"
[[package]] [[package]]
name = "ppv-lite86" name = "ppv-lite86"
version = "0.2.17" version = "0.2.17"
@ -1055,7 +1237,11 @@ name = "royal_road_archiver"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"bytes", "bytes",
"chrono",
"clap", "clap",
"html2md",
"indicatif",
"path-slash",
"regex", "regex",
"reqwest", "reqwest",
"scraper", "scraper",
@ -1110,6 +1296,15 @@ version = "1.0.16"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c"
[[package]]
name = "same-file"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
dependencies = [
"winapi-util",
]
[[package]] [[package]]
name = "schannel" name = "schannel"
version = "0.1.23" version = "0.1.23"
@ -1387,6 +1582,26 @@ dependencies = [
"utf-8", "utf-8",
] ]
[[package]]
name = "thiserror"
version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.48",
]
[[package]] [[package]]
name = "tinyvec" name = "tinyvec"
version = "1.6.0" version = "1.6.0"
@ -1541,6 +1756,16 @@ version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
[[package]]
name = "walkdir"
version = "2.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee"
dependencies = [
"same-file",
"winapi-util",
]
[[package]] [[package]]
name = "want" name = "want"
version = "0.3.1" version = "0.3.1"
@ -1632,6 +1857,46 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "winapi"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
dependencies = [
"winapi-i686-pc-windows-gnu",
"winapi-x86_64-pc-windows-gnu",
]
[[package]]
name = "winapi-i686-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
[[package]]
name = "winapi-util"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596"
dependencies = [
"winapi",
]
[[package]]
name = "winapi-x86_64-pc-windows-gnu"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
[[package]]
name = "windows-core"
version = "0.52.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9"
dependencies = [
"windows-targets 0.52.0",
]
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.48.0" version = "0.48.0"
@ -1774,6 +2039,17 @@ dependencies = [
"windows-sys 0.48.0", "windows-sys 0.48.0",
] ]
[[package]]
name = "xml5ever"
version = "0.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650"
dependencies = [
"log",
"mac",
"markup5ever",
]
[[package]] [[package]]
name = "zerocopy" name = "zerocopy"
version = "0.7.32" version = "0.7.32"

View file

@ -15,9 +15,13 @@ path = "src/binary.rs"
[dependencies] [dependencies]
bytes = "1.5.0" bytes = "1.5.0"
chrono = "0.4.33"
clap = { version = "4.4.18", features = ["derive"] } clap = { version = "4.4.18", features = ["derive"] }
html2md = "0.2.14"
indicatif = "0.17.7"
path-slash = "0.2.1"
regex = "1.10.3" regex = "1.10.3"
reqwest = { version = "0.11.23", features = ["rustls", "blocking"] } reqwest = { version = "0.11.23", features = ["rustls", "blocking"] }
scraper = "0.18.1" scraper = "0.18.1"
serde_json = "1.0.111" serde_json = "1.0.111"
url = "2.5.0" url = "2.5.0"

View file

@ -1,3 +1,4 @@
use indicatif::{ProgressBar, ProgressStyle};
use scraper::Html; use scraper::Html;
use url::Url; use url::Url;
@ -9,10 +10,10 @@ pub struct Book {
book_url: Url, book_url: Url,
/// The book's title. /// The book's title.
title: String, pub title: String,
/// The book's author. /// The book's author.
author: String, pub author: String,
/// A Url to the book's cover image. /// A Url to the book's cover image.
cover_image_url: Url, cover_image_url: Url,
@ -21,7 +22,7 @@ pub struct Book {
index_html: Html, index_html: Html,
/// A vector of the book's chapters. /// A vector of the book's chapters.
chapters: Vec<Chapter>, pub chapters: Vec<Chapter>,
} }
impl Book { impl Book {
@ -33,11 +34,25 @@ impl Book {
let mut chapters: Vec<Chapter> = Vec::with_capacity(chapter_names_and_urls.len()); let mut chapters: Vec<Chapter> = Vec::with_capacity(chapter_names_and_urls.len());
println!("\nDownloading and processing chapters:");
// Spawn a progress bar showing how many chapters have been downloaded & processed.
let progress_bar = ProgressBar::new(chapter_names_and_urls.len().try_into().unwrap());
progress_bar.set_style(
ProgressStyle::with_template("[{elapsed_precise}] [{wide_bar:.cyan/blue}] {percent}% ")
.unwrap()
.progress_chars("#>-"),
);
// Generate the chapters and add them to the book.
for i in 0..chapter_names_and_urls.len() { for i in 0..chapter_names_and_urls.len() {
let chapter = Chapter::new(&chapter_names_and_urls[i][0], &chapter_names_and_urls[i][1]); let chapter = Chapter::new(&chapter_names_and_urls[i][0], &chapter_names_and_urls[i][1]);
chapters.push(chapter); chapters.push(chapter);
progress_bar.inc(1);
} }
progress_bar.finish();
Book { Book {
book_url: book_url, book_url: book_url,
title: html::get_title_from_index(&index_html), title: html::get_title_from_index(&index_html),
@ -56,18 +71,18 @@ impl Book {
} }
/// A struct representing a chapter. /// A struct representing a chapter.
struct Chapter { pub struct Chapter {
/// The Url of the chapter. /// The Url of the chapter.
chapter_url: Url, chapter_url: Url,
/// The name of the chapter. /// The name of the chapter.
chapter_name: String, pub chapter_name: String,
/// The raw html data of the page. /// The raw html data of the page.
raw_chapter_html: Html, raw_chapter_html: Html,
/// The isolated chapter html. /// The isolated chapter html.
isolated_chapter_html: Html, pub isolated_chapter_html: Html,
} }
impl Chapter { impl Chapter {
@ -75,7 +90,7 @@ impl Chapter {
let chapter_url = http::string_to_url(&chapter_url); let chapter_url = http::string_to_url(&chapter_url);
let raw_chapter_html = html::string_to_html_document(&http::get_response(chapter_url.clone()).get_text()); let raw_chapter_html = html::string_to_html_document(&http::get_response(chapter_url.clone()).get_text());
Chapter { Chapter {
chapter_url: chapter_url, chapter_url: chapter_url,
chapter_name: chapter_name.to_string(), chapter_name: chapter_name.to_string(),
raw_chapter_html: raw_chapter_html.clone(), raw_chapter_html: raw_chapter_html.clone(),

View file

@ -138,4 +138,24 @@ pub fn isolate_chapter_content(raw_chapter_html: Html) -> Html {
} }
eprintln!("Error! Unable to isolate chapter content"); eprintln!("Error! Unable to isolate chapter content");
exit(1); exit(1);
}
/// Remove all img tags from the html fragment.
pub fn remove_image_tags(html_fragment: Html) -> String {
let mut image_tags: Vec<String> = Vec::new();
let selector = Selector::parse("img").unwrap();
for element in html_fragment.select(&selector) {
if !image_tags.contains(&element.html()) {
image_tags.push(element.html());
}
}
let mut html_fragment = html_fragment.html();
for image_tag in image_tags {
html_fragment = html_fragment.replace(&image_tag, "");
}
return html_fragment;
} }

View file

@ -1,5 +1,6 @@
use std::path::PathBuf; use std::{fs::{File, OpenOptions}, io::Write, path::PathBuf, process::exit};
use chrono::prelude::Local;
use clap::Args; use clap::Args;
use url::Url; use url::Url;
@ -76,4 +77,59 @@ pub fn generate_html(html_args: HtmlArgs, book_url: Url, output_directory: PathB
/// Make sure the Url is valid and the output directory is writable BEFORE passing them to this. /// Make sure the Url is valid and the output directory is writable BEFORE passing them to this.
pub fn generate_markdown(markdown_args: MarkdownArgs, book_url: Url, output_directory: PathBuf) { pub fn generate_markdown(markdown_args: MarkdownArgs, book_url: Url, output_directory: PathBuf) {
let book = book::Book::new(book_url); let book = book::Book::new(book_url);
let output_path = convert_path_to_windows(output_directory.join(format!("{0}.md", book.title)));
// Create the md file. This will crash if it already exists or can not be created.
let mut output_file = match OpenOptions::new().write(true).create_new(true).open(&output_path) {
Ok(output_file) => output_file,
Err(error) => {
eprintln!("Error! Unable to create: {0}\n{error}", output_path.to_string_lossy());
exit(1);
}
};
// Append the book title & author.
let buf = format!("{}\n\nby: {}", &book.title, &book.author);
output_file.write_all(buf.as_bytes()).unwrap();
let buf = format!(
"\nArchived on: {}\n\n",
Local::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, false)
);
output_file.write_all(buf.as_bytes()).unwrap();
for chapter in book.chapters {
let mut buf;
if !markdown_args.no_chapter_titles {
buf = format!("----\n{}", chapter.chapter_name);
output_file.write_all(buf.as_bytes()).unwrap();
}
if markdown_args.no_image_tags {
// Remove image tags or not depending on args.
buf = format!("\n\n{}\n\n", html2md::parse_html(&html::remove_image_tags(chapter.isolated_chapter_html)));
} else {
buf = format!("\n\n{}\n\n", html2md::parse_html(&chapter.isolated_chapter_html.html()));
}
output_file.write_all(buf.as_bytes()).unwrap();
}
}
/// Converts a given path to windows style if needed.
///
/// On Windows every `/` in the path is rewritten to `\`; on every other target the path is
/// returned untouched.
fn convert_path_to_windows(path: PathBuf) -> PathBuf {
    // If target os is windows.
    #[cfg(target_os = "windows")]
    {
        use path_slash::PathBufExt as _;

        // `from_slash` only accepts `AsRef<str>` input, which `PathBuf` does not implement,
        // so go through the `OsStr`-based variant. Non-unicode parts are converted lossily.
        return PathBuf::from_slash_lossy(path.as_os_str());
    }

    // If target os is not windows.
    #[cfg(not(target_os = "windows"))]
    {
        return path;
    }
}