From d46d8b209b1cb1a0dad286d2984cd30e0545dd64 Mon Sep 17 00:00:00 2001 From: NA Date: Thu, 25 Jan 2024 16:13:09 +0000 Subject: [PATCH] Markdown generation is complete. Time for the hard shit. --- .vscode/settings.json | 2 + Cargo.lock | 276 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 6 +- src/book.rs | 29 +++-- src/html.rs | 20 +++ src/library.rs | 58 ++++++++- 6 files changed, 382 insertions(+), 9 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index aaad453..d163322 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -8,6 +8,8 @@ "rust-analyzer.showUnlinkedFileNotification": false, "cSpell.ignoreWords": [ "autotools", + "chrono", + "indicatif", "reqwest", "royalroad", "ureq" diff --git a/Cargo.lock b/Cargo.lock index ca0b42a..fa1d271 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -39,6 +39,21 @@ dependencies = [ "memchr", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + [[package]] name = "anstream" version = "0.6.11" @@ -153,12 +168,32 @@ dependencies = [ "libc", ] +[[package]] +name = "cesu8" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d43a04d8753f35258c91f8ec639f792891f748a1edbd759cf1dcea3382ad83c" + [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f13690e35a5e4ace198e7beea2895d29f3a9cc55015fcebe6336bd2010af9eb" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-targets 0.52.0", +] + [[package]] name = "clap" version = "4.4.18" @@ -205,6 +240,29 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "combine" +version = "4.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" +dependencies = [ + "bytes", + "memchr", +] + +[[package]] +name = "console" +version = "0.15.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e1f83fc076bd6dd27517eacdf25fef6c4dfe5f1d7448bafaaf3a26f13b5e4eb" +dependencies = [ + "encode_unicode", + "lazy_static", + "libc", + "unicode-width", + "windows-sys 0.52.0", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -276,6 +334,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "encoding_rs" version = "0.8.33" @@ -467,6 +531,20 @@ version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d3d0e0f38255e7fa3cf31335b3a56f05febd18025f4db5ef7a0cfb4f8da651f" +[[package]] +name = "html2md" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be92446e11d68f5d71367d571c229d09ced1f24ab6d08ea0bff329d5f6c0b2a3" +dependencies = [ + "html5ever", + "jni", + "lazy_static", + "markup5ever_rcdom", + "percent-encoding", + "regex", +] + [[package]] name = "html5ever" version = "0.26.0" @@ -552,6 +630,29 @@ dependencies = [ "tokio-native-tls", ] +[[package]] +name = "iana-time-zone" +version = "0.1.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6a67363e2aa4443928ce15e57ebae94fd8949958fd1223c4cfc0cd473ad7539" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "idna" version = "0.5.0" @@ -572,6 +673,28 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "indicatif" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb28741c9db9a713d93deb3bb9515c20788cef5815265bee4980e87bde7e0f25" +dependencies = [ + "console", + "instant", + "number_prefix", + "portable-atomic", + "unicode-width", +] + +[[package]] +name = "instant" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a5bbe824c507c5da5956355e86a746d82e0e1464f65d862cc5e71da70e94b2c" +dependencies = [ + "cfg-if", +] + [[package]] name = "ipnet" version = "2.9.0" @@ -584,6 +707,26 @@ version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" +[[package]] +name = "jni" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6df18c2e3db7e453d3c6ac5b3e9d5182664d28788126d39b91f2d1e22b017ec" +dependencies = [ + "cesu8", + "combine", + "jni-sys", + "log", + "thiserror", + "walkdir", +] + +[[package]] +name = "jni-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaf4bc02d17cbdd7ff4c7438cafcdf7fb9a4613313ad11b4f8fefe7d3fa0130" + [[package]] name = "js-sys" version = "0.3.67" @@ -647,6 +790,18 @@ dependencies = [ "tendril", ] +[[package]] +name = "markup5ever_rcdom" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9521dd6750f8e80ee6c53d65e2e4656d7de37064f3a7a5d2d11d05df93839c2" +dependencies = [ + "html5ever", + "markup5ever", + "tendril", + "xml5ever", +] + [[package]] name = "memchr" version = "2.7.1" @@ -703,6 +858,15 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" +[[package]] +name = "num-traits" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c" +dependencies = [ + "autocfg", +] + [[package]] name = "num_cpus" version = "1.16.0" @@ -713,6 +877,12 @@ dependencies = [ "libc", ] +[[package]] +name = "number_prefix" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" + [[package]] name = "object" version = "0.32.2" @@ -795,6 +965,12 @@ dependencies = [ "windows-targets 0.48.5", ] +[[package]] +name = "path-slash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e91099d4268b0e11973f036e885d652fb0b21fedcf69738c627f94db6a44f42" + [[package]] name = "percent-encoding" version = "2.3.1" @@ -899,6 +1075,12 @@ version = "0.3.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2900ede94e305130c13ddd391e0ab7cbaeb783945ae07a279c268cb05109c6cb" +[[package]] +name = "portable-atomic" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7170ef9988bc169ba16dd36a7fa041e5c4cbeb6a35b76d4c03daded371eae7c0" + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -1055,7 +1237,11 @@ name = "royal_road_archiver" version = "0.1.0" dependencies = [ "bytes", + "chrono", "clap", + "html2md", + "indicatif", + "path-slash", "regex", "reqwest", "scraper", @@ -1110,6 +1296,15 @@ version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + [[package]] name = "schannel" version = "0.1.23" @@ -1387,6 +1582,26 @@ dependencies = [ "utf-8", ] +[[package]] +name = "thiserror" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d54378c645627613241d077a3a79db965db602882668f9136ac42af9ecb730ad" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa0faa943b50f3db30a20aa7e265dbc66076993efed8463e8de414e5d06d3471" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.48", +] + [[package]] name = "tinyvec" version = "1.6.0" @@ -1541,6 +1756,16 @@ version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +[[package]] +name = "walkdir" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71d857dc86794ca4c280d616f7da00d2dbfd8cd788846559a6813e6aa4b54ee" +dependencies = [ + "same-file", + "winapi-util", +] + [[package]] name = "want" version = "0.3.1" @@ -1632,6 +1857,46 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e6f9198ba0d26b4c9f07dbe6f9ed633e1f3d5b8b414090084349e46a52596" +dependencies = [ + "winapi", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.0", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -1774,6 +2039,17 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "xml5ever" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4034e1d05af98b51ad7214527730626f019682d797ba38b51689212118d8e650" +dependencies = [ + "log", + "mac", + "markup5ever", +] + [[package]] name = "zerocopy" version = "0.7.32" diff --git a/Cargo.toml b/Cargo.toml index 9425c16..43197aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,9 +15,13 @@ path = "src/binary.rs" [dependencies] bytes = "1.5.0" +chrono = "0.4.33" clap = { version = "4.4.18", features = ["derive"] } +html2md = "0.2.14" +indicatif = "0.17.7" +path-slash = "0.2.1" regex = "1.10.3" reqwest = { version = "0.11.23", features = ["rustls", "blocking"] } scraper = "0.18.1" serde_json = "1.0.111" -url = "2.5.0" \ No newline at end of file +url = "2.5.0" diff --git a/src/book.rs b/src/book.rs index 820c4d2..353823e 100644 --- a/src/book.rs +++ b/src/book.rs @@ -1,3 +1,4 @@ +use indicatif::{ProgressBar, ProgressStyle}; use scraper::Html; use url::Url; @@ -9,10 +10,10 @@ pub struct Book { book_url: Url, /// The book's title. - title: String, + pub title: String, /// The book's author. - author: String, + pub author: String, /// A Url to the book's cover image. cover_image_url: Url, @@ -21,7 +22,7 @@ pub struct Book { index_html: Html, /// A vector of the book's chapters. - chapters: Vec, + pub chapters: Vec, } impl Book { @@ -33,11 +34,25 @@ impl Book { let mut chapters: Vec = Vec::with_capacity(chapter_names_and_urls.len()); + println!("\nDownloading and processing chapters:"); + // Spawn a progress bar showing how many chapters have been downloaded & processed. + let progress_bar = ProgressBar::new(chapter_names_and_urls.len().try_into().unwrap()); + progress_bar.set_style( + ProgressStyle::with_template("[{elapsed_precise}] [{wide_bar:.cyan/blue}] {percent}% ") + .unwrap() + .progress_chars("#>-"), + ); + + // Generate the chapters and add em to the book. for i in 0..chapter_names_and_urls.len() { let chapter = Chapter::new(&chapter_names_and_urls[i][0], &chapter_names_and_urls[i][1]); chapters.push(chapter); + + progress_bar.inc(1); } + progress_bar.finish(); + Book { book_url: book_url, title: html::get_title_from_index(&index_html), @@ -56,18 +71,18 @@ impl Book { } /// A struct representing a chapter. -struct Chapter { +pub struct Chapter { /// The Url of the chapter. chapter_url: Url, /// The name of the chapter. - chapter_name: String, + pub chapter_name: String, /// The raw html data of the page. raw_chapter_html: Html, /// The isolated chapter html. - isolated_chapter_html: Html, + pub isolated_chapter_html: Html, } impl Chapter { @@ -75,7 +90,7 @@ impl Chapter { let chapter_url = http::string_to_url(&chapter_url); let raw_chapter_html = html::string_to_html_document(&http::get_response(chapter_url.clone()).get_text()); - Chapter { + Chapter { chapter_url: chapter_url, chapter_name: chapter_name.to_string(), raw_chapter_html: raw_chapter_html.clone(), diff --git a/src/html.rs b/src/html.rs index e75ca1f..049b6cd 100644 --- a/src/html.rs +++ b/src/html.rs @@ -138,4 +138,24 @@ pub fn isolate_chapter_content(raw_chapter_html: Html) -> Html { } eprintln!("Error! Unable to isolate chapter content"); exit(1); +} + +/// Remove all img tags from the html fragment. +pub fn remove_image_tags(html_fragment: Html) -> String { + let mut image_tags: Vec = Vec::new(); + + let selector = Selector::parse("img").unwrap(); + for element in html_fragment.select(&selector) { + if !image_tags.contains(&element.html()) { + image_tags.push(element.html()); + } + } + + let mut html_fragment = html_fragment.html(); + + for image_tag in image_tags { + html_fragment = html_fragment.replace(&image_tag, ""); + } + + return html_fragment; } \ No newline at end of file diff --git a/src/library.rs b/src/library.rs index 7d33759..3a71dd7 100644 --- a/src/library.rs +++ b/src/library.rs @@ -1,5 +1,6 @@ -use std::path::PathBuf; +use std::{fs::{File, OpenOptions}, io::Write, path::PathBuf, process::exit}; +use chrono::prelude::Local; use clap::Args; use url::Url; @@ -76,4 +77,59 @@ pub fn generate_html(html_args: HtmlArgs, book_url: Url, output_directory: PathB /// Make sure the Url is valid and the output directory is writable BEFORE passing them to this. pub fn generate_markdown(markdown_args: MarkdownArgs, book_url: Url, output_directory: PathBuf) { let book = book::Book::new(book_url); + + let output_path = convert_path_to_windows(output_directory.join(format!("{0}.md", book.title))); + + // Create the md file. This will crash if it already exists or can not be created. + let mut output_file = match OpenOptions::new().write(true).create_new(true).open(&output_path) { + Ok(output_file) => output_file, + Err(error) => { + eprintln!("Error! Unable to create: {0}\n{error}", output_path.to_string_lossy()); + exit(1); + } + }; + + // Append the book title & author. + let buf = format!("{}\n\nby: {}", &book.title, &book.author); + output_file.write_all(buf.as_bytes()).unwrap(); + + let buf = format!( + "\nArchived on: {}\n\n", + Local::now().to_rfc3339_opts(chrono::SecondsFormat::Secs, false) + ); + output_file.write_all(buf.as_bytes()).unwrap(); + + for chapter in book.chapters { + let mut buf; + + if !markdown_args.no_chapter_titles { + buf = format!("----\n{}", chapter.chapter_name); + output_file.write_all(buf.as_bytes()).unwrap(); + } + + if markdown_args.no_image_tags { + // Remove image tags or not depending on args. + buf = format!("\n\n{}\n\n", html2md::parse_html(&html::remove_image_tags(chapter.isolated_chapter_html))); + + } else { + buf = format!("\n\n{}\n\n", html2md::parse_html(&chapter.isolated_chapter_html.html())); + } + + output_file.write_all(buf.as_bytes()).unwrap(); + } +} + +/// Converts a given path to windows style if needed. +fn convert_path_to_windows(path: PathBuf) -> PathBuf { + // If target os is windows. + #[cfg(target_os = "windows")] { + use path_slash::PathBufExt as _; + + return PathBuf::from_slash(path); + } + + // If target os is not windows. + #[cfg(not(target_os = "windows"))] { + return path; + } } \ No newline at end of file